def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs
    based on the estimated insert sizes
    uses the current working BAM file if no BAM file is supplied
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    # reads on the negative strand have a negative template_length value
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd +\
                    f'-F "(template_length > {arange.min} and template_length < {arange.max}) or ' +\
                    f'(template_length < {-arange.min} and template_length > {-arange.max})" ' +\
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
        bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data
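# A minimal sketch of the insert-size logic the sambamba -F filter above
# encodes, assuming ATACRanges values carry (label, min, max) fields like the
# attributes the loop in split_ATAC reads. Reads on the negative strand report
# a negative template_length, so each window is checked on both signs.
# Illustrative only, not part of the pipeline.
from collections import namedtuple

ATACRange = namedtuple('ATACRange', ['label', 'min', 'max'])

def in_fraction(template_length, arange):
    """Return True if template_length falls inside arange on either strand."""
    return (arange.min < template_length < arange.max or
            -arange.max < template_length < -arange.min)

assert in_fraction(120, ATACRange('NF', 0, 140))       # positive strand
assert in_fraction(-120, ATACRange('NF', 0, 140))      # negative strand
assert not in_fraction(200, ATACRange('NF', 0, 140))   # outside the window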
def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed
    mitochondrial reads should be removed
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(
        metrics_dir, f"{dd.get_sample_name(data)}-atac-metrics.csv")
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (
            f"{bedtools} bamtobed -bedpe -i {work_bam} | "
            "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
            "sort | "
            "uniq -c | "
            "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
            "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
            f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data
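# The awk pipeline above reduces each read pair to its fragment coordinates
# and then tallies, over the uniq -c output: mt (total read pairs), m0
# (distinct fragment positions), m1 (positions seen once), m2 (positions seen
# exactly twice). A pure-Python equivalent over already-extracted fragment
# keys, for illustration only:
from collections import Counter

def complexity_counts(fragments):
    counts = Counter(fragments)
    mt = sum(counts.values())                          # total read pairs
    m0 = len(counts)                                   # distinct positions
    m1 = sum(1 for c in counts.values() if c == 1)     # seen once
    m2 = sum(1 for c in counts.values() if c == 2)     # seen exactly twice
    return mt, m0, m1, m2

assert complexity_counts(["a", "a", "b", "c"]) == (4, 3, 2, 1)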
def create_ataqv_report(samples):
    """
    make the ataqv report from a set of ATAC-seq samples
    """
    data = samples[0][0]
    reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv")
    sentinel = os.path.join(reportdir, "index.html")
    if utils.file_exists(sentinel):
        ataqv_output = {"base": sentinel,
                        "secondary": get_ataqv_report_files(reportdir)}
        new_data = []
        for data in dd.sample_data_iterator(samples):
            data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
            new_data.append(data)
        return dd.get_samples_from_datalist(new_data)
    mkarv = config_utils.get_program("mkarv", dd.get_config(data))
    ataqv_files = []
    for data in dd.sample_data_iterator(samples):
        qc = dd.get_summary_qc(data)
        ataqv_file = tz.get_in(("ataqv", "base"), qc, None)
        if ataqv_file and utils.file_exists(ataqv_file):
            ataqv_files.append(ataqv_file)
    if not ataqv_files:
        return samples
    ataqv_json_file_string = " ".join(ataqv_files)
    with file_transaction(reportdir) as txreportdir:
        cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}"
        message = f"Creating ataqv report from {ataqv_json_file_string}."
        do.run(cmd, message)
    new_data = []
    ataqv_output = {"base": sentinel,
                    "secondary": get_ataqv_report_files(reportdir)}
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
        new_data.append(data)
    return dd.get_samples_from_datalist(new_data)
def read_content_section(lines):
    scalars = {}
    histograms = {}
    vector_definitions = {}
    vectors_by_index = {}
    while lines:
        # scalars
        scalar, lines = read_scalar(lines)
        if scalar is not None:
            emitter, signal, recorder, value = scalar
            scalars = assoc_in(scalars, [emitter, signal, recorder], value)
            continue
        # histograms
        histogram, lines = read_histogram(lines)
        if histogram is not None:
            emitter, signal, fields, bins, hist = histogram
            for field_name, field_value in fields.items():
                scalars = assoc_in(scalars, [emitter, signal, field_name],
                                   field_value)
            histograms = assoc_in(histograms, [emitter, signal], (bins, hist))
            continue
        # vector definitions
        vector, lines = read_vector(lines)
        if vector is not None:
            index, emitter, signal, column_spec = vector
            vector_definitions[index] = (emitter, signal, column_spec)
            vectors_by_index[index] = {"event": [], "time": [], "value": []}
            continue
        # vector data
        vector_data, lines = read_vector_data(lines)
        if vector_data is not None:
            index, column_string = vector_data
            columns = column_string.split()
            if index not in vector_definitions:
                raise ValueError(f"Missing definition for vector {index}")
            _, _, column_spec = vector_definitions[index]
            event, time, value = parse_vector_columns(columns, column_spec)
            v = vectors_by_index[index]
            v["event"].append(event)
            v["time"].append(time)
            v["value"].append(value)
            continue
        # something should have matched by now, if not we're finished
        break
    vectors = {}
    for i, v in vectors_by_index.items():
        emitter, signal, _ = vector_definitions[i]
        vectors = assoc_in(vectors, (emitter, signal), v)
    return (scalars, histograms, vectors), lines
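# Each read_* helper returns (parsed, remaining_lines), so the loop above is a
# chain of alternatives that restarts on the first match. The nesting is done
# with toolz.assoc_in, which builds scalars[emitter][signal][recorder] = value
# without mutating intermediate dicts; a quick standalone demonstration:
from toolz import assoc_in

scalars = {}
scalars = assoc_in(scalars, ["net.host", "rtt", "mean"], 0.42)
scalars = assoc_in(scalars, ["net.host", "rtt", "max"], 1.3)
assert scalars == {"net.host": {"rtt": {"mean": 0.42, "max": 1.3}}}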
def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        in_bam = tz.get_in(("atac", "align", "NF"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts",), count_file)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    cmd = ("{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {sorted_bam}")
    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts",), count_file)
    return [[data]]
def parse_simple(jobs, grid_dim):
    _jobs = []
    dim_length = get_dim_length(grid_dim.values())
    for job in jobs:
        for v_id in range(dim_length):
            _job = job
            for keys, values in grid_dim.items():
                split_keys = keys.split('.')
                value = values[v_id]
                _job = assoc_in(_job, split_keys, value)
            if len(grid_dim.values()) == 1:
                _job = assoc_in(_job, ['labels', split_keys[-1]], value)
            _jobs.append(_job)
    return _jobs
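# A hand-traced run of the expansion parse_simple performs, under the
# assumption that grid_dim maps dotted key paths to equal-length value lists
# (get_dim_length presumably returns that shared length). Hypothetical data,
# for illustration only:
from toolz import assoc_in

job = {"model": {"lr": None}, "labels": {}}
lr_values = [0.1, 0.01]                      # grid_dim == {"model.lr": lr_values}
expanded = [assoc_in(job, ["model", "lr"], lr) for lr in lr_values]
assert [j["model"]["lr"] for j in expanded] == [0.1, 0.01]
assert job["model"]["lr"] is None            # assoc_in never mutates the input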
def adjust_saturation(self, sender):
    self.CIColorControls = self.CIColorControls.copy()
    self.CIColorControls.setValue_forKey_(sender.value, 'inputSaturation')
    self._state_history.append(
        assoc_in(self._state_history[-1], ['CIColorControls'],
                 self.CIColorControls))
    self.update_image()
def adjust_exposure(self, sender):
    self.CIExposureAdjust = self.CIExposureAdjust.copy()
    self.CIExposureAdjust.setValue_forKey_(sender.value, 'inputEV')
    self._state_history.append(
        assoc_in(self._state_history[-1], ['CIExposureAdjust'],
                 self.CIExposureAdjust))
    self.update_image()
def test_change_project_directory(testing_ui, testing_project_directory,
                                  second_project_directory):
    def change_project_directory(project_directory):
        textEdit = testing_ui.settings_widget.project_settings_widget.textEdit
        jj = json.loads(textEdit.toPlainText())
        jj['project_directory'] = str(project_directory)
        textEdit.setPlainText(json.dumps(jj))
        testing_ui.settings_widget.project_settings_widget.commit()

    original_project = call_map.project_settings_module.Project(
        testing_project_directory)
    original_project.update_settings(
        original_project.load_from_persistent_storage())
    assert original_project.settings[call_map.project_settings_module.modules]

    change_project_directory(second_project_directory)

    second_project = call_map.project_settings_module.Project(
        second_project_directory)
    second_project.update_settings(
        second_project.load_from_persistent_storage())

    assert (testing_ui.settings_widget.project_settings_widget.project.
            project_directory == second_project_directory)
    assert tz.assoc_in(original_project.settings, [
        call_map.project_settings_module.project_settings, 'project_directory'
    ], second_project.project_directory) == second_project.settings

    change_project_directory(testing_project_directory)
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam,
                              dd.get_genome_build(data), out_dir,
                              dd.get_chip_method(data), data["resources"], data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        for fraction in atac.ATACRanges.keys():
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            logger.info(f"Running peak calling with {data['peak_fn']} on the "
                        f"{fraction} fraction of {chip_bam}.")
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None,
                                  dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"], data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak/broadPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                    break
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
                    break
        elif dd.get_chip_method(data) == "atac":
            for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    for data in dd.sample_data_iterator(samples):
        new_samples.append([
            tz.assoc_in(data, ("peaks_files", "consensus"),
                        {"main": consensusfile})])
    return new_samples
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam,
                              dd.get_genome_build(data), out_dir,
                              dd.get_chip_method(data), data["resources"], data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        MIN_READS_TO_CALL = 1000
        fractions = list(ATACRanges.keys()) + ["full"]
        for fraction in fractions:
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            if not bam.has_nalignments(chip_bam, MIN_READS_TO_CALL, data):
                logger.warning(f"{chip_bam} has fewer than {MIN_READS_TO_CALL} "
                               f"reads; peak calling would fail, so this "
                               f"fraction is skipped.")
                continue
            logger.info(f"Running peak calling with {data['peak_fn']} on the "
                        f"{fraction} fraction of {chip_bam}.")
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None,
                                  dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"], data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
def adjust_shadows(self, sender):
    self.CIHighlightShadowAdjust = self.CIHighlightShadowAdjust.copy()
    self.CIHighlightShadowAdjust.setValue_forKey_(sender.value,
                                                  'inputShadowAmount')
    self._state_history.append(
        assoc_in(self._state_history[-1], ['CIHighlightShadowAdjust'],
                 self.CIHighlightShadowAdjust))
    self.update_image()
def raw_shadow(self, sender):
    shadow = sender.value
    self._CIHighlightShadowAdjust_raw.setValue_forKey_(
        shadow, 'inputShadowAmount')
    self._state_history.append(
        assoc_in(self._state_history[-1],
                 ['rawfilter', 'inputLinearSpaceFilter'],
                 self._CIHighlightShadowAdjust_raw))
    self.update_image()
def parse_list(jobs, grid_dim):
    _jobs = []
    for job in jobs:
        for grid_val in grid_dim:
            _job = job
            for keys, value in grid_val.items():
                k_list = keys.split('.')
                old_v = get_in(k_list, job)
                new_v = deepmerge(old_v, value)
                _job = assoc_in(_job, k_list, new_v)
            _jobs.append(_job)
    return _jobs
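# parse_list relies on a deepmerge helper that is not shown here. A minimal
# sketch with the signature the call site implies (existing value first,
# override second, non-dict values replaced outright) might look like this:
def deepmerge(old, new):
    """Recursively merge new into old, preferring new on conflicts."""
    if isinstance(old, dict) and isinstance(new, dict):
        merged = dict(old)
        for k, v in new.items():
            merged[k] = deepmerge(old[k], v) if k in old else v
        return merged
    return new

assert deepmerge({"a": {"x": 1, "y": 2}},
                 {"a": {"y": 3}}) == {"a": {"x": 1, "y": 3}}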
def adjust_midtonecontrast(self, sender):
    self.CIToneCurve = self.CIToneCurve.copy()
    self.CIToneCurve.setValue_forKey_(
        self._CIVector.vectorWithX_Y_(0.25, 0.25 * (1.0 - sender.value)),
        'inputPoint1')
    self.CIToneCurve.setValue_forKey_(
        self._CIVector.vectorWithX_Y_(0.75, 0.75 * (1.0 + sender.value)),
        'inputPoint3')
    self._state_history.append(
        assoc_in(self._state_history[-1], ['CIToneCurve'], self.CIToneCurve))
    self.update_image()
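# The adjust_* handlers above all follow one pattern: copy the Core Image
# filter, mutate the copy, then append a new snapshot derived from the
# previous one via assoc_in so that undo can simply pop the history. A
# distilled sketch with plain dicts standing in for filter objects:
from toolz import assoc_in

history = [{"CIColorControls": {"inputSaturation": 1.0}}]
new_filter = dict(history[-1]["CIColorControls"])   # copy before mutating
new_filter["inputSaturation"] = 1.4
history.append(assoc_in(history[-1], ["CIColorControls"], new_filter))
assert history[0]["CIColorControls"]["inputSaturation"] == 1.0  # old state intact
assert history[1]["CIColorControls"]["inputSaturation"] == 1.4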
def _extract_entity(tweet, from_, get_out, root_from, root_to='entities'):
    li = get_in([root_from, from_], tweet)
    if not li:
        return tweet
    extracted = [h[get_out] for h in li]
    if not tweet[root_to]:
        tweet[root_to] = {}
    tweet = assoc_in(tweet, [root_to, from_], extracted)
    return tweet
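# Example run of _extract_entity on a toy tweet-like dict. The imports mirror
# what the surrounding module presumably pulls from toolz, and the data is
# hypothetical, for illustration only:
from toolz import get_in, assoc_in

tweet = {"extended_tweet": {"hashtags": [{"text": "gis"}, {"text": "maps"}]},
         "entities": {}}
tweet = _extract_entity(tweet, from_="hashtags", get_out="text",
                        root_from="extended_tweet")
assert tweet["entities"]["hashtags"] == ["gis", "maps"]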
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak files from a set of ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(f"Using peaks from full fraction since "
                            f"{dd.get_work_bam(data)} is single-ended.")
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    # called for its side effect of writing the SAF file next to the BED file
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"),
                           {"main": consensusfile})
        new_samples.append([data])
    return new_samples
def mk_node(ds, sources, cache, sources_path):
    existing = cache.get(ds.id, None)
    doc = ds.doc_without_lineage_sources
    if existing is not None:
        _ds, _doc, _sources = existing
        if not check_sources(sources, _sources):
            raise InvalidDocException(
                'Inconsistent lineage for repeated dataset with _id: {}'.format(ds.id))
        if doc != _doc:
            raise InvalidDocException(
                'Inconsistent metadata for repeated dataset with _id: {}'.format(ds.id))
        return _ds
    out_ds = toolz.assoc_in(doc, sources_path, sources)
    cache[ds.id] = (out_ds, doc, sources)
    return out_ds
def check_inconsistent_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add node E, then try adding A with modified E in the lineage,
    should fail to add ABCD
    """
    ds = SimpleDocNav(gen_dataset_test_dag(1313, force_tree=True))
    child_docs = [ds.sources[x].doc for x in ('ae',)]
    modified_doc = toolz.assoc_in(
        ds.doc, 'lineage.source_datasets.ae.label'.split('.'), 'modified')
    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(modified_doc),
    })
    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.get(ds.sources['ae'].id) is not None
    r = clirunner(['dataset', 'add', str(prefix / 'main.yml')])
    assert 'ERROR Inconsistent lineage dataset' in r.output
    assert index.datasets.has(ds.id) is False
    assert index.datasets.has(ds.sources['ab'].id) is False
    assert index.datasets.has(ds.sources['ac'].id) is False
    assert index.datasets.has(ds.sources['ac'].sources['cd'].id) is False
    # now again but skipping verification check
    r = clirunner(
        ['dataset', 'add', '--no-verify-lineage', str(prefix / 'main.yml')])
    assert index.datasets.has(ds.id)
    assert index.datasets.has(ds.sources['ab'].id)
    assert index.datasets.has(ds.sources['ac'].id)
    assert index.datasets.has(ds.sources['ac'].sources['cd'].id)
def run_args(args) -> pd.DataFrame:
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)
    for k, v in vars(args).items():
        if v is not None and "." in k:
            config = toolz.assoc_in(config, k.split("."), v)
            print(k, v)
    if args.logdir is not None:
        config['train']['logdir'] = args.logdir
    try:
        cfg = voluptuous.Schema(
            {
                'train': TrainConfig.schema,
                'version': str,
            },
            extra=voluptuous.REMOVE_EXTRA,
            required=True)(config)
    except voluptuous.error.Error as e:
        logger.error(humanize_error(config, e))
        sys.exit(1)
    logger.info(f"Parsed config\n{pformat(cfg)}")
    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)5s]:%(name)20s: %(message)s")
    train_cfg: TrainConfig = cfg['train']
    os.makedirs(train_cfg.logdir, exist_ok=True)
    fn = os.path.join(train_cfg.logdir, f"{getattr(args, 'name', 'mincall')}.log")
    h = logging.FileHandler(fn)
    h.setLevel(logging.INFO)
    h.setFormatter(formatter)
    name_filter = ExtraFieldsFilter({"run_name": args.name})
    root_logger = logging.getLogger()
    root_logger.addHandler(h)
    root_logger.addFilter(name_filter)
    logging.info(f"Added handler to {fn}")
    try:
        with tf.Graph().as_default():
            return run(cfg['train'])
    finally:
        root_logger.removeHandler(h)
        root_logger.removeFilter(name_filter)
def getIfPath(self, path):
    """
    returns a structured nested dictionary with all values and keys in a
    given path
    """
    if not path.endswith('/'):
        path += '/'
    try:
        nd = self.nested_dict()
        # path now always ends with '/', so drop the trailing empty segment
        keylen = len(path.split('/')) - 1
        for key, value in self.session.kv.find(path).items():
            keysToPut = key.split('/')[keylen:]
            nd.update(toolz.assoc_in(nd, keysToPut, value))
        if len(nd) > 0:
            return json.dumps(dict(nd))
        return 'Not Defined'
    except Exception:
        logging.error(traceback.format_exc())
        return json.dumps(traceback.format_exc())
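# The core transform getIfPath performs -- turning flat Consul-style 'a/b/c'
# keys into a nested dictionary -- shown standalone with hypothetical
# key/value data, for illustration only:
import toolz

flat = {"app/db/host": "localhost", "app/db/port": "5432"}
nd = {}
for key, value in flat.items():
    nd = toolz.assoc_in(nd, key.split("/"), value)
assert nd == {"app": {"db": {"host": "localhost", "port": "5432"}}}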
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
def pressure_correct(ic, path=None, sam_src="."):
    """Pressure correct the velocity fields using SAM"""
    if path is None:
        path = tempfile.mkdtemp(dir=".")
    prm = default_parameters()
    for key, val in [('nstop', 0), ('nsave3d', 1), ('nstat', 0),
                     ('nstatfrq', 1), ('dt', .0001), ('nsave3dstart', 0)]:
        prm = assoc_in(prm, ['parameters', key], val)
    case = InitialConditionCase(ic=ic, path=path, sam_src=sam_src, prm=prm)
    case.run()
    case.convert_files_to_netcdf()
    files = glob.glob(os.path.join(case.path, 'OUT_3D', '*.nc'))
    ic = ic.drop('time')
    ds = xr.open_dataset(files[-1]).load()\
           .assign_coords(x=ic.x, y=ic.y)
    shutil.rmtree(path)
    return ds
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = shift_ATAC(data)
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    # an unfiltered BAM file is useful for calculating some metrics later
    data = tz.assoc_in(data, ['chipseq', 'align', "unfiltered"], work_bam)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    # for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
    if method == "atac":
        data = atac.split_ATAC(data)
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
def run_args(args):
    if args.config is not None:
        with open(args.config) as f:
            config = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        config = {'version': "v0.1"}
    for k, v in vars(args).items():
        if v is not None and "." in k:
            config = toolz.assoc_in(config, k.split("."), v)
    if args.logdir is not None:
        config['embed']['logdir'] = args.logdir
    try:
        cfg = voluptuous.Schema(
            {
                'embed': EmbeddingCfg.schema,
                'version': str,
            },
            extra=voluptuous.REMOVE_EXTRA,
            required=True)(config)
        logger.info(f"Parsed config\n{pformat(cfg)}")
        run(cfg['embed'])
    except voluptuous.error.Error as e:
        logger.error(humanize_error(config, e))
        sys.exit(1)
def doc_without_lineage_sources(self):
    if self._doc_without is None:
        self._doc_without = toolz.assoc_in(self._doc, self._sources_path, {})
    return self._doc_without
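# doc_without_lineage_sources caches the document with its lineage subtree
# blanked out. The core transform, assuming a sources path like the
# 'lineage.source_datasets' used elsewhere in this codebase, and hypothetical
# document contents:
import toolz

doc = {"id": "A", "lineage": {"source_datasets": {"ab": {"id": "B"}}}}
stripped = toolz.assoc_in(doc, ("lineage", "source_datasets"), {})
assert stripped["lineage"]["source_datasets"] == {}
assert doc["lineage"]["source_datasets"]   # original document is untouched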
def load_results(path):
    results_dict = {}
    with path.open() as f:
        # version
        version_line = next(f)
        version_match = re.match(r"version (\d+)$", version_line)
        version = int(version_match.group(1))
        if version != 2:
            raise ValueError("unknown version")
        results_dict["version"] = version
        # run
        run_line = next(f)
        run_match = re.match(r"run (\S+)$", run_line)
        results_dict["run"] = run_match.group(1)
        # attributes
        found_attributes = set()
        attr_regex = r"attr (\S+) (.+)$"
        for _ in range(len(ATTRIBUTES)):
            attr_line = next(f)
            attr_match = re.match(attr_regex, attr_line)
            attribute, value = attr_match.groups()
            assert attribute not in found_attributes
            found_attributes.add(attribute)
            if attribute in INT_ATTRIBUTES:
                value = int(value)
            elif attribute in DATETIME_ATTRIBUTES:
                value = datetime.datetime.strptime(value, "%Y%m%d-%H:%M:%S")
            results_dict[attribute] = value
        assert found_attributes == set(ATTRIBUTES)
        # itervars
        l = next(f)
        itervars = {}
        itervar_regex = r"itervar (\S+) (\S+)$"
        match = re.match(itervar_regex, l)
        while match:
            var, value = match.groups()
            assert var not in itervars
            itervars[var] = value
            l = next(f)
            match = re.match(itervar_regex, l)
        results_dict["itervars"] = itervars
        # params
        params = {}
        param_regex = r"param (\S+) (\S+)$"
        match = re.match(param_regex, l)
        while match:
            param, value = match.groups()
            assert param not in params
            params[param] = value
            l = next(f)
            match = re.match(param_regex, l)
        results_dict["params"] = params
        # scalars
        scalars = {}
        assert l == "\n"
        l = next(f)
        scalar_regex = r"scalar (\S+) (\S+):(\S+) (\S+)$"
        statistic_regex = r"statistic (\S+) (\S+):histogram$"
        field_regex = r"field (\S+) (\S+)$"
        bin_regex = r"bin\s+(\S+)\s+(\S+)$"
        while True:
            scalar_match = re.match(scalar_regex, l)
            statistic_match = re.match(statistic_regex, l)
            attr_match = re.match(attr_regex, l)
            if scalar_match:
                emitter, signal, recorder, value = scalar_match.groups()
                scalars = assoc_in(scalars, [emitter, signal, recorder],
                                   parse_value(value))
                l = next(f)
                continue
            elif statistic_match:
                # histogram fields
                emitter, signal = statistic_match.groups()
                found_fields = set()
                for _ in range(len(HISTOGRAM_FIELDS)):
                    l = next(f)
                    field_match = re.match(field_regex, l)
                    recorder, value = field_match.groups()
                    found_fields.add(recorder)
                    scalars = assoc_in(scalars, [emitter, signal, recorder],
                                       parse_value(value))
                assert found_fields == set(HISTOGRAM_FIELDS)
                # histogram bins
                bin_edges = []
                hist = []
                l = next(f)
                bin_match = re.match(bin_regex, l)
                while bin_match:
                    left_bin_edge, count = bin_match.groups()
                    bin_edges.append(float(left_bin_edge))
                    hist.append(int(count))
                    l = next(f)
                    bin_match = re.match(bin_regex, l)
                bin_edges.append(math.inf)
                scalars = assoc_in(
                    scalars, [emitter, signal, "histogram"],
                    (np.array(hist), np.array(bin_edges)))
            elif attr_match:
                assert attr_match.group(1) == "source"
                # ignore source of signals
                l = next(f)
            else:
                assert l == "\n"
                try:
                    next(f)
                except StopIteration:
                    break
                else:
                    assert False
        results_dict["scalars"] = scalars
    return Results(**results_dict)
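# The scalar lines load_results consumes look like
# 'scalar <emitter> <signal>:<recorder> <value>'. A quick standalone check of
# the regex against a made-up line (in load_results, parse_value then coerces
# the value string):
import re

scalar_regex = r"scalar (\S+) (\S+):(\S+) (\S+)$"
m = re.match(scalar_regex, "scalar Net.host[0] rtt:mean 0.042\n")
assert m.groups() == ("Net.host[0]", "rtt", "mean", "0.042")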
def timed_learner(*args: Any, **kwargs: Any) -> LearnerReturnType:
    t0 = time()
    (p, d, l) = learner(*args, **kwargs)
    return p, d, fp.assoc_in(l, [learner_name, 'running_time'],
                             "%2.3f s" % (time() - t0))
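# timed_learner above is the inner function of an fklearn-style decorator:
# learner and learner_name are free variables bound by the enclosing wrapper.
# A self-contained sketch of that enclosing pattern (the wrapper and learner
# names here are assumptions for illustration, not fklearn's actual API):
from time import time
from toolz import assoc_in

def log_learner_time(learner, learner_name):
    def timed(*args, **kwargs):
        t0 = time()
        p, d, l = learner(*args, **kwargs)
        return p, d, assoc_in(l, [learner_name, 'running_time'],
                              "%2.3f s" % (time() - t0))
    return timed

def dummy_learner(df):
    # trivial learner: identity predictor, untouched data, empty log entry
    return (lambda d: d), df, {'dummy_learner': {}}

p, d, log = log_learner_time(dummy_learner, 'dummy_learner')([1, 2, 3])
assert 'running_time' in log['dummy_learner']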