Example #1
def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
    on the estimated insert sizes
    uses the current working BAM file if no BAM file is supplied
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    # reads on the negative strand have a negative template_length value
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd + \
                    f'-F "(template_length > {arange.min} and template_length < {arange.max}) or ' + \
                    f'(template_length < {-arange.min} and template_length > {-arange.max})" ' + \
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
            bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data
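All of these examples lean on the same idiom: tz.assoc_in returns a new dict with a value set at a nested key path, creating intermediate dicts as needed and leaving the input mapping untouched. A minimal sketch of the pattern used above, with a toy sample dict standing in for bcbio's real data structure:

import toolz as tz

data = {"atac": {"peaks": "sample1-peaks.bed"}}
split_files = {"NF": "sample1-NF.bam", "full": "sample1-sort.bam"}

# returns a new dict with data["atac"]["align"] set; the input dict is not mutated
new_data = tz.assoc_in(data, ["atac", "align"], split_files)

assert new_data["atac"]["align"]["NF"] == "sample1-NF.bam"
assert "align" not in data["atac"]  # original dict is unchanged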
Example #2
def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed
    mitochondrial reads should be removed 
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(
        metrics_dir, f"{dd.get_sample_name(data)}-atac-metrics.csv")
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'],
                           metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (
            f"{bedtools} bamtobed -bedpe -i {work_bam} | "
            "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
            "sort | "
            "uniq -c | "
            "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
            "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
            f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data
Example #3
def create_ataqv_report(samples):
    """
    make the ataqv report from a set of ATAC-seq samples
    """
    data = samples[0][0]
    new_samples = []
    reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv")
    sentinel = os.path.join(reportdir, "index.html")
    if utils.file_exists(sentinel):
        ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
        new_data = []
        for data in dd.sample_data_iterator(samples):
            data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
            new_data.append(data)
        return dd.get_samples_from_datalist(new_data)
    mkarv = config_utils.get_program("mkarv", dd.get_config(data))
    ataqv_files = []
    for data in dd.sample_data_iterator(samples):
        qc = dd.get_summary_qc(data)
        ataqv_file = tz.get_in(("ataqv", "base"), qc, None)
        if ataqv_file and utils.file_exists(ataqv_file):
            ataqv_files.append(ataqv_file)
    if not ataqv_files:
        return samples
    ataqv_json_file_string = " ".join(ataqv_files)
    with file_transaction(reportdir) as txreportdir:
        cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}"
        message = f"Creating ataqv report from {ataqv_json_file_string}."
        do.run(cmd, message)
    new_data = []
    ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
        new_data.append(data)
    return dd.get_samples_from_datalist(new_data)
Example #4
def read_content_section(lines):
    scalars = {}
    histograms = {}
    vector_definitions = {}
    vectors_by_index = {}

    while lines:
        # scalars
        scalar, lines = read_scalar(lines)
        if scalar is not None:
            emitter, signal, recorder, value = scalar
            scalars = assoc_in(scalars, [emitter, signal, recorder], value)
            continue

        # histograms
        histogram, lines = read_histogram(lines)
        if histogram is not None:
            emitter, signal, fields, bins, hist = histogram
            for field_name, field_value in fields.items():
                scalars = assoc_in(scalars, [emitter, signal, field_name],
                                   field_value)
            histograms = assoc_in(histograms, [emitter, signal], (bins, hist))
            continue

        # vector definitions
        vector, lines = read_vector(lines)
        if vector is not None:
            index, emitter, signal, column_spec = vector
            vector_definitions[index] = (emitter, signal, column_spec)
            vectors_by_index[index] = {"event": [], "time": [], "value": []}
            continue

        # vector data
        vector_data, lines = read_vector_data(lines)
        if vector_data is not None:
            index, column_string = vector_data
            columns = column_string.split()

            if index not in vector_definitions:
                raise ValueError(f"Missing definition for vector {index}")
            _, _, column_spec = vector_definitions[index]

            event, time, value = parse_vector_columns(columns, column_spec)
            v = vectors_by_index[index]
            v["event"].append(event)
            v["time"].append(time)
            v["value"].append(value)
            continue

        # something should have matched by now, if not we're finished
        break

    vectors = {}
    for i, v in vectors_by_index.items():
        emitter, signal, _ = vector_definitions[i]
        vectors = assoc_in(vectors, (emitter, signal), v)

    return (scalars, histograms, vectors), lines
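The parser above accumulates its results functionally: each scalar is folded into the nested scalars dict with another assoc_in call keyed by emitter, signal and recorder. A small sketch of that accumulation, using made-up emitter/signal/recorder names:

from toolz import assoc_in

scalars = {}
records = [("host[0]", "rtt", "mean", 0.12),
           ("host[0]", "rtt", "max", 0.30),
           ("host[1]", "rtt", "mean", 0.15)]
for emitter, signal, recorder, value in records:
    scalars = assoc_in(scalars, [emitter, signal, recorder], value)

# scalars == {"host[0]": {"rtt": {"mean": 0.12, "max": 0.30}},
#             "host[1]": {"rtt": {"mean": 0.15}}}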
Example #5
def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        in_bam = tz.get_in(("atac", "align", "NF"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts"), count)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = (
        "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
        "{paired_flag} {sorted_bam}")

    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts"), count)
    return [[data]]
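The keys argument to tz.assoc_in and tz.get_in is simply iterated, so a bare string such as "peak_counts" would nest one dict level per character; that is why the calls above pass a list (or a real tuple such as ("peak_counts", "NF")) rather than a parenthesised string, which is not a tuple. A small illustration of the difference:

import toolz as tz

wrong = tz.assoc_in({}, ("peak_counts"), "sample1.counts")  # nests 'p' -> 'e' -> 'a' -> ...
right = tz.assoc_in({}, ["peak_counts"], "sample1.counts")

assert "p" in wrong
assert right == {"peak_counts": "sample1.counts"}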
Example #6
def parse_simple(jobs, grid_dim):
    _jobs = []
    dim_length = get_dim_length(grid_dim.values())
    for job in jobs:
        for v_id in range(dim_length):
            _job = job
            for keys, values in grid_dim.items():
                split_keys = keys.split('.')
                value = values[v_id]
                _job = assoc_in(_job, split_keys, value)
            if len(grid_dim.values()) == 1:
                _job = assoc_in(_job, ['labels', split_keys[-1]], value)
            _jobs.append(_job)
    return _jobs
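parse_simple splits each dotted grid key into a key path and writes one value per grid index into a copy of the job, so a single grid definition fans out into several jobs. A hedged sketch of that expansion with a hypothetical grid:

from toolz import assoc_in

job = {"model": {"lr": 0.1}, "labels": {}}
grid_dim = {"model.lr": [0.01, 0.001]}

expanded = []
for v in grid_dim["model.lr"]:
    keys = "model.lr".split(".")  # ["model", "lr"]
    expanded.append(assoc_in(job, keys, v))  # each expanded job gets its own lr

assert [j["model"]["lr"] for j in expanded] == [0.01, 0.001]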
Example #7
 def adjust_saturation(self, sender):
     self.CIColorControls = self.CIColorControls.copy()
     self.CIColorControls.setValue_forKey_(sender.value, 'inputSaturation')
     self._state_history.append(
         assoc_in(self._state_history[-1], ['CIColorControls'],
                  self.CIColorControls))
     self.update_image()
Example #8
 def adjust_exposure(self, sender):
     self.CIExposureAdjust = self.CIExposureAdjust.copy()
     self.CIExposureAdjust.setValue_forKey_(sender.value, 'inputEV')
     self._state_history.append(
         assoc_in(self._state_history[-1], ['CIExposureAdjust'],
                  self.CIExposureAdjust))
     self.update_image()
Example #9
def test_change_project_directory(testing_ui, testing_project_directory,
                                  second_project_directory):
    def change_project_directory(project_directory):
        textEdit = testing_ui.settings_widget.project_settings_widget.textEdit
        jj = json.loads(textEdit.toPlainText())
        jj['project_directory'] = str(project_directory)
        textEdit.setPlainText(json.dumps(jj))
        testing_ui.settings_widget.project_settings_widget.commit()

    original_project = call_map.project_settings_module.Project(
        testing_project_directory)

    original_project.update_settings(
        original_project.load_from_persistent_storage())

    assert original_project.settings[call_map.project_settings_module.modules]

    change_project_directory(second_project_directory)
    second_project = call_map.project_settings_module.Project(
        second_project_directory)
    second_project.update_settings(
        second_project.load_from_persistent_storage())

    assert (testing_ui.settings_widget.project_settings_widget.project.
            project_directory == second_project_directory)

    assert tz.assoc_in(original_project.settings, [
        call_map.project_settings_module.project_settings, 'project_directory'
    ], second_project.project_directory) == second_project.settings

    change_project_directory(testing_project_directory)
Example #10
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam,
                              dd.get_genome_build(data), out_dir,
                              dd.get_chip_method(data), data["resources"],
                              data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        for fraction in atac.ATACRanges.keys():
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            logger.info(
                f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}."
            )
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None,
                                  dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"],
                                  data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
Example #11
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts"), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
Example #12
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak/broadPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                    break
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
                    break
        elif dd.get_chip_method(data) == "atac":
            for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    for data in dd.sample_data_iterator(samples):
        new_samples.append([
            tz.assoc_in(data, ("peaks_files", "consensus"),
                        {"main": consensusfile})
        ])
    return new_samples
Example #13
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                            dd.get_chip_method(data), data["resources"], data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        fractions = list(ATACRanges.keys()) + ["full"]
        for fraction in fractions:
            MIN_READS_TO_CALL = 1000
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            if not bam.has_nalignments(chip_bam, MIN_READS_TO_CALL, data):
                logger.warn(f"{chip_bam} has less than {MIN_READS_TO_CALL}, peak calling will fail so skip this fraction.")
                continue
            logger.info(f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}.")
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None, dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"], data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
Example #14
 def adjust_shadows(self, sender):
     self.CIHighlightShadowAdjust = self.CIHighlightShadowAdjust.copy()
     self.CIHighlightShadowAdjust.setValue_forKey_(sender.value,
                                                   'inputShadowAmount')
     self._state_history.append(
         assoc_in(self._state_history[-1], ['CIHighlightShadowAdjust'],
                  self.CIHighlightShadowAdjust))
     self.update_image()
Example #15
 def raw_shadow(self, sender):
     shadow = sender.value
     self._CIHighlightShadowAdjust_raw.setValue_forKey_(
         shadow, 'inputShadowAmount')
     self._state_history.append(
         assoc_in(self._state_history[-1],
                  ['rawfilter', 'inputLinearSpaceFilter'],
                  self._CIHighlightShadowAdjust_raw))
     self.update_image()
Example #16
def parse_list(jobs, grid_dim):
    _jobs = []
    for job in jobs:
        for grid_val in grid_dim:
            _job = job
            for keys, value in grid_val.items():
                k_list = keys.split('.')
                old_v = get_in(k_list, job)
                new_v = deepmerge(old_v, value)
                _job = assoc_in(_job, k_list, new_v)
            _jobs.append(_job)
    return _jobs
Example #17
 def adjust_midtonecontrast(self, sender):
     self.CIToneCurve = self.CIToneCurve.copy()
     self.CIToneCurve.setValue_forKey_(
         self._CIVector.vectorWithX_Y_(0.25, 0.25 * (1.0 - sender.value)),
         'inputPoint1')
     self.CIToneCurve.setValue_forKey_(
         self._CIVector.vectorWithX_Y_(0.75, 0.75 * (1.0 + sender.value)),
         'inputPoint3')
     self._state_history.append(
         assoc_in(self._state_history[-1], ['CIToneCurve'],
                  self.CIToneCurve))
     self.update_image()
Example #18
def _extract_entity(tweet, from_, get_out, root_from, root_to='entities'):
    li = get_in([root_from, from_], tweet)
    if not li:
        return tweet

    extracted = [h[get_out] for h in li]

    if not tweet[root_to]:
        tweet[root_to] = {}

    tweet = assoc_in(tweet, [root_to, from_], extracted)

    return tweet
Example #19
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(
                    f"Using peaks from full fraction since {dd.get_work_bam(data)} is single-ended."
                )
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data,
                                    []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"),
                           {"main": consensusfile})
        new_samples.append([data])
    return new_samples
Example #20
    def mk_node(ds, sources, cache, sources_path):
        existing = cache.get(ds.id, None)
        doc = ds.doc_without_lineage_sources

        if existing is not None:
            _ds, _doc, _sources = existing

            if not check_sources(sources, _sources):
                raise InvalidDocException('Inconsistent lineage for repeated dataset with _id: {}'.format(ds.id))

            if doc != _doc:
                raise InvalidDocException('Inconsistent metadata for repeated dataset with _id: {}'.format(ds.id))

            return _ds

        out_ds = toolz.assoc_in(doc, sources_path, sources)
        cache[ds.id] = (out_ds, doc, sources)
        return out_ds
Example #21
def check_inconsistent_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add node E,
    then try adding A with modified E in the lineage, should fail to add ABCD
    """
    ds = SimpleDocNav(gen_dataset_test_dag(1313, force_tree=True))

    child_docs = [ds.sources[x].doc for x in ('ae', )]
    modified_doc = toolz.assoc_in(
        ds.doc, 'lineage.source_datasets.ae.label'.split('.'), 'modified')

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(modified_doc),
    })

    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.get(ds.sources['ae'].id) is not None

    r = clirunner(['dataset', 'add', str(prefix / 'main.yml')])

    assert 'ERROR Inconsistent lineage dataset' in r.output

    assert index.datasets.has(ds.id) is False
    assert index.datasets.has(ds.sources['ab'].id) is False
    assert index.datasets.has(ds.sources['ac'].id) is False
    assert index.datasets.has(ds.sources['ac'].sources['cd'].id) is False

    # now again but skipping verification check
    r = clirunner(
        ['dataset', 'add', '--no-verify-lineage',
         str(prefix / 'main.yml')])

    assert index.datasets.has(ds.id)
    assert index.datasets.has(ds.sources['ab'].id)
    assert index.datasets.has(ds.sources['ac'].id)
    assert index.datasets.has(ds.sources['ac'].sources['cd'].id)
Example #22
def run_args(args) -> pd.DataFrame:
    with open(args.config) as f:
        config = yaml.load(f)
    for k, v in vars(args).items():
        if v is not None and "." in k:
            config = toolz.assoc_in(config, k.split("."), v)
            print(k, v)
    if args.logdir is not None:
        config['train']['logdir'] = args.logdir
    try:
        cfg = voluptuous.Schema({
            'train': TrainConfig.schema,
            'version': str,
        },
                                extra=voluptuous.REMOVE_EXTRA,
                                required=True)(config)
    except voluptuous.error.Error as e:
        logger.error(humanize_error(config, e))
        sys.exit(1)

    logger.info(f"Parsed config\n{pformat(cfg)}")
    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)5s]:%(name)20s: %(message)s")
    train_cfg: TrainConfig = cfg['train']
    os.makedirs(train_cfg.logdir, exist_ok=True)
    fn = os.path.join(train_cfg.logdir,
                      f"{getattr(args, 'name', 'mincall')}.log")
    h = (logging.FileHandler(fn))
    h.setLevel(logging.INFO)
    h.setFormatter(formatter)
    name_filter = ExtraFieldsFilter({"run_name": args.name})
    root_logger = logging.getLogger()

    root_logger.addHandler(h)
    root_logger.addFilter(name_filter)
    logging.info(f"Added handler to {fn}")
    try:
        with tf.Graph().as_default():
            return run(cfg['train'])
    finally:
        root_logger.removeHandler(h)
        root_logger.removeFilter(name_filter)
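run_args maps dotted command-line names onto the nested YAML config with the same split-and-assoc_in trick, so an option like train.logdir overrides a single leaf while the rest of the config is preserved. A hedged sketch of that override pattern (the option names here are assumptions, not mincall's actual CLI):

import toolz

config = {"train": {"logdir": "/tmp/run1", "batch_size": 32}, "version": "v0.1"}
overrides = {"train.batch_size": 64, "train.model.depth": 5}

for k, v in overrides.items():
    if v is not None and "." in k:
        config = toolz.assoc_in(config, k.split("."), v)

assert config["train"]["batch_size"] == 64
assert config["train"]["model"]["depth"] == 5  # missing levels are created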
Example #23
    def getIfPath(self, path):
        ''' returns a structured nested dictionary with all values and keys in a given path '''
        if not path.endswith('/'):
            path+='/'
        try:
            nd = self.nested_dict()
            keylen = len(path.split('/'))
            if path.endswith('/'):
                keylen = len(path.split('/')) - 1

            for key, value in self.session.kv.find(path).items():
                keysToPut = [i for i in key.split('/')[keylen:]]
                nd.update(toolz.assoc_in(nd, keysToPut, value))
            if len(nd) > 0:
                return json.dumps(dict(nd))
            return 'Not Defined'

        except Exception as e:
            logging.error(traceback.format_exc())
            return json.dumps(traceback.format_exc())
Example #24
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts"), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
Example #25
def pressure_correct(ic, path=None, sam_src="."):
    """Pressure correct the velocity fields using SAM"""

    if path is None:
        path = tempfile.mkdtemp(dir=".")

    prm = default_parameters()

    for key, val in [('nstop', 0), ('nsave3d', 1), ('nstat', 0),
                     ('nstatfrq', 1), ('dt', .0001), ('nsave3dstart', 0)]:
        prm = assoc_in(prm, ['parameters', key], val)

    case = InitialConditionCase(ic=ic, path=path, sam_src=sam_src, prm=prm)
    case.run()
    case.convert_files_to_netcdf()

    files = glob.glob(os.path.join(case.path, 'OUT_3D', '*.nc'))

    ic = ic.drop('time')
    ds = xr.open_dataset(files[-1]).load()\
                                   .assign_coords(x=ic.x, y=ic.y)
    shutil.rmtree(path)

    return ds
Example #26
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = shift_ATAC(data)
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    # an unfiltered BAM file is useful for calculating some metrics later
    data = tz.assoc_in(data, ['chipseq', 'align', "unfiltered"], work_bam)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    # for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
    if method == "atac":
        data = atac.split_ATAC(data)
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
Example #27
def run_args(args):
    if args.config is not None:
        with open(args.config) as f:
            config = yaml.load(f)
    else:
        config = {'version': "v0.1"}

    for k, v in vars(args).items():
        if v is not None and "." in k:
            config = toolz.assoc_in(config, k.split("."), v)
    if args.logdir is not None:
        config['embed']['logdir'] = args.logdir
    try:
        cfg = voluptuous.Schema({
            'embed': EmbeddingCfg.schema,
            'version': str,
        },
                                extra=voluptuous.REMOVE_EXTRA,
                                required=True)(config)
        logger.info(f"Parsed config\n{pformat(cfg)}")
        run(cfg['embed'])
    except voluptuous.error.Error as e:
        logger.error(humanize_error(config, e))
        sys.exit(1)
Example #28
    def doc_without_lineage_sources(self):
        if self._doc_without is None:
            self._doc_without = toolz.assoc_in(self._doc, self._sources_path,
                                               {})

        return self._doc_without
Example #29
def load_results(path):
    results_dict = {}

    with path.open() as f:
        # version
        version_line = next(f)
        version_match = re.match(r"version (\d+)$", version_line)
        version = int(version_match.group(1))
        if version != 2:
            raise ValueError("unknown version")
        results_dict["version"] = version

        # run
        run_line = next(f)
        run_match = re.match(r"run (\S+)$", run_line)
        results_dict["run"] = run_match.group(1)

        # attributes
        found_attributes = set()
        attr_regex = r"attr (\S+) (.+)$"
        for _ in range(len(ATTRIBUTES)):
            attr_line = next(f)
            attr_match = re.match(attr_regex, attr_line)
            attribute, value = attr_match.groups()
            assert attribute not in found_attributes
            found_attributes.add(attribute)

            if attribute in INT_ATTRIBUTES:
                value = int(value)
            elif attribute in DATETIME_ATTRIBUTES:
                value = datetime.datetime.strptime(value, "%Y%m%d-%H:%M:%S")

            results_dict[attribute] = value
        assert found_attributes == set(ATTRIBUTES)

        # itervars
        l = next(f)
        itervars = {}
        itervar_regex = r"itervar (\S+) (\S+)$"
        match = re.match(itervar_regex, l)
        while match:
            var, value = match.groups()
            assert var not in itervars
            itervars[var] = value
            l = next(f)
            match = re.match(itervar_regex, l)
        results_dict["itervars"] = itervars

        # params
        params = {}
        param_regex = r"param (\S+) (\S+)$"
        match = re.match(param_regex, l)
        while match:
            param, value = match.groups()
            assert param not in params
            params[param] = value
            l = next(f)
            match = re.match(param_regex, l)
        results_dict["params"] = params

        # scalars
        scalars = {}
        assert l == "\n"
        l = next(f)
        scalar_regex = "scalar (\S+) (\S+):(\S+) (\S+)$"
        statistic_regex = "statistic (\S+) (\S+):histogram$"
        field_regex = "field (\S+) (\S+)$"
        bin_regex = "bin\s+(\S+)\s+(\S+)$"
        while True:
            scalar_match = re.match(scalar_regex, l)
            statistic_match = re.match(statistic_regex, l)
            attr_match = re.match(attr_regex, l)

            if scalar_match:
                emitter, signal, recorder, value = scalar_match.groups()
                scalars = assoc_in(scalars, [emitter, signal, recorder], parse_value(value))
                l = next(f)
                continue

            elif statistic_match:
                # histogram fields
                emitter, signal = statistic_match.groups()
                found_fields = set()
                for _ in range(len(HISTOGRAM_FIELDS)):
                    l = next(f)
                    field_match = re.match(field_regex, l)
                    recorder, value = field_match.groups()
                    found_fields.add(recorder)
                    scalars = assoc_in(scalars, [emitter, signal, recorder], parse_value(value))
                assert found_fields == set(HISTOGRAM_FIELDS)

                # histogram bins
                bin_edges = []
                hist = []
                l = next(f)
                bin_match = re.match(bin_regex, l)
                while bin_match:
                    left_bin_edge, count = bin_match.groups()
                    bin_edges.append(float(left_bin_edge))
                    hist.append(int(count))
                    l = next(f)
                    bin_match = re.match(bin_regex, l)
                bin_edges.append(math.inf)
                scalars = assoc_in(
                    scalars,
                    [emitter, signal, "histogram"],
                    (np.array(hist), np.array(bin_edges))
                )

            elif attr_match:
                assert attr_match.group(1) == "source"  # ignore source of signals
                l = next(f)

            else:
                assert l == "\n"
                try:
                    next(f)
                except StopIteration:
                    break
                else:
                    assert False
    
    
        results_dict["scalars"] = scalars
    return Results(**results_dict)
Example #30
 def timed_learner(*args: Any, **kwargs: Any) -> LearnerReturnType:
     t0 = time()
     (p, d, l) = learner(*args, **kwargs)
     return p, d, fp.assoc_in(l, [learner_name, 'running_time'],
                              "%2.3f s" % (time() - t0))