Example #1
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files, log_file,
                   data):
    """Trimming with cutadapt.
    """
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    cmd = _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files,
                             data)
    if len(fastq_files) == 1:
        of = [out_files[0], log_file]
        message = "Trimming %s in single end mode with cutadapt." % (
            fastq_files[0])
        with file_transaction(data, of) as of_tx:
            of1_tx, log_tx = of_tx
            do.run(cmd.format(**locals()), message)
    else:
        of = out_files + [log_file]
        with file_transaction(data, of) as tx_out_files:
            of1_tx, of2_tx, log_tx = tx_out_files
            tmp_fq1 = utils.append_stem(of1_tx, ".tmp")
            tmp_fq2 = utils.append_stem(of2_tx, ".tmp")
            singles_file = of1_tx + ".single"
            message = "Trimming %s and %s in paired end mode with cutadapt." % (
                fastq_files[0], fastq_files[1])
            do.run(cmd.format(**locals()), message)
    return out_files
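Every example on this page leans on bcbio's path helpers, most visibly utils.append_stem. For orientation only, the sketch below shows the behaviour these calls appear to assume (a suffix inserted between the file stem and its extension); it is a hypothetical stand-in, not the project's actual implementation, which also appears to accept lists of paths and may treat double extensions such as .fastq.gz specially.

import os

def append_stem(filename, suffix):
    # Hypothetical sketch: insert ``suffix`` between the stem and the extension,
    # e.g. append_stem("sample.fastq", ".clean") -> "sample.clean.fastq".
    # The real bcbio.utils.append_stem may differ (lists of paths, ".gz" handling).
    stem, ext = os.path.splitext(filename)
    return "{0}{1}{2}".format(stem, suffix, ext)

print(append_stem("sample.fastq", ".clean"))  # sample.clean.fastq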
Example #2
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    removes reads from a pair of fastq files that are shorter than
    a minimum length. removes both reads of a pair if either one falls
    below the threshold, while maintaining the order of the reads

    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, open(fq2_out, 'w') as fq2_out_handle, open(fq1_single, 'w') as fq1_single_handle, open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            else:
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_format))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
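A brief orientation note: izip above is Python 2's itertools.izip, and quality_format is passed straight to Biopython's SeqIO, so it is a format name such as "fastq". Under Python 3 the same lock-step pairing of two FASTQ files can be sketched as follows (a hypothetical helper, not part of bcbio):

from Bio import SeqIO

def paired_fastq_records(fq1, fq2, fmt="fastq"):
    # Hypothetical Python 3 sketch: iterate two FASTQ files in lock-step,
    # yielding (read1, read2) pairs in order; the built-in zip replaces
    # itertools.izip used in the Python 2 example above.
    return zip(SeqIO.parse(fq1, fmt), SeqIO.parse(fq2, fmt))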
Example #3
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    removes reads from a pair of fastq files that are shorter than
    a minimum length. removes both reads of a pair if either one falls
    below the threshold, while maintaining the order of the reads

    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, open(fq2_out, 'w') as fq2_out_handle, open(fq1_single, 'w') as fq1_single_handle, open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            else:
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_format))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
Example #4
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files, log_file, config):
    """Trimming with cutadapt, using version installed with bcbio-nextgen.

    Uses the system executable to find the version next to our Anaconda Python.
    TODO: Could we use cutadapt as a library to avoid this?
    """
    if all([file_exists(x) for x in out_files]):
        return out_files
    cmd = _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files)
    if len(fastq_files) == 1:
        of = [out_files[0], log_file]
        message = "Trimming %s in single end mode with cutadapt." % (fastq_files[0])
        with file_transaction(config, of) as of_tx:
            of1_tx, log_tx = of_tx
            do.run(cmd.format(**locals()), message)
    else:
        of = out_files + [log_file]
        with file_transaction(config, of) as tx_out_files:
            of1_tx, of2_tx, log_tx = tx_out_files
            tmp_fq1 = append_stem(of1_tx, ".tmp")
            tmp_fq2 = append_stem(of2_tx, ".tmp")
            singles_file = of1_tx + ".single"
            message = "Trimming %s and %s in paired end mode with cutadapt." % (fastq_files[0],
                                                                                fastq_files[1])
            do.run(cmd.format(**locals()), message)
    return out_files
Example #5
def _make_isomir_counts(data, srna_type="seqbuster", out_dir=None, stem=""):
    """
    Parse miraligner files to create count matrix.
    """
    work_dir = dd.get_work_dir(data[0][0])
    if not out_dir:
        out_dir = op.join(work_dir, "mirbase")
    out_novel_isomir = append_stem(op.join(out_dir, "counts.tsv"), stem)
    out_novel_mirna = append_stem(op.join(out_dir, "counts_mirna.tsv"), stem)
    logger.debug("Create %s count data at %s." % (srna_type, out_dir))
    if file_exists(out_novel_mirna):
        return [out_novel_mirna, out_novel_isomir]
    out_dts = []
    for sample in data:
        if sample[0].get(srna_type):
            miraligner_fn = sample[0][srna_type]
            reads = _read_miraligner(miraligner_fn)
            if reads:
                out_file, dt, dt_pre = _tab_output(reads, miraligner_fn + ".back", dd.get_sample_name(sample[0]))
                out_dts.append(dt)
            else:
                logger.debug("WARNING::%s has NOT miRNA annotated for %s. Check if fasta files is small or species value." % (dd.get_sample_name(sample[0]), srna_type))
    if out_dts:
        out_files = _create_counts(out_dts, out_dir)
        out_files = [move_safe(out_files[0], out_novel_isomir), move_safe(out_files[1], out_novel_mirna)]
        return out_files
    else:
        logger.debug("WARNING::any samples have miRNA annotated for %s. Check if fasta files is small or species value." % srna_type)
Example #6
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files, log_file,
                   config):
    """Trimming with cutadapt, using version installed with bcbio-nextgen.

    Uses the system executable to find the version next to our Anaconda Python.
    TODO: Could we use cutadapt as a library to avoid this?
    """
    if all([file_exists(x) for x in out_files]):
        return out_files
    cmd = _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files)
    if len(fastq_files) == 1:
        of = [out_files[0], log_file]
        message = "Trimming %s in single end mode with cutadapt." % (
            fastq_files[0])
        with file_transaction(config, of) as of_tx:
            of1_tx, log_tx = of_tx
            do.run(cmd.format(**locals()), message)
    else:
        of = out_files + [log_file]
        with file_transaction(config, of) as tx_out_files:
            of1_tx, of2_tx, log_tx = tx_out_files
            tmp_fq1 = append_stem(of1_tx, ".tmp")
            tmp_fq2 = append_stem(of2_tx, ".tmp")
            singles_file = of1_tx + ".single"
            message = "Trimming %s and %s in paired end mode with cutadapt." % (
                fastq_files[0], fastq_files[1])
            do.run(cmd.format(**locals()), message)
    return out_files
Example #7
def trim_srna_sample(data):
    """
    Remove the 3' adapter for smallRNA-seq.
    Uses cutadapt, but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #8
def trim_srna_sample(data):
    """
    Remove the 3' adapter for smallRNA-seq.
    Uses atropos, but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" %
                 dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt',
                                              {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file."
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(
                        out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #9
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        file_key = data["combine"].keys()[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        in_files = sorted(list(set([data[file_key]] + extras)))
        out_file = data["combine"][file_key]["out"]
        for ext in ["-disc", "-sr", ""]:
            if ext:
                cur_in_files = list(filter(os.path.exists, (utils.append_stem(f, ext) for f in in_files)))
                cur_out_file = utils.append_stem(out_file, ext) if len(cur_in_files) > 0 else None
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                merged_file = merge_bam_files(cur_in_files, os.path.dirname(cur_out_file), config,
                                              out_file=cur_out_file)
        data.pop("region", None)
        data.pop("combine", None)
        data[file_key] = merged_file
    return [[data]]
Example #10
def trim_srna_sample(data):
    """
    Remove the 3' adapter for smallRNA-seq.
    Uses cutadapt, but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #11
def trim_srna_sample(data):
    """
    Remove the 3' adapter for smallRNA-seq.
    Uses atropos, but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" %(options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #12
def _calculate_percentiles(in_file, sample, data=None, cutoffs=None):
    """
    Parse the percentage of bases covered per region and summarize it at
    seven region percentile points, reporting the percentage of bases covered
    above each completeness cutoff (5, 10, 20, 50, ...)
    """
    has_data = False
    with open(in_file) as in_handle:
        for i, line in enumerate(in_handle):
            if i > 0:
                has_data = True
                break
    if not has_data:
        return []
    out_file = append_stem(in_file, "_summary")
    out_total_file = append_stem(in_file, "_total_summary")
    if not utils.file_exists(out_file) or not utils.file_exists(
            out_total_file):
        dt = pd.read_csv(in_file, sep="\t", index_col=False)
        pct = dict()
        pct_bases = dict()
        size = np.array(dt["chromEnd"]) - np.array(dt["chromStart"])
        for cutoff in [h for h in list(dt) if h.startswith("percentage")]:
            if cutoffs and int(cutoff.split("percentage")[1]) in cutoffs:
                a = np.array(dt[cutoff])
                for p_point in [0.01, 10, 25, 50, 75, 90, 99.9]:
                    q = np.percentile(a, p_point)
                    pct[(cutoff, p_point)] = q
                pct_bases[cutoff] = sum(size * a) / float(sum(size))

        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >> out_handle, "cutoff_reads\tbases_pct\tsample"
                for k in pct_bases:
                    print >> out_handle, "\t".join(
                        map(str, [k, pct_bases[k], sample]))
        with file_transaction(data, out_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >> out_handle, "cutoff_reads\tregion_pct\tbases_pct\tsample"
                for k in pct:
                    print >> out_handle, "\t".join(
                        map(str, [k[0], k[1], pct[k], sample]))
    # To move metrics to multiqc, the older files will be removed once
    # bcbreport accepts the new ones, to avoid errors while porting
    # everything to multiqc. These files will be copied to final.
    out_file_fixed = os.path.join(os.path.dirname(out_file),
                                  "%s_bcbio_coverage.txt" % sample)
    out_total_fixed = os.path.join(os.path.dirname(out_file),
                                   "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_file, out_file_fixed)
    copy_plus(out_total_file, out_total_fixed)
    return [out_file_fixed, out_total_fixed]
Example #13
def cpg_het_pairs(cpgvcf, snpvcf, bam_file, out_file, workdir):
    """
    Detect heterozygous SNPs close to hemi-methylated sites.
    """
    out_vcf = splitext_plus(out_file)[0] + ".vcf"
    cpg_filter = op.join(workdir, op.basename(append_stem(cpgvcf, "_filtered")))
    snp_filter = op.join(workdir, op.basename(append_stem(snpvcf, "_filtered")))

    if not file_exists(cpg_filter):
        with open(cpg_filter, 'w') as out_handle:
            with open(cpgvcf) as in_handle:
                for line in in_handle:
                    if line.startswith("#"):
                        continue
                    record = line.strip().split("\t")
                    # print record
                    header, frmt = record[8], record[9]
                    frmt = dict(zip(header.split(":"), frmt.split(':')))
                    if is_good_cpg(frmt, record):
                        print >>out_handle, line
    if not file_exists(snp_filter):
        with open(snp_filter, 'w') as out_handle:
            with open(snpvcf) as in_handle:
                for line in in_handle:
                    if line.startswith("#"):
                        continue
                    record = line.strip().split("\t")
                    header, frmt = record[8], record[9]
                    frmt = dict(zip(header.split(":"), frmt.split(':')))
                    if is_good_het(frmt, record):
                        print >>out_handle, line

    if not file_exists(out_vcf):
        res = pybedtools.BedTool(cpg_filter).window(snp_filter, w=75)
        with open(out_file, 'w') as out_handle, open(out_vcf, 'w') as vcf_handle:
            _create_vcf_header(cpgvcf, vcf_handle)
            print >>out_handle, "chrom\tCpG_pos\tCpG_nt\tSNP_pos\tAlleles\tassociation_plus\tSNP_reads_minus"
            for record in res:
                if record[1] != record[11]:
                    # if record[1] == "19889634":
                    link, link_as, align = _make_linkage(bam_file, record[0], int(record[1]), int(record[11]), _get_strand(record))
                    res = "%s\t%s\t%s\t%s\t%s/%s\t%s\t%s" % (record[0], record[1], record[3], record[11], record[13], record[14], _format(link), _format(link_as))
                    chrom, pos, ref, alt, qual, filt, info, frmt, sample = _get_vcf_line(record)
                    # print res
                    if _valid_test(link, link_as):
                        filt = "PASS"
                        print >>out_handle, res
                        # print res
                        # print >>out_handle, '\n'.join(align)

                    vcf_res = "{chrom}\t{pos}\t.\t{ref}\t{alt}\t{qual}\t{filt}\t{info}\t{frmt}\t{sample}".format(**locals())
                    print >>vcf_handle, vcf_res
    return _correct_vcf(out_vcf)
Example #14
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        file_key = data["combine"].keys()[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        if file_key in data:
            extras.append(data[file_key])
        in_files = sorted(list(set(extras)))
        out_file = tz.get_in(["combine", file_key, "out"], data,
                             _merge_out_from_infiles(in_files))
        sup_exts = data.get(file_key + "-plus", {}).keys()
        for ext in sup_exts + [""]:
            merged_file = None
            if os.path.exists(utils.append_stem(out_file, "-" + ext)):
                cur_out_file, cur_in_files = out_file, []
            if ext:
                cur_in_files = list(
                    filter(os.path.exists, (utils.append_stem(f, "-" + ext)
                                            for f in in_files)))
                cur_out_file = utils.append_stem(
                    out_file, "-" + ext) if len(cur_in_files) > 0 else None
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                if len(cur_in_files) > 0:
                    merged_file = merge_bam_files(
                        cur_in_files,
                        os.path.dirname(cur_out_file),
                        config,
                        out_file=cur_out_file)
                else:
                    assert os.path.exists(cur_out_file)
                    merged_file = cur_out_file
            if merged_file:
                if ext:
                    data[file_key + "-plus"][ext] = merged_file
                else:
                    data[file_key] = merged_file
        data.pop("region", None)
        data.pop("combine", None)
    return [[data]]
Example #15
def _calculate_percentiles(in_file, sample, data=None, cutoffs=None):
    """
    Parse the percentage of bases covered per region and summarize it at
    seven region percentile points, reporting the percentage of bases covered
    above each completeness cutoff (5, 10, 20, 50, ...)
    """
    has_data = False
    with open(in_file) as in_handle:
        for i, line in enumerate(in_handle):
            if i > 0:
                has_data = True
                break
    if not has_data:
        return []
    out_file = append_stem(in_file, "_summary")
    out_total_file = append_stem(in_file, "_total_summary")
    if not utils.file_exists(out_file) or not utils.file_exists(out_total_file):
        dt = pd.read_csv(in_file, sep="\t", index_col=False)
        pct = dict()
        pct_bases = dict()
        size = np.array(dt["chromEnd"]) - np.array(dt["chromStart"])
        for cutoff in [h for h in list(dt) if h.startswith("percentage")]:
            if cutoffs and int(cutoff.split("percentage")[1]) in cutoffs:
                a = np.array(dt[cutoff])
                for p_point in [0.01, 10, 25, 50, 75, 90, 99.9]:
                    q = np.percentile(a, p_point)
                    pct[(cutoff, p_point)] = q
                pct_bases[cutoff] = sum(size * a) / float(sum(size))

        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >>out_handle, "cutoff_reads\tbases_pct\tsample"
                for k in pct_bases:
                    print >>out_handle, "\t".join(map(str, [k, pct_bases[k], sample]))
        with file_transaction(data, out_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >>out_handle, "cutoff_reads\tregion_pct\tbases_pct\tsample"
                for k in pct:
                    print >>out_handle, "\t".join(map(str, [k[0], k[1], pct[k], sample]))
    # To move metrics to multiqc, the older files will be removed once
    # bcbreport accepts the new ones, to avoid errors while porting
    # everything to multiqc. These files will be copied to final.
    out_file_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage.txt" % sample)
    out_total_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_file, out_file_fixed)
    copy_plus(out_total_file, out_total_fixed)
    return [out_file_fixed, out_total_fixed]
Example #16
def _collapse(in_file):
    out_file = append_stem(in_file, ".trimming").replace(".gz", "")
    if file_exists(out_file):
        return out_file
    seqs = collapse(in_file)
    write_output(out_file, seqs)
    return out_file
Example #17
def cpg_postprocessing(data):
    mC = data["cpg_file"]
    if not "control" in data:
        return [[data]]
    hmC = data["control"]
    out_file = append_stem(mC, "_hmC")
    pos = 0
    pos_hmC = 0
    data["hmc_file"] = out_file
    if file_exists(out_file):
        return [[data]]
    logger.debug("processing %s versus %s" % (mC, hmC))
    with file_transaction(out_file) as out_tx:
        with open(out_tx, "w") as out_handle:
            with open(mC) as mC_h:
                with open(hmC) as hmC_h:
                    for line in mC_h:
                        cols = line.strip().split("\t")
                        if cols[3] != "CG":
                            continue
                        pos = int(cols[1])
                        counts = [int(float(cols[5])), int(cols[6])]
                        if pos < pos_hmC:
                            continue
                        elif pos > pos_hmC:
                            hmC_h, hmC = _sync_pos(hmC_h, pos)
                            if not hmC_h:
                                break
                            pos_hmC = hmC["pos"]
                        if counts[0] < 9 or hmC["counts"][0] < 9:
                            continue
                        if pos == hmC["pos"]:
                            pvalue = _call_hmc(counts, hmC["counts"])
                            print >>out_handle, "%s\t%s\t%s" % (line.strip(), "\t".join(hmC["info"]), pvalue)
    return [[data]]
Example #18
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    align_dir = utils.safe_makedir(apply(os.path.join, align_dir_parts))
    aligner_indexes = os.path.commonprefix(tz.get_in(("reference", aligner, "indexes"), data))
    if aligner_indexes.endswith("."):
        aligner_indexes = aligner_indexes[:-1]
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_indexes, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_indexes, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Example #19
def align(pair):
    import os
    from bcbio.utils import file_exists, replace_suffix, append_stem, safe_makedir
    import subprocess
    safe_makedir("align")
    genome = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/bowtie2/GRCh37"
    out_sam = os.path.join("align", os.path.basename(replace_suffix(pair[0], ".sam")))
    out_bam = replace_suffix(out_sam, ".bam")
    sorted = append_stem(out_bam, "_sorted")
    sorted_prefix = os.path.splitext(sorted)[0]
    out_index = replace_suffix(sorted, ".bai")
    if not file_exists(out_sam):
        if len(pair) == 2:
            fq1, fq2 = pair
            cmd = "bowtie2 -S {out_sam} {genome} -1 {fq1} -2 {fq2}"
        else:
            fq1 = pair[0]
            cmd = "bowtie2 -S {out_sam} {genome} {fq1}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if not file_exists(out_bam):
        cmd = "samtools view -S {out_sam} -b -o {out_bam}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if not file_exists(sorted):
        cmd = "samtools sort {out_bam} {sorted_prefix}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if not file_exists(out_index):
        cmd = "samtools index {sorted}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    return sorted
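For context, this standalone align helper takes a one- or two-element list of FASTQ paths and shells out to bowtie2 and samtools, returning the sorted BAM path. A hypothetical invocation (the FASTQ names are placeholders, and bowtie2, samtools and the hard-coded GRCh37 index must be available):

# Hypothetical calls; the FASTQ paths below are placeholders.
sorted_bam_se = align(["sample.fastq"])                        # single-end
sorted_bam_pe = align(["sample_R1.fastq", "sample_R2.fastq"])  # paired-end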
Example #20
def trim_read_through(fastq_files, dirs, lane_config):
    """
    for small insert sizes, the read length can be longer than the insert
    resulting in the reverse complement of the 3' adapter being sequenced.
    this takes adapter sequences and trims only the reverse complement
    of the adapter

    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)

    """
    quality_format = _get_quality_format(lane_config)
    to_trim = _get_sequences_to_trim(lane_config, SUPPORTED_ADAPTERS)
    out_files = _get_read_through_trimmed_outfiles(fastq_files, dirs)
    fixed_files = append_stem(out_files, ".fixed")
    if all(map(file_exists, fixed_files)):
        return fixed_files
    logger.info("Trimming %s from the 3' end of reads in %s using "
                "cutadapt." % (", ".join(to_trim),
                               ", ".join(fastq_files)))
    cores = lane_config["algorithm"].get("num_cores", 1)
    out_files = _cutadapt_trim(fastq_files, quality_format,
                               to_trim, out_files, cores)

    fixed_files = remove_short_reads(out_files, dirs, lane_config)
    return fixed_files
Example #21
def group_batches(xs):
    """Group samples into batches for simultaneous variant calling.

    Identify all samples to call together: those in the same batch,
    variant caller and genomic region.
    Pull together all BAM files from this batch and process them together,
    providing details to pull these finalized files back into individual
    expected files.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for data, region, out_fname in xs:
        batch = data.get("metadata", {}).get("batch")
        caller = data["config"]["algorithm"].get("variantcaller", "gatk")
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, tuple(region), caller)].append((data, out_fname))
        else:
            singles.append((data, tuple(region), out_fname))
    batches = []
    remap_batches = {}
    for (batch, region, _), xs in batch_groups.iteritems():
        cur_data, cur_fname = xs[0]
        batch_fname = utils.append_stem(cur_fname, "-" + str(batch))
        batch_data = copy.deepcopy(cur_data)
        batch_data["work_bam"] = [x[0]["work_bam"] for x in xs]
        batch_data["work_items"] = [x[0] for x in xs]
        batch_data["group"] = batch_fname
        batches.append((batch_data, region, batch_fname))
        remap_batches[batch_fname] = xs
    return singles + batches, remap_batches
Example #22
def align(pair):
    import os
    from bcbio.utils import file_exists, replace_suffix, append_stem, safe_makedir
    import subprocess
    safe_makedir("align")
    genome = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/bowtie2/GRCh37"
    out_sam = os.path.join("align",
                           os.path.basename(replace_suffix(pair[0], ".sam")))
    out_bam = replace_suffix(out_sam, ".bam")
    sorted = append_stem(out_bam, "_sorted")
    sorted_prefix = os.path.splitext(sorted)[0]
    out_index = replace_suffix(sorted, ".bai")
    if not file_exists(out_sam):
        if len(pair) == 2:
            fq1, fq2 = pair
            cmd = "bowtie2 -S {out_sam} {genome} -1 {fq1} -2 {fq2}"
        else:
            fq1 = pair[0]
            cmd = "bowtie2 -S {out_sam} {genome} {fq1}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if not file_exists(out_bam):
        cmd = "samtools view -S {out_sam} -b -o {out_bam}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if not file_exists(sorted):
        cmd = "samtools sort {out_bam} {sorted_prefix}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    if not file_exists(out_index):
        cmd = "samtools index {sorted}"
        subprocess.check_call(cmd.format(**locals()), shell=True)
    return sorted
Example #23
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(apply(os.path.join, align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file, names,
                               align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index,
                                 ref_file, names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query sorted inputs
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Example #24
def _filter_by_support(orig_file, index):
    """Filter call file based on supporting evidence, adding pass/filter annotations to BEDPE.

    Filters based on the following criteria:
      - Multiple forms of evidence in any sample (split and paired end)
      - Minimum read support for the call.
    """
    min_read_count = 4
    out_file = utils.append_stem(orig_file, "-filter")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with open(orig_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for parts in (l.rstrip().split("\t") for l in in_handle):
                        support = _get_support(parts)
                        evidence = set(reduce(operator.add, [x.keys() for x in support.values()]))
                        read_count = reduce(operator.add, support[index].values())
                        if len(evidence) < 2:
                            lfilter = "ApproachSupport"
                        elif read_count < min_read_count:
                            lfilter = "ReadCountSupport"
                        else:
                            lfilter = "PASS"
                        parts.append(lfilter)
                        out_handle.write("\t".join(parts) + "\n")
    return out_file
Example #25
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1],
                                               "lumpy"))
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    pebed_file, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    sample_config_file = _write_samples_to_ids(pebed_file, items)
    lumpy_vcf = _bedpe_to_vcf(pebed_file, sample_config_file, items)
    for i, data in enumerate(items):
        if "sv" not in data:
            data["sv"] = []
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_bedpe = _filter_by_support(_subset_to_sample(pebed_file, i, data), i, data)
        if lumpy_vcf:
            sample_vcf = utils.append_stem(lumpy_vcf, "-%s" % sample)
            sample_vcf = _filter_by_bedpe(vcfutils.select_sample(lumpy_vcf, sample, sample_vcf, data["config"]),
                                          sample_bedpe, data)
        else:
            sample_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": sample_vcf,
                           "exclude_file": exclude_file,
                           "bedpe_file": sample_bedpe,
                           "sample_bed": sample_config_file})
        out.append(data)
    return out
Example #26
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    align_dir = utils.safe_makedir(apply(os.path.join, align_dir_parts))
    aligner_indexes = os.path.commonprefix(
        tz.get_in(("reference", aligner, "indexes"), data))
    if aligner_indexes.endswith("."):
        aligner_indexes = aligner_indexes[:-1]
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_indexes, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_indexes,
                                 ref_file, names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if not data.get("align_split"):
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Example #27
def group_batches(xs):
    """Group samples into batches for simultaneous variant calling.

    Identify all samples to call together: those in the same batch,
    variant caller and genomic region.
    Pull together all BAM files from this batch and process them together,
    providing details to pull these finalized files back into individual
    expected files.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for data, region, out_fname in xs:
        batch = data.get("metadata", {}).get("batch")
        caller = data["config"]["algorithm"].get("variantcaller", "gatk")
        if batch is not None:
            batch_groups[(batch, region, caller)].append((data, out_fname))
        else:
            singles.append((data, region, out_fname))
    batches = []
    remap_batches = {}
    for (batch, region, _), xs in batch_groups.iteritems():
        cur_data, cur_fname = xs[0]
        batch_fname = utils.append_stem(cur_fname, "-" + batch)
        batch_data = copy.deepcopy(cur_data)
        batch_data["work_bam"] = [x[0]["work_bam"] for x in xs]
        batch_data["group"] = batch_fname
        batches.append((batch_data, region, batch_fname))
        remap_batches[batch_fname] = xs
    return singles + batches, remap_batches
Example #28
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    for i, data in enumerate(items):
        if "sv" not in data:
            data["sv"] = []
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        filter_vcf = _filter_by_support(gt_vcf, data)
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": filter_vcf,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #29
def _calculate_percentiles(cov_file, dist_file, cutoffs, out_dir, data):
    """Calculate percentage over over specified cutoff range.

    XXX Does not calculate the per-bin coverage estimations which we had
    earlier with sambamba depth. Instead has a global metric of percent coverage
    which provides a more defined look at coverage changes by depth.
    """
    if not utils.file_exists(dist_file) or not utils.file_exists(cov_file):
        return []
    sample = dd.get_sample_name(data)
    out_total_file = append_stem(dist_file, "_total_summary")
    if not utils.file_exists(out_total_file):
        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                writer.writerow(["cutoff_reads", "bases_pct", "sample"])
                with open(dist_file) as in_handle:
                    for line in in_handle:
                        count, pct = line.strip().split()
                        count = int(count)
                        pct = "%.1f" % (float(pct) * 100.0)
                        if count >= min(cutoffs) and count <= max(cutoffs):
                            writer.writerow(
                                ["percentage%s" % count, pct, sample])
                    if min(cutoffs) < count:
                        writer.writerow(
                            ["percentage%s" % min(cutoffs), pct, sample])
    # To move metrics to multiqc, the older files will be removed once
    # bcbreport accepts the new ones, to avoid errors while porting
    # everything to multiqc. These files will be copied to final.
    out_total_fixed = os.path.join(os.path.dirname(out_total_file),
                                   "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_total_file, out_total_fixed)
    return [out_total_fixed]
Example #30
def _collapse(in_file):
    out_file = append_stem(in_file, ".trimming").replace(".gz", "")
    if file_exists(out_file):
        return out_file
    seqs = collapse(in_file)
    write_output(out_file, seqs)
    return out_file
Example #31
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(apply(os.path.join, align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query sorted inputs
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Example #32
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    out_file = utils.append_stem(data["vrn_file"], "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout"] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + dbnsfp_args + loftee_args
                cmd = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example #33
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    out_file = utils.append_stem(data["vrn_file"], "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                vep = config_utils.get_program("variant_effect_predictor.pl",
                                               data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = [
                    "Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                    "Feature", "EXON", "PolyPhen", "SIFT", "Protein_position",
                    "BIOTYPE", "CANONICAL", "CCDS"
                ]
                cmd = [
                    vep, "--vcf", "-o", "stdout", "--fork", "4", "--species",
                    ensembl_name, "--no_stats", "--cache", "--offline",
                    "--dir", vep_dir, "--sift", "b", "--polyphen", "b",
                    "--symbol", "--numbers", "--biotype", "--total_length",
                    "--canonical", "--ccds", "--fields",
                    ",".join(std_fields + dbnsfp_fields + loftee_fields)
                ] + dbnsfp_args + loftee_args
                cmd = "gunzip -c %s | %s | bgzip -c > %s" % (
                    data["vrn_file"], " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example #34
def _filter_by_support(orig_file, index):
    """Filter call file based on supporting evidence, adding pass/filter annotations to BEDPE.

    Filters based on the following criteria:
      - Minimum read support for the call.
    Other filters not currently applied due to being too restrictive:
      - Multiple forms of evidence in any sample (split and paired end)
    """
    min_read_count = 4
    out_file = utils.append_stem(orig_file, "-filter")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with open(orig_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for parts in (l.rstrip().split("\t") for l in in_handle):
                        support = _get_support(parts)
                        #evidence = set(reduce(operator.add, [x.keys() for x in support.values()]))
                        read_count = reduce(operator.add, support[index].values())
                        if read_count < min_read_count:
                            lfilter = "ReadCountSupport"
                        #elif len(evidence) < 2:
                        #    lfilter = "ApproachSupport"
                        else:
                            lfilter = "PASS"
                        parts.append(lfilter)
                        out_handle.write("\t".join(parts) + "\n")
    return out_file
Example #35
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
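A small Python illustration of what the sed '/^#/! s/;;/;/g' step in the command above does: on non-header VCF lines it collapses empty (';;') INFO sub-fields, which can otherwise break downstream parsers. The sample line is made up for demonstration.

vcf_lines = ["##fileformat=VCFv4.2",
             "1\t1000\t.\tA\tG\t50\tPASS\tCSQ=x;;DP=10"]
# Header lines are left untouched; others get the double separator collapsed.
cleaned = [l if l.startswith("#") else l.replace(";;", ";") for l in vcf_lines]
print(cleaned[1].split("\t")[-1])  # CSQ=x;DP=10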
Example #36
0
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.
    """
    if data.get("combine"):
        assert len(data["combine"]) == 1
        file_key = list(data["combine"].keys())[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        if file_key in data:
            extras.append(data[file_key])
        in_files = sorted(list(set(extras)))
        out_file = tz.get_in(["combine", file_key, "out"], data, _merge_out_from_infiles(in_files))
        sup_exts = list(data.get(file_key + "-plus", {}).keys())
        for ext in sup_exts + [""]:
            merged_file = None
            if os.path.exists(utils.append_stem(out_file, "-" + ext)):
                cur_out_file, cur_in_files = out_file, []
            if ext:
                cur_in_files = list(filter(os.path.exists, (utils.append_stem(f, "-" + ext) for f in in_files)))
                cur_out_file = utils.append_stem(out_file, "-" + ext) if len(cur_in_files) > 0 else None
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                if len(cur_in_files) > 0:
                    merged_file = merge_bam_files(
                        cur_in_files, os.path.dirname(cur_out_file), config, out_file=cur_out_file
                    )
                else:
                    assert os.path.exists(cur_out_file)
                    merged_file = cur_out_file
            if merged_file:
                if ext:
                    data[file_key + "-plus"][ext] = merged_file
                else:
                    data[file_key] = merged_file
        data.pop("region", None)
        data.pop("combine", None)
    return [[data]]
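The _merge_out_from_infiles fallback used above is not shown. A hypothetical sketch of what such a helper could look like, deriving a merged output name from the shared prefix of the inputs; bcbio's real implementation may differ.

import os

def _merge_out_from_infiles(in_files):
    # Hypothetical: name the merged BAM after the common prefix of the inputs,
    # placing it next to the first input file.
    base = os.path.commonprefix([os.path.basename(f) for f in in_files]).rstrip("-_.")
    return os.path.join(os.path.dirname(in_files[0]), "%s-merged.bam" % (base or "merged"))

print(_merge_out_from_infiles(["/work/s1-0.bam", "/work/s1-1.bam"]))
# /work/s1-merged.bam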
Example #37
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip-compressed, faidx-indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])
Example #38
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(
            config_utils.get_resources("cutadapt",
                                       data['config']).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(
                        out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = tx_out_file + ".tmp.fastq"
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "Run cutadapt with options %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
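The _cmd_cutadapt() template is referenced but not shown in this listing. A hedged sketch of what such a template could look like, using standard cutadapt options and placeholder names that match the locals in the caller above ({cutadapt}, {adapter}, {in_file}, {tx_out_file}, {out_noadapter_file}, {out_short_file}, {log_out}); the exact flags bcbio uses may differ.

def _cmd_cutadapt():
    # Sketch only: 3' adapter removal for small RNA reads, writing reads with
    # no adapter and reads that become too short to separate files.
    cmd = ("{cutadapt} --adapter={adapter} --minimum-length=17 "
           "--untrimmed-output={out_noadapter_file} "
           "--too-short-output={out_short_file} "
           "-o {tx_out_file} {in_file} > {log_out}")
    return cmd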
Example #39
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
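_split_breakends is not defined in this listing. As a rough illustration only: it separates BND (breakend) records from the remaining calls so that svtyper genotypes only the non-BND records before the two sets are concatenated again. A toy version over in-memory VCF lines, assuming BND calls can be spotted by SVTYPE=BND in the INFO field:

def _split_breakends_toy(vcf_lines):
    # Header lines go to both outputs; BND records to one, everything else to the other.
    std, bnd = [], []
    for line in vcf_lines:
        if line.startswith("#"):
            std.append(line)
            bnd.append(line)
        elif "SVTYPE=BND" in line:
            bnd.append(line)
        else:
            std.append(line)
    return std, bnd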
Example #40
0
def _change_sample_name(in_file):
    """Fix the sample name in the featureCounts log file so it matches
       the name used in the MultiQC report.
    """
    out_file = append_stem(in_file, "_fixed")
    with file_transaction(out_file) as tx_out:
        with open(tx_out, "w") as out_handle:
            out_handle.write(open(in_file).read().replace(".nsorted.primary", ""))
    return out_file
Example #41
0
def _prepare_bam(bam_file, bed_file, config):
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
    samtools = config_utils.get_program("samtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{samtools} view -bh -L {bed_file} {bam_file} > {tx_out}"
            do.run(cmd.format(**locals()), "Clean %s" % bam_file)
    return out_file
Example #42
0
def _collapse(in_file):
    """
    Collapse reads into unique sequences with seqcluster
    """
    out_file = append_stem(in_file, ".trimming").replace(".gz", "")
    if file_exists(out_file):
        return out_file
    seqs = collapse(in_file)
    write_output(out_file, seqs, minimum=1, size=16)
    return out_file
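All of these examples hinge on append_stem() inserting a suffix just before the file extension. A minimal sketch of that behavior, including the .gz double-extension handling the calls above rely on; the real helper in bcbio.utils may differ in details.

import os

def append_stem_sketch(in_file, word):
    # Insert `word` before the extension, treating ".fastq.gz" style
    # double extensions as a single unit.
    fname, ext = os.path.splitext(in_file)
    if ext == ".gz":
        fname, ext2 = os.path.splitext(fname)
        ext = ext2 + ext
    return "%s%s%s" % (fname, word, ext)

print(append_stem_sketch("sample.fastq.gz", ".trimming"))  # sample.trimming.fastq.gz
print(append_stem_sketch("sample.fastq.gz", ".trimming").replace(".gz", ""))
# sample.trimming.fastq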
Example #43
0
def _process_bam(bam_file, in_fastq, sample, reference, config):
    broad_runner = broad.runner_from_config(config)
    names = {'rg': in_fastq, 'library': 'WGBS_LIB', 'pl': 'Illumina',
             'pu': 'R1', 'sm': in_fastq, 'sample': sample}
    out_fix_bam = broad_runner.run_fn("picard_fix_rgs", bam_file, names)
    order_bam = utils.append_stem(out_fix_bam, "_order")
    broad_runner.run_fn("picard_reorder", out_fix_bam, reference, order_bam)
    bam.index(order_bam, config)
    # order_bam = _set_quality(order_bam)
    # bam.index(order_bam, config)
    return order_bam
Example #44
0
def _prepare_bam(bam_file, bed_file, config):
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
    samtools = config_utils.get_program("samtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{samtools} view -bh -L {bed_file} {bam_file} > {tx_out}"
            do.run(cmd.format(**locals()), "Clean %s" % bam_file)
    return out_file
Example #45
0
def _prepare_file(fn, out_dir):
    """Cut the beginning of the reads to avoid detection of miRNAs"""
    atropos = _get_atropos()
    cmd = "{atropos} trim --max-reads 500000 -u 22 -se {fn} -o {tx_file}"
    out_file = os.path.join(out_dir, append_stem(os.path.basename(fn), "end"))
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_file:
        do.run(cmd.format(**locals()))
    return out_file
Example #46
0
def _collapse(in_file):
    """
    Collpase reads into unique sequences with seqcluster
    """
    out_file = append_stem(in_file, ".trimming").replace(".gz", "")
    if file_exists(out_file):
        return out_file
    seqs = collapse(in_file)
    write_output(out_file, seqs, minimum=1, size=16)
    return out_file
Example #47
0
def trim_srna_sample(data):
    adapter = dd.get_adapters(data)[0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    in_file = data["files"][0]
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
    out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
    cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
    cmd = _cmd_cutadapt()
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(**locals()), "remove adapter")
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #48
0
def _prepare_file(fn, out_dir):
    """Cut the beginning of the reads to avoid detection of miRNAs"""
    atropos = _get_atropos()
    cmd = "{atropos} trim --max-reads 500000 -u 22 -se {fn} -o {tx_file}"
    out_file = os.path.join(out_dir, append_stem(os.path.basename(fn), "end"))
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_file:
        do.run(cmd.format(**locals()))
    return out_file
Example #49
0
def _change_sample_name(in_file):
    """Fix the sample name in the featureCounts log file so it matches
       the name used in the MultiQC report.
    """
    out_file = append_stem(in_file, "_fixed")
    with file_transaction(out_file) as tx_out:
        with open(tx_out, "w") as out_handle:
            out_handle.write(open(in_file).read().replace(
                ".nsorted.primary", ""))
    return out_file
Example #50
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(
            utils.get_in(data, ("config", "algorithm",
                                "aligner")) in ["bwa", False, None]
            for data in items):
        raise ValueError(
            "Require bwa-mem alignment input for lumpy structural variation detection"
        )
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(
            data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(
            lumpy_vcf, sample, utils.append_stem(lumpy_vcf, "-%s" % sample),
            data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file,
                                  data)
        gt_vcf = vcfutils.combine_variant_files(
            orig_files=[std_gt_vcf, bnd_vcf],
            out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
            ref_file=dd.get_ref_file(data),
            config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name],
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({
            "variantcaller": "lumpy",
            "vrn_file": effects_vcf or vcf_file,
            "exclude_file": exclude_file
        })
        out.append(data)
    return out
Example #51
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #52
0
def trim_srna_sample(data):
    adapter = dd.get_adapters(data)[0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    in_file = data["files"][0]
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"),
                                           out_dir)
    out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
    cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
    cmd = _cmd_cutadapt()
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(**locals()), "remove adapter")
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #53
0
def _prepare_bam(bam_file, bed_file, config):
    """Remove regions from bed files"""
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
    bedtools = config_utils.get_program("bedtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{bedtools} subtract -nonamecheck -A -a {bam_file} -b {bed_file} > {tx_out}"
            do.run(cmd.format(**locals()), "Clean %s" % bam_file)
    return out_file
Example #54
0
def _correct_vcf(vcf_file):
    """
    Sort by chromosome/position, bgzip compress and tabix index.
    """
    vcf_sort = append_stem(vcf_file, "_sort") + ".gz"
    if not file_exists(vcf_sort):
        with file_transaction(vcf_sort) as tx_out:
            cmd = "cat {vcf_file} |vcf-sort | bgzip  > {tx_out}"
            do.run(cmd.format(**locals()), "sort %s" % vcf_file)
            do.run("tabix -f {0}".format(tx_out), "")
    return vcf_sort
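A hypothetical usage sketch, assuming vcf-sort (vcftools), bgzip and tabix are on PATH as the command requires; the file name is made up for illustration.

# sorted_vcf = _correct_vcf("sample-effects.vcf")
# -> writes sample-effects_sort.vcf.gz (bgzip compressed) plus a .tbi index,
#    since append_stem("sample-effects.vcf", "_sort") + ".gz" builds the name.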
Example #55
0
def test_2_vcf_exclusion(self):
    """Exclude samples from VCF files.
    """
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    out_file = utils.append_stem(self.combo_file, "-exclude")
    to_exclude = ["S1"]
    if os.path.exists(out_file):
        os.remove(out_file)
    vcfutils.exclude_samples(self.combo_file, out_file, to_exclude, ref_file, config)
Example #56
0
def mark_duplicates(sam_file):
    import subprocess
    from bcbio.utils import file_exists, replace_suffix, append_stem
    fm = "/n/HSPH/local/share/java/picard/FixMateInformation.jar"
    md = "/n/HSPH/local/share/java/picard/MarkDuplicates.jar"
    jvm_opts = "-Xms750m -Xmx2000m"
    mate_fixed_file = append_stem(sam_file, "_matefixed")
    if not file_exists(mate_fixed_file):
        cmd = ("java {jvm_opts} -jar {fm} INPUT={sam_file} "
               "OUTPUT={mate_fixed_file}")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    sam_file = mate_fixed_file
    out_file = append_stem(sam_file, "_dupemarked")
    stats_file = replace_suffix(append_stem(sam_file, "_stats"), ".txt")
    if not file_exists(out_file):
        cmd = ("java {jvm_opts} -jar {md} INPUT={sam_file} "
               "OUTPUT={out_file} METRICS_FILE={stats_file} "
               "VALIDATION_STRINGENCY=LENIENT")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
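A hypothetical call, assuming the hard-coded Picard jar paths above exist on the host; it illustrates how the append_stem/replace_suffix naming composes. The input file name is made up.

# dedup_sam = mark_duplicates("sample_sorted.sam")
# -> sample_sorted_matefixed.sam            (FixMateInformation output)
# -> sample_sorted_matefixed_dupemarked.sam (returned, duplicates marked)
# -> sample_sorted_matefixed_stats.txt      (MarkDuplicates metrics)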
Example #57
0
def _prepare_bam(bam_file, bed_file, config):
    """Remove regions from bed files"""
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
    bedtools = config_utils.get_program("bedtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{bedtools} subtract -nonamecheck -A -a {bam_file} -b {bed_file} > {tx_out}"
            do.run(cmd.format(**locals()), "Remove blacklist regions from %s" % bam_file)
    return out_file