Ejemplo n.º 1
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Reads the first entry of ``data["files"]`` and targets a ``.clean`` file
    under ``<work dir>/trimmed/<sample name>``. When the ``trim_reads``
    algorithm option is false, the input is handed to ``symlink_plus`` instead
    of being trimmed.

    Stores the output path in ``data["clean_fastq"]`` and the results of
    ``_collapse`` / ``_summary`` in ``data["collapse"]`` / ``data["size_stats"]``.
    Returns ``[[data]]``.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    # Trimming is on unless explicitly disabled in the algorithm configuration.
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        # Only the first configured adapter is used; raises IndexError when
        # the adapter list is empty.
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        # Expect the cutadapt executable next to the running Python interpreter.
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        # NOTE: the template returned here is filled from locals() below, so
        # the local variable names in this function are part of its contract
        # with _cmd_cutadapt(); do not rename or remove them without checking
        # the template.
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Ejemplo n.º 2
0
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read ("-sr") and discordant ("-disc")
    files if present.

    ``data["combine"]`` is expected to hold exactly one key naming the entry
    of ``data`` to merge; its value provides the target ``out`` path and an
    optional (possibly nested) ``extras`` list of additional inputs. After
    merging, ``region`` and ``combine`` are removed from ``data`` and the
    merged file is stored under the combined key.

    Returns ``[[data]]``; ``data`` is returned untouched when there is nothing
    to combine.
    """
    if data.get("combine"):
        assert len(data["combine"]) == 1
        # next(iter(...)) works for both list-returning (py2) and
        # view-returning (py3) dictionaries, unlike .keys()[0].
        file_key = next(iter(data["combine"]))
        combine_info = data["combine"][file_key]
        extras = []
        for x in combine_info.get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        in_files = sorted(set([data[file_key]] + extras))
        out_file = combine_info["out"]
        # The main file ("") is merged last so merged_file ends up pointing at it.
        for ext in ["-disc", "-sr", ""]:
            if ext:
                candidates = (utils.append_stem(f, ext) for f in in_files)
                cur_in_files = [f for f in candidates if os.path.exists(f)]
                # Only merge supplementary files that actually exist. The
                # previous check looked at in_files, which is never empty, and
                # so requested a merge of zero inputs.
                cur_out_file = utils.append_stem(out_file, ext) if cur_in_files else None
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                # Fresh copy per merge so the caller's configuration is not modified.
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                merged_file = merge_bam_files(cur_in_files, os.path.dirname(cur_out_file), config,
                                              out_file=cur_out_file)
        data.pop("region", None)
        data.pop("combine", None)
        data[file_key] = merged_file
    return [[data]]
Ejemplo n.º 3
0
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files, log_file, config):
    """Trimming with cutadapt, using version installed with bcbio-nextgen.

    Uses the system executable to find the version next to our Anaconda Python.
    TODO: Could we use cutadapt as a library to avoid this?

    Returns ``out_files``. Nothing is run when every entry of ``out_files``
    already exists. A single entry in ``fastq_files`` selects single end mode;
    anything else is treated as a pair (the transaction is unpacked into two
    outputs plus the log).
    """
    if all([file_exists(x) for x in out_files]):
        return out_files
    # NOTE: the command template is filled from locals() in both branches, so
    # names such as of1_tx, of2_tx, log_tx, tmp_fq1, tmp_fq2 and singles_file
    # may be referenced by _cutadapt_trim_cmd's template even though they look
    # unused here -- check the template before renaming or removing them.
    cmd = _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files)
    if len(fastq_files) == 1:
        # Single end: one trimmed output plus the log file.
        of = [out_files[0], log_file]
        message = "Trimming %s in single end mode with cutadapt." % (fastq_files[0])
        with file_transaction(config, of) as of_tx:
            of1_tx, log_tx = of_tx
            do.run(cmd.format(**locals()), message)
    else:
        # Paired end: both trimmed outputs plus the log file.
        of = out_files + [log_file]
        with file_transaction(config, of) as tx_out_files:
            of1_tx, of2_tx, log_tx = tx_out_files
            # Intermediate file names derived from the transactional outputs.
            tmp_fq1 = append_stem(of1_tx, ".tmp")
            tmp_fq2 = append_stem(of2_tx, ".tmp")
            singles_file = of1_tx + ".single"
            message = "Trimming %s and %s in paired end mode with cutadapt." % (fastq_files[0],
                                                                                fastq_files[1])
            do.run(cmd.format(**locals()), message)
    return out_files
Ejemplo n.º 4
0
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    removes reads from a pair of fastq files that are shorter than
    a minimum length. removes both ends of a read if one end falls
    below the threshold while maintaining the order of the reads

    Pairs where both reads have at least ``min_length`` bases are written to
    the ``.fixed`` outputs. When only one read of a pair is long enough it is
    written to that end's ``.singles`` file.

    Returns ``[fq1_out, fq2_out]``, the paths of the two ``.fixed`` files.
    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    # Reuse earlier results only when all four outputs are present (the
    # previous check tested fq2_single twice and never fq1_single).
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, \
            open(fq2_out, 'w') as fq2_out_handle, \
            open(fq1_single, 'w') as fq1_single_handle, \
            open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            # Same inclusive threshold for paired and single output, so a read
            # of exactly min_length bases is never silently discarded.
            fq1_long_enough = len(fq1_record.seq) >= min_length
            fq2_long_enough = len(fq2_record.seq) >= min_length
            if fq1_long_enough and fq2_long_enough:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            elif fq1_long_enough:
                fq1_single_handle.write(fq1_record.format(quality_format))
            elif fq2_long_enough:
                fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
Ejemplo n.º 5
0
def _make_isomir_counts(data, srna_type="seqbuster", out_dir=None, stem=""):
    """
    Build count matrices from the per-sample miraligner files.

    ``data`` is a sequence of samples whose first element is the sample
    dictionary; the miraligner file is read from the ``srna_type`` key.
    Output goes to ``out_dir`` (default: ``<work dir>/mirbase``) with ``stem``
    appended to the file names.

    Returns a two-item list of count files, or None when no sample produced
    any reads.
    """
    work_dir = dd.get_work_dir(data[0][0])
    if not out_dir:
        out_dir = op.join(work_dir, "mirbase")
    out_novel_isomir = append_stem(op.join(out_dir, "counts.tsv"), stem)
    out_novel_mirna = append_stem(op.join(out_dir, "counts_mirna.tsv"), stem)
    logger.debug("Create %s count data at %s." % (srna_type, out_dir))
    # NOTE(review): this cached path returns [mirna, isomir] while the fresh
    # path at the bottom returns [isomir, mirna] -- confirm which order
    # callers expect.
    if file_exists(out_novel_mirna):
        return [out_novel_mirna, out_novel_isomir]

    out_dts = []
    for sample in data:
        info = sample[0]
        if not info.get(srna_type):
            continue
        miraligner_fn = info[srna_type]
        reads = _read_miraligner(miraligner_fn)
        if not reads:
            logger.debug("WARNING::%s has NOT miRNA annotated for %s. Check if fasta files is small or species value." % (dd.get_sample_name(info), srna_type))
            continue
        # Only the second element of the result is collected for the matrix.
        _, sample_dt, _ = _tab_output(reads, miraligner_fn + ".back", dd.get_sample_name(info))
        out_dts.append(sample_dt)

    if not out_dts:
        logger.debug("WARNING::any samples have miRNA annotated for %s. Check if fasta files is small or species value." % srna_type)
        return None
    created = _create_counts(out_dts, out_dir)
    first_moved = move_safe(created[0], out_novel_isomir)
    second_moved = move_safe(created[1], out_novel_mirna)
    return [first_moved, second_moved]
Ejemplo n.º 6
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos with the adapters configured for the sample, falling back to
    the result of ``_dnapi_prediction`` when none are configured.

    Reads the first entry of ``data["files"]`` and targets a ``.clean`` file
    under ``<work dir>/trimmed/<sample name>``. An existing output is reused.
    When trimming is disabled or no adapter is available, the input is handed
    to ``symlink_plus`` instead.

    Stores the output path in ``data["clean_fastq"]`` and the results of
    ``_collapse`` / ``_summary`` in ``data["collapse"]`` / ``data["size_stats"]``.
    Returns ``[[data]]``.

    Raises ValueError when trimming is requested without configured adapters
    and ``error_dnapi`` is set, or when cutadapt options are present in the
    resources configuration.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    # Guard against an empty/None prediction before taking its length; the
    # value is only used when adapters is non-empty.
    times = "" if not adapters or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        # NOTE: the template is filled from locals(), so the local variable
        # names above are part of the contract with _cmd_atropos(); do not
        # rename or remove them without checking the template.
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path in the log with the
                    # sample name; handles are closed explicitly so the
                    # rewrite is flushed to disk.
                    with open(log_out) as log_handle:
                        content = log_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass with the user-supplied options over the
                    # output of the first pass.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" %(options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Ejemplo n.º 7
0
def cpg_het_pairs(cpgvcf, snpvcf, bam_file, out_file, workdir):
    """
    Detect het close to hemi-met sites

    Filters the CpG and SNP VCF files into ``workdir``, pairs the surviving
    records with a pybedtools window of 75 and evaluates each pair with
    ``_make_linkage``. Pairs accepted by ``_valid_test`` are written to the
    tab-delimited ``out_file`` and marked PASS; every evaluated pair is
    written to a VCF named after ``out_file``.

    Returns the result of ``_correct_vcf`` on that VCF.
    """
    out_vcf = splitext_plus(out_file)[0] + ".vcf"
    cpg_filter = op.join(workdir, op.basename(append_stem(cpgvcf, "_filtered")))
    snp_filter = op.join(workdir, op.basename(append_stem(snpvcf, "_filtered")))

    _filter_vcf_records(cpgvcf, cpg_filter, is_good_cpg)
    _filter_vcf_records(snpvcf, snp_filter, is_good_het)

    if not file_exists(out_vcf):
        pairs = pybedtools.BedTool(cpg_filter).window(snp_filter, w=75)
        with open(out_file, 'w') as out_handle, open(out_vcf, 'w') as vcf_handle:
            _create_vcf_header(cpgvcf, vcf_handle)
            out_handle.write("chrom\tCpG_pos\tCpG_nt\tSNP_pos\tAlleles\tassociation_plus\tSNP_reads_minus\n")
            for record in pairs:
                # Skip pairs where both records sit at the same position.
                if record[1] == record[11]:
                    continue
                link, link_as, _ = _make_linkage(bam_file, record[0], int(record[1]), int(record[11]), _get_strand(record))
                res = "%s\t%s\t%s\t%s\t%s/%s\t%s\t%s" % (record[0], record[1], record[3], record[11], record[13], record[14], _format(link), _format(link_as))
                chrom, pos, ref, alt, qual, filt, info, frmt, sample = _get_vcf_line(record)
                if _valid_test(link, link_as):
                    filt = "PASS"
                    out_handle.write(res + "\n")
                vcf_res = "{chrom}\t{pos}\t.\t{ref}\t{alt}\t{qual}\t{filt}\t{info}\t{frmt}\t{sample}".format(
                    chrom=chrom, pos=pos, ref=ref, alt=alt, qual=qual,
                    filt=filt, info=info, frmt=frmt, sample=sample)
                vcf_handle.write(vcf_res + "\n")
    return _correct_vcf(out_vcf)


def _filter_vcf_records(in_vcf, out_fn, is_good):
    """Copy the non-header records of in_vcf accepted by is_good into out_fn.

    Does nothing when ``out_fn`` already exists. ``is_good`` is called with
    the FORMAT/sample fields of the record as a dictionary and the full list
    of columns.
    """
    if file_exists(out_fn):
        return
    with open(out_fn, 'w') as out_handle:
        with open(in_vcf) as in_handle:
            for line in in_handle:
                if line.startswith("#"):
                    continue
                record = line.strip().split("\t")
                keys, values = record[8], record[9]
                frmt = dict(zip(keys.split(":"), values.split(":")))
                if is_good(frmt, record):
                    # Each record is written with exactly one line terminator.
                    out_handle.write(line.rstrip("\n") + "\n")
Ejemplo n.º 8
0
def _calculate_percentiles(in_file, sample, data=None, cutoffs=None):
    """
    Parse pct bases per region to summarize it in
    7 different pct of regions points with pct bases covered
    higher than a completeness cutoff (5, 10, 20, 50 ...)
    """
    has_data = False
    with open(in_file) as in_handle:
        for i, line in enumerate(in_handle):
            if i > 0:
                has_data = True
                break
    if not has_data:
        return []
    out_file = append_stem(in_file, "_summary")
    out_total_file = append_stem(in_file, "_total_summary")
    if not utils.file_exists(out_file) or not utils.file_exists(out_total_file):
        dt = pd.read_csv(in_file, sep="\t", index_col=False)
        pct = dict()
        pct_bases = dict()
        size = np.array(dt["chromEnd"]) - np.array(dt["chromStart"])
        for cutoff in [h for h in list(dt) if h.startswith("percentage")]:
            if cutoffs and int(cutoff.split("percentage")[1]) in cutoffs:
                a = np.array(dt[cutoff])
                for p_point in [0.01, 10, 25, 50, 75, 90, 99.9]:
                    q = np.percentile(a, p_point)
                    pct[(cutoff, p_point)] = q
                pct_bases[cutoff] = sum(size * a) / float(sum(size))

        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >>out_handle, "cutoff_reads\tbases_pct\tsample"
                for k in pct_bases:
                    print >>out_handle, "\t".join(map(str, [k, pct_bases[k], sample]))
        with file_transaction(data, out_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >>out_handle, "cutoff_reads\tregion_pct\tbases_pct\tsample"
                for k in pct:
                    print >>out_handle, "\t".join(map(str, [k[0], k[1], pct[k], sample]))
    # To move metrics to multiqc, will remove older files
    # when bcbreport accepts these one, to avoid errors
    # while porting everything to multiqc
    # These files will be copied to final
    out_file_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage.txt" % sample)
    out_total_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_file, out_file_fixed)
    copy_plus(out_total_file, out_total_fixed)
    return [out_file_fixed, out_total_fixed]
Ejemplo n.º 9
0
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.

    Alignment happens in ``<work dir>/align/<sample>`` (plus the disambiguate
    genome build when configured). Input ending in ``.bam`` is passed to
    ``_align_from_bam``, anything else to ``_align_from_fastq``. The resulting
    ``work_bam`` and any existing "-sr" / "-disc" companions are indexed.

    Returns the updated ``data`` dictionary.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    # Argument unpacking replaces the deprecated apply() builtin, which no
    # longer exists in Python 3.
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    # The shared prefix of all index files identifies the index base name.
    aligner_indexes = os.path.commonprefix(tz.get_in(("reference", aligner, "indexes"), data))
    if aligner_indexes.endswith("."):
        aligner_indexes = aligner_indexes[:-1]
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_indexes, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_indexes, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Ejemplo n.º 10
0
def align(pair):
    """Align reads with bowtie2 and return the path of the sorted BAM file.

    ``pair`` is a sequence of fastq paths: two entries are aligned as a pair,
    otherwise only the first entry is aligned. Outputs are written to an
    ``align`` directory below the current working directory, and each step is
    skipped when its output already exists.
    """
    import os
    from bcbio.utils import file_exists, replace_suffix, append_stem, safe_makedir
    import subprocess
    safe_makedir("align")
    genome = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/bowtie2/GRCh37"
    out_sam = os.path.join("align", os.path.basename(replace_suffix(pair[0], ".sam")))
    out_bam = replace_suffix(out_sam, ".bam")
    # Named sorted_bam so the builtin sorted() is not shadowed.
    sorted_bam = append_stem(out_bam, "_sorted")
    sorted_prefix = os.path.splitext(sorted_bam)[0]
    # "samtools index in.bam" writes in.bam.bai, so that is the file to look
    # for when deciding whether indexing can be skipped.
    out_index = sorted_bam + ".bai"
    # Commands are passed as argument lists (no shell), so paths containing
    # spaces or shell metacharacters are handled safely.
    if not file_exists(out_sam):
        cmd = ["bowtie2", "-S", out_sam, genome]
        if len(pair) == 2:
            fq1, fq2 = pair
            cmd += ["-1", fq1, "-2", fq2]
        else:
            cmd.append(pair[0])
        subprocess.check_call(cmd)
    if not file_exists(out_bam):
        subprocess.check_call(["samtools", "view", "-S", out_sam, "-b", "-o", out_bam])
    if not file_exists(sorted_bam):
        # Legacy "samtools sort <in.bam> <out.prefix>" calling convention.
        subprocess.check_call(["samtools", "sort", out_bam, sorted_prefix])
    if not file_exists(out_index):
        subprocess.check_call(["samtools", "index", sorted_bam])
    return sorted_bam
Ejemplo n.º 11
0
def _collapse(in_file):
    """Run ``collapse`` on in_file and write the result, unless already done.

    The output name is in_file with ".trimming" appended to the stem and any
    ".gz" removed. Returns that output path.
    """
    collapsed_fn = append_stem(in_file, ".trimming").replace(".gz", "")
    if not file_exists(collapsed_fn):
        write_output(collapsed_fn, collapse(in_file))
    return collapsed_fn
Ejemplo n.º 12
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Runs lumpy once over all ``items`` and then derives a per-sample BEDPE
    (and VCF, when one was produced) for each of them. Every sample gets a
    "lumpy" entry appended to its ``sv`` list.

    Returns the list of updated samples. Raises ValueError when a sample is
    configured with an aligner other than bwa (or none).
    """
    accepted_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in accepted_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    first = items[0]
    work_dir = utils.safe_makedir(
        os.path.join(first["dirs"]["work"], "structural", first["name"][-1], "lumpy"))
    full_bams = []
    sr_bams = []
    disc_bams = []
    for data in items:
        cur_full, cur_sr, cur_disc = sshared.get_split_discordants(data, work_dir)
        full_bams.append(cur_full)
        sr_bams.append(cur_sr)
        disc_bams.append(cur_disc)
    pebed_file, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    sample_config_file = _write_samples_to_ids(pebed_file, items)
    lumpy_vcf = _bedpe_to_vcf(pebed_file, sample_config_file, items)
    out = []
    for i, data in enumerate(items):
        data.setdefault("sv", [])
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_bedpe = _filter_by_support(_subset_to_sample(pebed_file, i, data), i, data)
        sample_vcf = None
        if lumpy_vcf:
            # Pull this sample out of the combined VCF, then restrict it to
            # the calls kept in the sample BEDPE.
            selected_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                                  utils.append_stem(lumpy_vcf, "-%s" % sample),
                                                  data["config"])
            sample_vcf = _filter_by_bedpe(selected_vcf, sample_bedpe, data)
        call = {"variantcaller": "lumpy",
                "vrn_file": sample_vcf,
                "exclude_file": exclude_file,
                "bedpe_file": sample_bedpe,
                "sample_bed": sample_config_file}
        data["sv"].append(call)
        out.append(data)
    return out
Ejemplo n.º 13
0
def _filter_by_support(orig_file, index):
    """Filter call file based on supporting evidence, adding pass/filter annotations to BEDPE.

    Filters based on the following criteria:
      - Multiple forms of evidence in any sample (split and paired end)
      - Minimum read support for the call.

    Each line of ``orig_file`` gets one extra tab-delimited column holding
    "ApproachSupport", "ReadCountSupport" or "PASS". ``index`` selects the
    sample in the support mapping whose read counts are checked.

    Returns the path of the annotated file (``orig_file`` with "-filter"
    appended to the stem); an existing file is reused.
    """
    min_read_count = 4
    out_file = utils.append_stem(orig_file, "-filter")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with open(orig_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        parts = line.rstrip().split("\t")
                        support = _get_support(parts)
                        # Union of the evidence types seen in any sample.
                        # Built with set.update instead of adding .keys()
                        # lists, which only works on Python 2 and fails when
                        # support is empty.
                        evidence = set()
                        for sample_support in support.values():
                            evidence.update(sample_support)
                        read_count = sum(support[index].values())
                        if len(evidence) < 2:
                            lfilter = "ApproachSupport"
                        elif read_count < min_read_count:
                            lfilter = "ReadCountSupport"
                        else:
                            lfilter = "PASS"
                        parts.append(lfilter)
                        out_handle.write("\t".join(parts) + "\n")
    return out_file
Ejemplo n.º 14
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Runs lumpy once over all ``items``, then for each sample selects its calls
    from the combined VCF, passes them through ``_run_svtyper`` and
    ``_filter_by_support``, and appends a "lumpy" entry to the sample's ``sv``
    list.

    Returns the list of updated samples. Raises ValueError when a sample is
    configured with an aligner other than bwa (or none).
    """
    accepted_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in accepted_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    # Work in the tumor sample's directory for paired analyses, otherwise in
    # the first sample's.
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])
    full_bams = []
    sr_bams = []
    disc_bams = []
    for data in items:
        cur_full, cur_sr, cur_disc = sshared.get_split_discordants(data, work_dir)
        full_bams.append(cur_full)
        sr_bams.append(cur_sr)
        disc_bams.append(cur_disc)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    out = []
    for data in items:
        data.setdefault("sv", [])
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        per_sample_name = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, per_sample_name, data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        filter_vcf = _filter_by_support(gt_vcf, data)
        call = {"variantcaller": "lumpy",
                "vrn_file": filter_vcf,
                "exclude_file": exclude_file}
        data["sv"].append(call)
        out.append(data)
    return out
Ejemplo n.º 15
0
def trim_read_through(fastq_files, dirs, lane_config):
    """
    Trim adapter read-through from the 3' end of reads with cutadapt.

    For small insert sizes the read can be longer than the insert, so the
    reverse complement of the 3' adapter gets sequenced. Only that reverse
    complement is trimmed:

    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)

    Returns the ".fixed" files, reusing them when they all exist already;
    otherwise they come from ``remove_short_reads`` run on the trimmed output.
    """
    quality_format = _get_quality_format(lane_config)
    to_trim = _get_sequences_to_trim(lane_config, SUPPORTED_ADAPTERS)
    out_files = _get_read_through_trimmed_outfiles(fastq_files, dirs)
    previous_results = append_stem(out_files, ".fixed")
    if all(map(file_exists, previous_results)):
        return previous_results

    adapter_text = ", ".join(to_trim)
    input_text = ", ".join(fastq_files)
    logger.info("Trimming %s from the 3' end of reads in %s using "
                "cutadapt." % (adapter_text, input_text))
    cores = lane_config["algorithm"].get("num_cores", 1)
    trimmed_files = _cutadapt_trim(fastq_files, quality_format,
                                   to_trim, out_files, cores)
    return remove_short_reads(trimmed_files, dirs, lane_config)
Ejemplo n.º 16
0
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.

    Alignment happens in ``<work dir>/align/<sample>`` (plus the disambiguate
    genome build when configured). Input ending in ``.bam`` is passed to
    ``_align_from_bam``, anything else to ``_align_from_fastq``. The resulting
    ``work_bam`` and any existing "-sr" / "-disc" companions are indexed.

    Returns the updated ``data`` dictionary.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    # Argument unpacking replaces the deprecated apply() builtin, which no
    # longer exists in Python 3.
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with with bamsormadup need query sorted inputs
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Ejemplo n.º 17
0
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    Reads the bgzipped ``data["vrn_file"]`` and writes a "-vepeffects" file
    next to it, unless that file already exists. Nothing is run when
    ``prep_vep_cache`` provides no cache directory.

    Returns the annotated file after ``bgzip_and_index``, or None when no
    output file was produced.
    """
    vrn_file = data["vrn_file"]
    out_file = utils.append_stem(vrn_file, "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            ref_base = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"], ref_base)
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                all_fields = ",".join(std_fields + dbnsfp_fields + loftee_fields)
                # Assemble the command in the same argument order as before:
                # base call, fork and user options, fixed flags, plugin args.
                vep_cmd = [vep, "--vcf", "-o", "stdout"]
                vep_cmd = vep_cmd + fork_args + extra_args
                vep_cmd = vep_cmd + ["--species", ensembl_name,
                                     "--no_stats",
                                     "--cache", "--offline", "--dir", vep_dir,
                                     "--sift", "b", "--polyphen", "b", "--symbol", "--numbers",
                                     "--biotype", "--total_length",
                                     "--canonical", "--ccds",
                                     "--fields", all_fields]
                vep_cmd = vep_cmd + dbnsfp_args + loftee_args
                # Decompress the input, annotate and recompress in one pipeline.
                pipeline = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(vep_cmd), tx_out_file)
                do.run(pipeline, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Ejemplo n.º 18
0
def group_batches(xs):
    """Group samples into batches for simultaneous variant calling.

    Identify all samples to call together: those in the same batch,
    variant caller and genomic region.
    Pull together all BAM files from this batch and process together,
    Provide details to pull these finalized files back into individual
    expected files.

    ``xs`` is an iterable of ``(data, region, out_fname)`` tuples. Samples
    without a ``metadata -> batch`` value are passed through unchanged.

    Returns a tuple ``(items, remap_batches)``: ``items`` lists the unbatched
    tuples followed by one ``(batch_data, region, batch_fname)`` tuple per
    batch, and ``remap_batches`` maps each ``batch_fname`` to the
    ``(data, out_fname)`` pairs it was built from.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for data, region, out_fname in xs:
        batch = data.get("metadata", {}).get("batch")
        caller = data["config"]["algorithm"].get("variantcaller", "gatk")
        if batch is not None:
            batch_groups[(batch, region, caller)].append((data, out_fname))
        else:
            singles.append((data, region, out_fname))
    batches = []
    remap_batches = {}
    # items() works on Python 2 and 3 (iteritems() is Python 2 only); the
    # group is bound to its own name so the xs parameter is not overwritten.
    for (batch, region, _), members in batch_groups.items():
        # The first sample of the group is the template for the batch entry.
        cur_data, cur_fname = members[0]
        # NOTE(review): the batch file name does not include the caller, so
        # groups differing only by caller share a remap_batches key -- confirm
        # that cannot happen upstream.
        batch_fname = utils.append_stem(cur_fname, "-" + batch)
        batch_data = copy.deepcopy(cur_data)
        batch_data["work_bam"] = [x[0]["work_bam"] for x in members]
        batch_data["group"] = batch_fname
        batches.append((batch_data, region, batch_fname))
        remap_batches[batch_fname] = members
    return singles + batches, remap_batches
Ejemplo n.º 19
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    Writes a "-vepeffects" file next to the bgzipped ``in_file`` unless it
    already exists. Nothing is run when ``prep_vep_cache`` provides no cache
    directory.

    Returns None when ``in_file`` has no variants or no output was produced,
    otherwise the annotated file after ``bgzip_and_index``.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            ref_base = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"], ref_base)
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                compressed_ref = dd.get_ref_file_compressed(data)
                hgvs_compatible = bool(compressed_ref)
                if hgvs_compatible:
                    config_args = ["--fasta", compressed_ref]
                else:
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins.extend(["maxentscan", "spliceregion"])
                    for plugin in plugins:
                        config_args.extend(plugin_fns[plugin](data))
                    config_args.extend(["--sift", "b", "--polyphen", "b"])
                    if hgvs_compatible:
                        config_args.extend(["--hgvs", "--shift_hgvs", "1"])
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args.append("--pick_allele")
                if ensembl_name.endswith("_merged"):
                    # The merged cache is requested by flag; the species name
                    # is passed without the suffix.
                    config_args.append("--merged")
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                fixed_args = ["--species", ensembl_name,
                              "--no_stats", "--cache",
                              "--offline", "--dir", vep_dir,
                              "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                              "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                              "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg",
                              "--af_esp", "--af_gnomad",
                              "--pubmed", "--variant_class", "--allele_number"]
                vep_cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                vep_cmd = vep_cmd + fork_args + extra_args + fixed_args + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                pipeline = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cmd), tx_out_file)
                do.run(pipeline, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Ejemplo n.º 20
0
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.

    When ``data["combine"]`` is set it must hold exactly one key, naming the
    entry of ``data`` to merge. Inputs are the de-duplicated, sorted union of
    that entry's "extras" (nested lists/tuples are flattened one level) and
    the current ``data[file_key]`` value, if any. Supplementary files listed
    under ``data[file_key + "-plus"]`` are merged per extension when matching
    "-<ext>" inputs exist on disk.

    ``data`` is updated in place: merged paths are stored back, and the
    "region" and "combine" entries are removed. Without a truthy "combine"
    entry, ``data`` is passed through untouched.

    Returns ``[[data]]``.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        # list() keeps the lookup working when keys() is a view and not a list.
        file_key = list(data["combine"].keys())[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        if file_key in data:
            extras.append(data[file_key])
        in_files = sorted(set(extras))
        out_file = tz.get_in(["combine", file_key, "out"], data, _merge_out_from_infiles(in_files))
        # Snapshot the supplementary extensions as a list so the "" entry for
        # the primary file can be appended regardless of the keys() type.
        sup_exts = list(data.get(file_key + "-plus", {}).keys())
        for ext in sup_exts + [""]:
            merged_file = None
            if ext:
                # Supplementary file: only merge the "-<ext>" inputs that exist.
                cur_in_files = list(filter(os.path.exists, (utils.append_stem(f, "-" + ext) for f in in_files)))
                cur_out_file = utils.append_stem(out_file, "-" + ext) if len(cur_in_files) > 0 else None
            else:
                # Primary file: merge every collected input.
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                # Work on a copy so the caller's configuration is not altered.
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                if len(cur_in_files) > 0:
                    merged_file = merge_bam_files(
                        cur_in_files, os.path.dirname(cur_out_file), config, out_file=cur_out_file
                    )
                else:
                    assert os.path.exists(cur_out_file)
                    merged_file = cur_out_file
            if merged_file:
                if ext:
                    data[file_key + "-plus"][ext] = merged_file
                else:
                    data[file_key] = merged_file
        data.pop("region", None)
        data.pop("combine", None)
    return [[data]]
Ejemplo n.º 21
0
def _change_sample_name(in_file):
    """Fix name in feature counts log file to get the same
       name in multiqc report.

    Writes a copy of ``in_file`` with every ".nsorted.primary" occurrence
    removed, followed by a trailing newline, to a sibling file whose stem
    carries the "_fixed" suffix.

    Returns the path of the fixed file.
    """
    out_file = append_stem(in_file, "_fixed")
    with file_transaction(out_file) as tx_out:
        # Read through a context manager so the input handle is always closed.
        with open(in_file) as in_handle:
            content = in_handle.read()
        with open(tx_out, "w") as out_handle:
            # Explicit write (plus the newline the print statement used to add)
            # behaves the same under Python 2 and Python 3.
            out_handle.write(content.replace(".nsorted.primary", "") + "\n")
    return out_file
Ejemplo n.º 22
0
def _collapse(in_file):
    """
    Collapse reads into unique sequences with seqcluster

    The result is written next to ``in_file`` with a ".trimming" stem suffix
    and any ".gz" removed from the name; an output that file_exists already
    reports is reused as is. Returns the output path.
    """
    collapsed_file = append_stem(in_file, ".trimming").replace(".gz", "")
    if not file_exists(collapsed_file):
        unique_seqs = collapse(in_file)
        write_output(collapsed_file, unique_seqs, minimum=1, size=16)
    return collapsed_file
Ejemplo n.º 23
0
def _prepare_file(fn, out_dir):
    """Cut the beginning of the reads to avoid detection of miRNAs

    Runs ``atropos trim --max-reads 500000 -u 22`` on ``fn`` and writes the
    result into ``out_dir`` under the input's base name with "end" appended to
    its stem. An output that file_exists already reports is reused.
    Returns the output path.
    """
    atropos = _get_atropos()
    trimmed_name = append_stem(os.path.basename(fn), "end")
    out_file = os.path.join(out_dir, trimmed_name)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_file:
            trim_cmd = "{atropos} trim --max-reads 500000 -u 22 -se {fn} -o {tx_file}".format(
                atropos=atropos, fn=fn, tx_file=tx_file)
            do.run(trim_cmd)
    return out_file
Ejemplo n.º 24
0
def trim_srna_sample(data):
    """Run the adapter-removal command on a sample's first input file.

    The command template comes from ``_cmd_cutadapt()`` and is filled in from
    this function's local variables. Outputs go to
    ``<work_dir>/trimmed/<sample name>/``. The command is skipped when
    ``utils.file_exists`` already reports the ".clean" output.

    Updates ``data`` in place with:
      - "clean_fastq": path of the ".clean" file
      - "collapse": result of ``_collapse`` on the clean file
      - "size_stats": result of ``_summary`` on the collapsed file

    Returns ``[[data]]``.
    """
    # Only the first configured adapter is used.
    adapter = dd.get_adapters(data)[0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    # Only the first input file is processed.
    in_file = data["files"][0]
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    # NOTE(review): the locals below are not used directly here; presumably the
    # template from _cmd_cutadapt() references them by name via **locals(), so
    # they must not be renamed or removed without checking that template.
    out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
    out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
    # cutadapt is looked up next to the running Python interpreter.
    cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
    cmd = _cmd_cutadapt()
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            # Every local variable in scope is available to the template.
            do.run(cmd.format(**locals()), "remove adapter")
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Ejemplo n.º 25
0
def _prepare_bam(bam_file, bed_file, config):
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
    samtools = config_utils.get_program("samtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{samtools} view -bh -L {bed_file} {bam_file} > {tx_out}"
            do.run(cmd.format(**locals()), "Clean %s" % bam_file)
    return out_file
Ejemplo n.º 26
0
def run(items):
    """Detect structural variants with lumpy for a set of bwa-mem aligned samples.

    Collects full, split-read and discordant BAMs plus any deletion/duplication
    evidence from a CNV caller for every item, runs lumpy once on all of them,
    then genotypes and support-filters the calls per sample. For a paired
    analysis with a normal sample, calls are additionally filtered against the
    background.

    Each item gets a dictionary appended to its "sv" list with the keys
    "variantcaller" ("lumpy"), "vrn_file" and "exclude_file".

    Raises ValueError when an item's configured aligner is not one of
    "bwa", "sentieon-bwa", False or None.

    Returns the list of updated items.
    """
    allowed_aligners = ["bwa", "sentieon-bwa", False, None]
    for data in items:
        if utils.get_in(data, ("config", "algorithm", "aligner")) not in allowed_aligners:
            raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])

    # Gather the per-sample inputs for the joint lumpy run.
    full_bams, sr_bams, disc_bams = [], [], []
    previous_evidence = {}
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        evidence = {}
        for label, bedpe in (("dels", cur_dels), ("dups", cur_dups)):
            if bedpe and utils.file_exists(bedpe):
                evidence[label] = bedpe
        previous_evidence[dd.get_sample_name(data)] = evidence
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)

    # Genotype each sample's subset of the joint calls.
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" not in dd.get_tools_on(data):
            # Breakends are split off, left ungenotyped and concatenated back.
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            combined_out = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file=combined_out,
                config=data["config"])
        else:
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        gt_vcfs[sample] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)

    # Attach the final calls to every item.
    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf = None
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        lumpy_call = {"variantcaller": "lumpy",
                      "vrn_file": effects_vcf or vcf_file,
                      "exclude_file": exclude_file}
        data["sv"].append(lumpy_call)
        out.append(data)
    return out
Ejemplo n.º 27
0
 def test_2_vcf_exclusion(self):
     """Exclude samples from VCF files.

     Removes any previous "-exclude" output, then calls
     vcfutils.exclude_samples on the combined VCF to drop sample "S1".
     """
     genome_fasta = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
     config_path = os.path.join(self.data_dir, "automated",
                                "post_process-sample.yaml")
     config = load_config(config_path)
     excluded_vcf = utils.append_stem(self.combo_file, "-exclude")
     samples_to_drop = ["S1"]
     # Start from a clean slate so a stale output cannot mask a failure.
     if os.path.exists(excluded_vcf):
         os.remove(excluded_vcf)
     vcfutils.exclude_samples(self.combo_file, excluded_vcf, samples_to_drop, genome_fasta, config)
Ejemplo n.º 28
0
def _correct_vcf(vcf_file):
    """
    Sort a VCF with vcf-sort, compress it with bgzip and index it with tabix.

    The output is named after the input with a "_sort" stem suffix plus ".gz".
    An output that file_exists already reports is reused without re-running
    the commands. Returns the output path.
    """
    sorted_vcf = append_stem(vcf_file, "_sort") + ".gz"
    if file_exists(sorted_vcf):
        return sorted_vcf
    with file_transaction(sorted_vcf) as tx_out:
        sort_cmd = "cat {vcf_file} |vcf-sort | bgzip  > {tx_out}".format(
            vcf_file=vcf_file, tx_out=tx_out)
        do.run(sort_cmd, "sort %s" % vcf_file)
        # Index the transactional file before it leaves the transaction block.
        do.run("tabix -f {0}".format(tx_out), "")
    return sorted_vcf
Ejemplo n.º 29
0
def _prepare_bam(bam_file, bed_file, config):
    """Remove regions from bed files"""
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
    bedtools = config_utils.get_program("bedtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{bedtools} subtract -nonamecheck -A -a {bam_file} -b {bed_file} > {tx_out}"
            do.run(cmd.format(**locals()), "Remove blacklist regions from %s" % bam_file)
    return out_file
Ejemplo n.º 30
0
def mark_duplicates(sam_file):
    """Run Picard FixMateInformation, then MarkDuplicates, on an alignment file.

    Each step is skipped when file_exists already reports its output. The
    mate-fixed file gets a "_matefixed" stem suffix. The duplicate-marked file
    adds "_dupemarked" to that, and its metrics go to a matching "_stats" file
    with a ".txt" suffix. The Picard jar locations and JVM memory options are
    hard-coded.

    Returns the path of the duplicate-marked file. A failing command raises
    subprocess.CalledProcessError.
    """
    import subprocess
    from bcbio.utils import file_exists, replace_suffix, append_stem

    java_memory = "-Xms750m -Xmx2000m"
    fix_mate_jar = "/n/HSPH/local/share/java/picard/FixMateInformation.jar"
    mark_dup_jar = "/n/HSPH/local/share/java/picard/MarkDuplicates.jar"

    # Step 1: fix mate information.
    mate_fixed_file = append_stem(sam_file, "_matefixed")
    if not file_exists(mate_fixed_file):
        fix_cmd = ("java {jvm_opts} -jar {fm} INPUT={sam_file} "
                   "OUTPUT={mate_fixed_file}").format(
                       jvm_opts=java_memory, fm=fix_mate_jar,
                       sam_file=sam_file, mate_fixed_file=mate_fixed_file)
        subprocess.check_call(fix_cmd, shell=True)

    # Step 2: mark duplicates on the mate-fixed file.
    out_file = append_stem(mate_fixed_file, "_dupemarked")
    stats_file = replace_suffix(append_stem(mate_fixed_file, "_stats"), ".txt")
    if not file_exists(out_file):
        dup_cmd = ("java {jvm_opts} -jar {md} INPUT={sam_file} "
                   "OUTPUT={out_file} METRICS_FILE={stats_file} "
                   "VALIDATION_STRINGENCY=LENIENT").format(
                       jvm_opts=java_memory, md=mark_dup_jar,
                       sam_file=mate_fixed_file, out_file=out_file,
                       stats_file=stats_file)
        subprocess.check_call(dup_cmd, shell=True)
    return out_file