Example #1
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
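    # Base STAR command: write unmapped reads to FASTQ (--outReadsUnmapped Fastx), allow up to
    # 10 multimapping hits, and keep unmapped reads in the SAM output (--outSAMunmapped Within)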
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (fastq_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out
Example #2
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
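    # With split alignment, stream each chunk through a named pipe and set up the later combine
    # step; otherwise convert Illumina-scaled quality scores to standard encoding on the fly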
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if not _can_use_mem(fastq_file, data):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                  names, rg_info, data)
    data["work_bam"] = out_file
    return data
Example #3
def align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
          names=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
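        # bwa sampe/samse expects: reference, sai file(s), then the original fastq file(s), in that order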
        sam_cl = [config_utils.get_program("bwa", config), align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file
Example #4
def combine_bed_by_size(input_beds, sample, work_dir, data, delim=","):
    """Combine a set of BED files, breaking into individual size chunks.
    """
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if len(input_beds) > 0:
        size_beds = []
        for e_start, e_end in validate.EVENT_SIZES:
            base, ext = os.path.splitext(out_file)
            size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
            if not utils.file_exists(size_out_file):
                with file_transaction(data, size_out_file) as tx_out_file:
                    with shared.bedtools_tmpdir(data):
                        all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                        has_regions = False
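                        # Keep calls whose span falls in the current size bin; breakend (BND) events are always retained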
                        with open(all_file, "w") as out_handle:
                            for line in fileinput.input(input_beds):
                                chrom, start, end, event_str = line.split()[:4]
                                event = event_str.split("_", 1)[0]
                                size = int(end) - int(start)
                                if (size >= e_start and size < e_end) or event == "BND":
                                    out_handle.write(line)
                                    has_regions = True
                        if has_regions:
                            pybedtools.BedTool(all_file).sort(stream=True)\
                              .merge(c=4, o="distinct", delim=delim).saveas(tx_out_file)
            if utils.file_exists(size_out_file):
                ann_size_out_file = annotate.add_genes(size_out_file, data)
                size_beds.append(ann_size_out_file)
        if len(size_beds) > 0:
            out_file = bedutils.combine(size_beds, out_file, data)
    return out_file
Example #5
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query-sorted inputs
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
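        # Also index split-read (-sr) and discordant (-disc) BAMs if the alignment step produced them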
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Example #6
File: rseqc.py Project: roryk/bipy
def _fetch_chrom_sizes(config):

    PROGRAM = "fetchChromSizes"

    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/"
                     "to download it." % (PROGRAM))
        exit(1)

    if "annotation" not in config:
        logger.error("'annotation' must be in the yaml file. See example "
                     " configuration files")
        exit(1)
    if "name" not in config["annotation"]:
        logger.error("'name' must be in the yaml file under  "
                     " 'annotation'. See example configuration files.")
        exit(1)
    genome = config["annotation"]["name"]
    chrom_size_file = os.path.join(_results_dir(config),
                                   genome + ".sizes")
    if file_exists(chrom_size_file):
        return chrom_size_file

    with file_transaction(chrom_size_file) as tmp_chrom_size_file:
        sh.fetchChromSizes(genome, _out=tmp_chrom_size_file)

    if not file_exists(chrom_size_file):
        logger.error("chromosome size file does not exist. Check "
                     "'annotation': 'name' to make sure it is valid.")
        exit(1)
    return chrom_size_file
Example #7
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
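                # Pipe novoalign SAM output straight into the sort/BAM conversion command from tobam_cl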
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Example #8
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
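                        # Name-sort the BAM, convert to SAM and stream through samblaster, which marks
                        # duplicates while writing split and discordant reads to separate files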
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Example #9
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
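            # Assemble the AMBER command: tumor/normal sample names and BAMs plus a BED of likely heterozygous germline sites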
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
                raise
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
Example #10
def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(work_dir, "%s-svs%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type, chrom))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            if not _has_variant_regions(items, out_file, chrom):
                vcfutils.write_empty_vcf(tx_out_file)
            else:
                exclude = ["-x", prepare_exclude_file(items, out_file, chrom)]
                cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
                multi_cmd = "export OMP_NUM_THREADS=%s && " % cores
                try:
                    do.run(multi_cmd + " ".join(cmd), "delly structural variant")
                    # Delly will write nothing if no variants found
                    if not utils.file_exists(tx_out_file):
                        vcfutils.write_empty_vcf(tx_out_file)
                except subprocess.CalledProcessError as msg:
                    # delly returns an error exit code if there are no variants
                    if "No structural variants found" in str(msg):
                        vcfutils.write_empty_vcf(tx_out_file)
                    else:
                        raise
Example #11
def split_gtf(gtf, sample_size=None, out_dir=None):
    """
    split a GTF file into two equal parts, randomly selecting genes.
    sample_size will select up to sample_size genes in total
    """
    if out_dir:
        part1_fn = os.path.basename(os.path.splitext(gtf)[0]) + ".part1.gtf"
        part2_fn = os.path.basename(os.path.splitext(gtf)[0]) + ".part2.gtf"
        part1 = os.path.join(out_dir, part1_fn)
        part2 = os.path.join(out_dir, part2_fn)
        if file_exists(part1) and file_exists(part2):
            return part1, part2
    else:
        part1 = tempfile.NamedTemporaryFile(delete=False, suffix=".part1.gtf").name
        part2 = tempfile.NamedTemporaryFile(delete=False, suffix=".part2.gtf").name

    db = get_gtf_db(gtf)
    gene_ids = set([x['gene_id'][0] for x in db.all_features()])
    if not sample_size or sample_size > len(gene_ids):
        sample_size = len(gene_ids)
    gene_ids = set(random.sample(gene_ids, sample_size))
    part1_ids = set(random.sample(gene_ids, sample_size // 2))
    part2_ids = gene_ids.difference(part1_ids)
    with open(part1, "w") as part1_handle:
        for gene in part1_ids:
            for feature in db.children(gene):
                part1_handle.write(str(feature) + "\n")
    with open(part2, "w") as part2_handle:
        for gene in part2_ids:
            for feature in db.children(gene):
                part2_handle.write(str(feature) + "\n")
    return part1, part2
Example #12
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    if config is None:
        config = {}
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if (not utils.file_exists(out_file) or not os.path.lexists(out_file)
          or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}"
                try:
                    do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: ignore errors where file has been deleted by another
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:  # Handle cases where run in parallel and file has been deleted
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file
Example #13
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes low complexity regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    ready_region = (shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
                    if len(all_vrs) > 0 else chrom)
    with shared.bedtools_tmpdir(items[0]):
        # Get a bedtool for the full region if no variant regions
        if ready_region == chrom:
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        else:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            with file_transaction(out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
Example #14
def make_large_exons_gtf(gtf_file):
    """
    Save all exons > 1000 bases to a separate file for estimating the
    insert size distribution
    """
    out_dir = os.path.abspath(os.path.join(os.path.dirname(gtf_file), "tophat"))
    out_file = os.path.join(out_dir, "large_exons.gtf")

    if file_exists(out_file):
        return out_file

    dbfn = gtf_file + ".db"
    if not file_exists(dbfn):
        db = gffutils.create_db(gtf_file, dbfn=dbfn, keep_order=True,
                                merge_strategy='merge', force=False,
                                infer_gene_extent=False)
    else:
        db = gffutils.FeatureDB(dbfn)
    processed_count = 0
    kept_exons = []
    for exon in db.features_of_type('exon'):
        processed_count += 1
        if processed_count % 10000 == 0:
            print("Processed %d exons." % processed_count)
        if exon.end - exon.start > 1000:
            kept_exons.append(exon)

    with open(out_file, "w") as out_handle:
        print("Writing %d large exons to %s." % (processed_count,
                                                 out_file))
        for exon in kept_exons:
            out_handle.write(str(exon) + "\n")
    return out_file
Example #15
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            index_file = None
            # Skip trying to use indices now as they provide only slight speed-ups
            # and give inconsistent outputs in BAM headers
            # If a single index present, index_dir points to that
            # if index_dir and os.path.isfile(index_dir):
            #     index_dir = os.path.dirname(index_dir)
            #     index_file = os.path.join(index_dir, "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            if not index_file or not os.path.exists(index_file):
                index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ")
            do.run(cmd.format(**locals()) + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Example #16
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
Example #17
def update_loc_file(galaxy_base, loc_type, genome_build, ref_loc):
    ref_loc = os.path.abspath(ref_loc)
    loc_file = get_loc_file(galaxy_base, loc_type)
    if not loc_file:
        return None
    formatter = get_locformatter(loc_type)
    builds = []
    tmp_out = tempfile.NamedTemporaryFile(delete=False).name
    if file_exists(loc_file):
        with open(loc_file) as in_handle, open(tmp_out, "w") as out_handle:
            for line in in_handle:
                if line.startswith("#"):
                    out_handle.write(line)
                else:
                    parts = line.strip().split()
                    build = parts[1]
                    builds.append(build)
                    if build != genome_build:
                        out_handle.write(line)
                    else:
                        out_handle.write(formatter(genome_build, ref_loc))
        shutil.copyfile(tmp_out, loc_file)
    if genome_build not in builds or not file_exists(loc_file):
        with open(loc_file, "a") as out_handle:
            out_handle.write(formatter(genome_build, ref_loc))
    return loc_file
Example #18
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
                if transcript_file:
                    transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                else:
                    ann_opt = ""
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
        with file_transaction(data, out_file) as tx_out_file:
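            # vawk filter: keep PASS or unfiltered records where the sample genotype is not 0/0,
            # then print tab-delimited caller, sample, coordinates and annotation columns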
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
Example #19
def _combine_sample_regions_batch(batch, items):
    """Combine sample regions within a group of batched samples.
    """
    config = items[0]["config"]
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "regions"))
    analysis_file = os.path.join(work_dir, "%s-analysis_blocks.bed" % batch)
    no_analysis_file = os.path.join(work_dir, "%s-noanalysis_blocks.bed" % batch)
    if not utils.file_exists(analysis_file) or _needs_region_update(analysis_file, items):
        # Combine all nblocks into a final set of intersecting regions
        # without callable bases. HT @brentp for intersection approach
        # https://groups.google.com/forum/?fromgroups#!topic/bedtools-discuss/qA9wK4zN8do
        bed_regions = [pybedtools.BedTool(x["regions"]["nblock"])
                       for x in items if "regions" in x]
        if len(bed_regions) == 0:
            analysis_file, no_analysis_file = None, None
        else:
            with file_transaction(items[0], analysis_file, no_analysis_file) as (tx_afile, tx_noafile):
                def intersect_two(a, b):
                    return a.intersect(b, u=True, nonamecheck=True)
                nblock_regions = reduce(intersect_two, bed_regions).saveas(
                    "%s-nblock%s" % utils.splitext_plus(tx_afile))
                ref_file = tz.get_in(["reference", "fasta", "base"], items[0])
                ref_regions = get_ref_bedtool(ref_file, config)
                min_n_size = int(config["algorithm"].get("nomap_split_size", 250))
                block_filter = NBlockRegionPicker(ref_regions, config, min_n_size)
                final_nblock_regions = nblock_regions.filter(
                    block_filter.include_block).saveas().each(block_filter.expand_block).saveas(
                        "%s-nblockfinal%s" % utils.splitext_plus(tx_afile))
                final_regions = ref_regions.subtract(final_nblock_regions, nonamecheck=True).merge(d=min_n_size)
                _write_bed_regions(items[0], final_regions, tx_afile, tx_noafile)
    if analysis_file and utils.file_exists(analysis_file):
        return analysis_file, no_analysis_file
    else:
        return None, None
Example #20
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd.format(**locals()), "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
Example #21
    def get_summary_metrics(self, align_metrics, dup_metrics,
            insert_metrics=None, hybrid_metrics=None, vrn_vals=None,
            rnaseq_metrics=None):
        """Retrieve a high level summary of interesting metrics.
        """
        with open(align_metrics) as in_handle:
            align_vals = self._parse_align_metrics(in_handle)
        if dup_metrics:
            with open(dup_metrics) as in_handle:
                dup_vals = self._parse_dup_metrics(in_handle)
        else:
            dup_vals = {}
        (insert_vals, hybrid_vals, rnaseq_vals) = (None, None, None)
        if insert_metrics and file_exists(insert_metrics):
            with open(insert_metrics) as in_handle:
                insert_vals = self._parse_insert_metrics(in_handle)
        if hybrid_metrics and file_exists(hybrid_metrics):
            with open(hybrid_metrics) as in_handle:
                hybrid_vals = self._parse_hybrid_metrics(in_handle)
        if rnaseq_metrics and file_exists(rnaseq_metrics):
            with open(rnaseq_metrics) as in_handle:
                rnaseq_vals = self._parse_rnaseq_metrics(in_handle)

        return self._tabularize_metrics(align_vals, dup_vals, insert_vals,
                hybrid_vals, vrn_vals, rnaseq_vals)
Example #22
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                    tranches_file, filter_type)
Example #23
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    toval_data = _get_validate(data)
    if toval_data:
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            vrn_file = [os.path.abspath(x) for x in toval_data["vrn_file"]]
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(
            normalize_input_path(toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data
        )
        rm_genome = toval_data["config"]["algorithm"].get("validate_genome_build")
        sample = toval_data["name"][-1].replace(" ", "_")
        caller = _get_caller(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        val_config_file = _create_validate_config_file(
            vrn_file, rm_file, rm_interval_file, rm_genome, base_dir, toval_data
        )
        work_dir = os.path.join(base_dir, "work")
        out = {
            "summary": os.path.join(work_dir, "validate-summary.csv"),
            "grading": os.path.join(work_dir, "validate-grading.yaml"),
            "discordant": os.path.join(work_dir, "%s-eval-ref-discordance-annotate.vcf" % sample),
        }
        if not utils.file_exists(out["discordant"]) or not utils.file_exists(out["grading"]):
            bcbio_variation_comparison(val_config_file, base_dir, toval_data)
        out["concordant"] = filter(
            os.path.exists,
            [os.path.join(work_dir, "%s-%s-concordance.vcf" % (sample, x)) for x in ["eval-ref", "ref-eval"]],
        )[0]
        data["validate"] = out
    return [[data]]
Example #24
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
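        # sambamba depth region computes coverage over the cleaned BED regions, skipping unmapped
        # reads and reporting the fraction covered at each -T depth threshold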
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example #25
def regions_coverage(chanjo_db, batch_name, out_dir):
    """
    create BED file of coverage of all regions from a Chanjo database
    """
    if not utils.file_exists(chanjo_db):
        return None
    out_file = os.path.join(out_dir, batch_name + "-all-regions.bed.gz")
    if utils.file_exists(out_file):
        return out_file
    conn = sqlite3.connect(chanjo_db)
    c = conn.cursor()
    q = c.execute("SELECT contig, start, end, strand, coverage, completeness, "
                  "sample_id "
                  "FROM interval_data "
                  "JOIN interval ON interval_data.parent_id=interval.id ")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file + ".tmp", "w") as out_handle:
            out_handle.write("\t".join(["#chr", "start", "end", "name",
                                       "coverage", "completeness"]) + "\n")
            for line in q:
                line = [str(x) for x in line]
                # chanjo reports coordinates as 1 based instead of 0 based
                start = str(int(line[1]) - 1)
                out_handle.write("\t".join([line[0], start, line[2], line[6],
                                            line[3], line[4], line[5]]) + "\n")
        bt = BedTool(tx_out_file + ".tmp").sort().bgzip()
        shutil.move(bt, tx_out_file)
    return out_file
Example #26
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = dd.get_srna_mint_lookup(data)
    trna_space = dd.get_srna_mint_space(data)
    trna_other = dd.get_srna_mint_other(data)
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
Example #27
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        if utils.file_exists(data["collapse"]):
            data['transcriptome_bam'] = _align(data["collapse"], dd.get_mirbase_hairpin(data), out_file, data)
            data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data), mirbase, data['config'])
        else:
            logger.debug("Trimmed collapsed file is empty for %s." % names)
    else:
        logger.debug("No annotation file from miRBase.")

    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,  op.join(dd.get_work_dir(data), "mirdeep2", "novel"), data['config'])

    if "trna" in tools:
        data['trna'] = _mint_trna_annotation(data)

    data = spikein.counts_spikein(data)
    return [[data]]
Example #28
def prep_gemini_db(fnames, call_info, samples):
    """Prepare a gemini database from VCF inputs prepared with snpEff.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    if not utils.file_exists(gemini_db) and use_gemini_quick:
        use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
        if use_gemini:
            with file_transaction(gemini_db) as tx_gemini_db:
                gemini = config_utils.get_program("gemini", data["config"])
                if "program_versions" in data["config"].get("resources", {}):
                    gemini_ver = programs.get_version("gemini", config=data["config"])
                else:
                    gemini_ver = None
                # Recent versions of gemini allow loading only passing variants
                load_opts = ""
                if not gemini_ver or LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                    load_opts += " --passonly"
                # For small test files, skip gene table loading which takes a long time
                if gemini_ver and LooseVersion(gemini_ver) > LooseVersion("0.6.4"):
                    if _is_small_vcf(gemini_vcf):
                        load_opts += " --skip-gene-tables"
                    if "/test_automated_output/" in gemini_vcf:
                        load_opts += " --test-mode"
                num_cores = data["config"]["algorithm"].get("num_cores", 1)
                cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
                cmd = cmd.format(**locals())
                do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    return [[(name, caller), {"db": gemini_db if utils.file_exists(gemini_db) else None,
                              "vcf": gemini_vcf if is_batch else None}]]
Example #29
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
      - Combine all calls found in all files
      - Filter files retaining those present with multiple levels of support.
      - Remove calls in high depth regions.
      - Remove calls with ends overlapping exclusion regions like low complexity regions.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = list(filter(lambda xs: xs[1] is not None and utils.file_exists(xs[1]),
                                 [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data)) for c in calls]))
    if len(input_beds) > 0:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data, bedprep_dir=bedprep_dir)})
    return calls
Example #30
def _create_combined_fasta(data, out_dir):
    """
    if there are genomes to be disambiguated, create a FASTA file of
    all of the transcripts for all genomes
    """
    items = disambiguate.split([data])
    fasta_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if file_exists(out_file):
            fasta_files.append(out_file)
        else:
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
            fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file

    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
Example #31
def run_freebayes(align_bam,
                  ref_file,
                  config,
                  dbsnp=None,
                  region=None,
                  out_file=None):
    """Detect small polymorphisms with FreeBayes.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        logger.info("Genotyping with FreeBayes: {region} {fname}".format(
            region=region, fname=os.path.basename(align_bam)))
        with file_transaction(out_file) as tx_out_file:
            cl = [
                config["program"].get("freebayes",
                                      "freebayes"), "-b", align_bam, "-v",
                tx_out_file, "-f", ref_file, "--left-align-indels"
            ]
            cl += _freebayes_options_from_config(config["algorithm"], out_file,
                                                 region)
            subprocess.check_call(cl)
    return out_file
Example #32
def _create_bed(call, sample, work_dir, calls, data):
    """Create a simplified BED file from caller specific input.
    """
    out_file = os.path.join(
        work_dir, "%s-ensemble-%s.bed" % (sample, call["variantcaller"]))
    if call.get("vrn_file") and not utils.file_uptodate(
            out_file, call["vrn_file"]):
        with file_transaction(data, out_file) as tx_out_file:
            convert_fn = CALLER_TO_BED.get(call["variantcaller"])
            if convert_fn:
                vrn_file = call["vrn_file"]
                if call["variantcaller"] in SUBSET_BY_ENSEMBLE:
                    ecalls = [
                        x for x in calls if x["variantcaller"] in
                        SUBSET_BY_ENSEMBLE[call["variantcaller"]]
                    ]
                    if len(ecalls) > 0:
                        vrn_file = _subset_by_ensemble(call["vrn_file"],
                                                       ecalls[0]["vrn_file"],
                                                       data)
                convert_fn(vrn_file, call["variantcaller"], tx_out_file)
    if utils.file_exists(out_file):
        return out_file
Example #33
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the original
    file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = _get_grabix_index(in_file)
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not gbi_file or _is_partial_index(gbi_file):
        if gbi_file:
            utils.remove_safe(gbi_file)
        else:
            gbi_file = in_file + ".gbi"
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file],
                   "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]
Example #34
def _bgzip_from_cram(cram_file, dirs, data):
    """Create bgzipped fastq files from an input CRAM file in regions of interest.

    Returns a list with a single file, for single end CRAM files, or two
    files for paired end input.
    """
    import pybedtools
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"],
                                data) in ["regional", "exome", "amplicon"] else
                   None)
    if region_file:
        regions = [
            "%s:%s-%s" % tuple(r[:3]) for r in pybedtools.BedTool(region_file)
        ]
    else:
        regions = [None]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_s, out_p1, out_p2 = [
        os.path.join(
            work_dir, "%s-%s.fq.gz" %
            (utils.splitext_plus(os.path.basename(cram_file))[0], fext))
        for fext in ["s1", "p1", "p2"]
    ]
    if (not utils.file_exists(out_s) and
        (not utils.file_exists(out_p1) or not utils.file_exists(out_p2))):
        cram.index(cram_file, data["config"])
        fastqs, part_dir = _cram_to_fastq_regions(regions, cram_file, dirs,
                                                  data)
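        # A single fastq per region chunk indicates single-end input; otherwise merge first and
        # second reads separately into the p1/p2 outputs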
        if len(fastqs[0]) == 1:
            with file_transaction(data, out_s) as tx_out_file:
                _merge_and_bgzip([xs[0] for xs in fastqs], tx_out_file, out_s)
        else:
            for i, out_file in enumerate([out_p1, out_p2]):
                if not utils.file_exists(out_file):
                    ext = "/%s" % (i + 1)
                    with file_transaction(data, out_file) as tx_out_file:
                        _merge_and_bgzip([xs[i] for xs in fastqs], tx_out_file,
                                         out_file, ext)
        shutil.rmtree(part_dir)
    if utils.file_exists(out_p1):
        return [out_p1, out_p2]
    else:
        assert utils.file_exists(out_s)
        return [out_s]
Example #35
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          extra_args=None, names=None):
    """Alignment with bowtie2.
    """
    out_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cl = [config_utils.get_program("bowtie2", config)]
            cl += _bowtie2_args_from_config(config)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "--sensitive",
                   "-X", 2000, # default is too selective for most data
                   "-x", ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += ["-U", fastq_file]
            cl += ["-S", tx_out_file]
            cl = [str(i) for i in cl]
            do.run(cl, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file),
                   None)
    return out_file
Example #36
def rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, ref_file, data):
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_idx = rapmap_index(gtf_file, ref_file, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    rapmap = config_utils.get_program("rapmap", data["config"])
    cmd = ("{rapmap} pseudomap -i {rapmap_idx} -t {num_cores} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    message = "pseudomapping transcripts in {fq1} and {fq2}."
    with file_transaction(data, out_file) as tx_out_file:
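        # Stream rapmap SAM output into a sorted BAM written to the transactional output file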
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
Example #37
def prepare_mask_gtf(gtf):
    """
    make a mask file of usually-masked RNA biotypes
    """

    mask_biotype = [
        "rRNA", "Mt_rRNA", "misc_RNA", "snRNA", "snoRNA", "tRNA", "Mt_tRNA"
    ]
    mask_chrom = ["MT"]
    out_file = os.path.join(os.path.dirname(gtf), "ref-transcripts-mask.gtf")
    if file_exists(out_file):
        return out_file
    biotype_lookup = _biotype_lookup_fn(gtf)
    # if we can't find a biotype column, skip this
    if not biotype_lookup:
        return None
    db = _get_gtf_db(gtf)
    with open(out_file, "w") as out_handle:
        for g in db.all_features():
            biotype = biotype_lookup(g)
            if (biotype in mask_biotype) or (g.chrom in mask_chrom):
                out_handle.write(str(g) + "\n")
    return out_file
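_get_gtf_db and _biotype_lookup_fn are not shown here; as a rough, gffutils-free sketch of the same masking idea, one could scan the GTF text directly for a "*_biotype" attribute (attribute naming assumed; the function below is hypothetical):

import re

def mask_gtf_sketch(gtf, out_file,
                    mask_biotype=("rRNA", "Mt_rRNA", "misc_RNA", "snRNA",
                                  "snoRNA", "tRNA", "Mt_tRNA"),
                    mask_chrom=("MT",)):
    # Keep any GTF line whose chromosome or biotype attribute is in the mask.
    biotype_re = re.compile(r'\w*biotype "([^"]+)"')
    with open(gtf) as in_handle, open(out_file, "w") as out_handle:
        for line in in_handle:
            if line.startswith("#"):
                continue
            chrom = line.split("\t", 1)[0]
            match = biotype_re.search(line)
            biotype = match.group(1) if match else None
            if biotype in mask_biotype or chrom in mask_chrom:
                out_handle.write(line)
    return out_file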
Example #38
0
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file
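_libtype_string is not included in this excerpt. A plausible sketch, assuming it maps bcbio's strandedness setting onto Salmon's library-type codes (IU/ISR/ISF for paired data, U/SR/SF for single-end):

def libtype_string_sketch(is_paired, strandedness):
    # Hypothetical reconstruction: build the "-l" argument for salmon quant.
    strand_map = {"unstranded": "U", "firststrand": "SR", "secondstrand": "SF"}
    prefix = "I" if is_paired else ""
    return "-l " + prefix + strand_map.get(strandedness, "U")

# e.g. libtype_string_sketch(True, "firststrand") -> "-l ISR"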
Example #39
0
def run(bam_file, data, out_dir):
    out = {}
    if not tz.get_in(["config", "algorithm", "preseq"], data):
        return out

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        preseq = config_utils.get_program("preseq", data["config"])
        params = _get_preseq_params(data, int(samtools_stats["Total_reads"]))
        param_line = "-step {step} -extrap {extrap} -seg_len {seg_len}".format(
            **params)
        with file_transaction(data, stats_file) as tx_out_file:
            cmd = "{preseq} lc_extrap -bam -pe {bam_file} -o {tx_out_file} {param_line}".format(
                **locals())
            do.run(cmd, "preseq lc_extrap", data)

    out = _prep_real_counts(bam_file, data, samtools_stats)

    return {"base": stats_file, "metrics": out}
Example #40
0
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Perform genotype based filtering using GATK with the provided expression.

    Adds FT tags to genotypes, rather than the general FILTER flag.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "VariantFiltration",
                      "-R", tz.get_in(["reference", "fasta", "base"], data),
                      "--variant", vcf_file,
                      "--out", tx_out_file,
                      "--genotypeFilterName", name,
                      "--genotypeFilterExpression", "'%s'" % expression]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            cmd = [config_utils.get_program("gatk-framework", data["config"])
                   ] + jvm_opts + params
            do.run(cmd, "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
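An illustrative call, with a placeholder input and cutoff; the expression uses GATK's JEXL syntax and data is the usual bcbio sample dictionary:

# Flag genotypes with low genotype quality as "lowGQ" in the FT field.
filtered_vcf = genotype_filter("batch1-joint.vcf.gz", "GQ < 20.0",
                               data, "lowGQ", filterext="-lowgq")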
Example #41
0
def bam2bigwig(in_file, config, out_prefix=None):
    """
    assumes the library preparation was not strand specific for now
    """
    PROGRAM = "bam2wig.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "bigwig"
    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)
    wiggle_file = out_prefix + ".wig"

    if not file_exists(wiggle_file):
        bam2wig = sh.Command(which(PROGRAM))
        bam2wig(i=in_file, s=chrom_size_file, o=out_prefix)

    bigwig_file = out_prefix + ".bw"

    return wig2bigwig(wiggle_file, chrom_size_file, bigwig_file)
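wig2bigwig is not shown here; a minimal sketch of that last step, assuming UCSC's wigToBigWig utility is available on the PATH:

import subprocess

def wig2bigwig_sketch(wiggle_file, chrom_size_file, bigwig_file):
    # wigToBigWig in.wig chrom.sizes out.bw
    subprocess.check_call(["wigToBigWig", wiggle_file, chrom_size_file,
                           bigwig_file])
    return bigwig_file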
Example #42
0
def square_batch_region(data, region, bam_files, vrn_files, out_file):
    """Perform squaring of a batch in a supplied region, with input BAMs
    """
    from bcbio.variation import sentieon
    if not utils.file_exists(out_file):
        jointcaller = tz.get_in(("config", "algorithm", "jointcaller"), data)
        if jointcaller in ["%s-joint" % x for x in SUPPORTED["general"]]:
            _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file, "square")
        elif jointcaller in ["%s-merge" % x for x in SUPPORTED["general"]]:
            _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file, "merge")
        elif jointcaller in ["%s-joint" % x for x in SUPPORTED["gatk"]]:
            gatkjoint.run_region(data, region, vrn_files, out_file)
        elif jointcaller in ["%s-joint" % x for x in SUPPORTED["gvcf"]]:
            merge_gvcfs(data, region, vrn_files, out_file)
        elif jointcaller in ["%s-joint" % x for x in SUPPORTED["sentieon"]]:
            sentieon.run_gvcftyper(vrn_files, out_file, region, data)
        else:
            raise ValueError("Unexpected joint calling approach: %s." % jointcaller)
    if region:
        data["region"] = region
    data = _fix_orig_vcf_refs(data)
    data["vrn_file"] = out_file
    return [data]
Example #43
0
def _save_uploaded_data_json(samples, data_json_work, out_dir):
    """ Fixes all absolute work-rooted paths to relative final-rooted paths
    """
    if not utils.file_exists(data_json_work):
        return None

    upload_path_mapping = dict()
    for sample in samples:
        upload_path_mapping.update(get_all_upload_paths_from_sample(sample))
    if not upload_path_mapping:
        return data_json_work

    with io.open(data_json_work, encoding="utf-8") as f:
        data = json.load(f, object_pairs_hook=OrderedDict)
    upload_base = samples[0]["upload"]["dir"]
    data = walk_json(
        data, lambda s: _work_path_to_rel_final_path(s, upload_path_mapping,
                                                     upload_base))

    data_json_final = os.path.join(out_dir, "multiqc_data_final.json")
    with io.open(data_json_final, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    return data_json_final
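walk_json applies a function to every string in the loaded JSON; a hypothetical Python 3 equivalent, for readers without the bcbio utilities at hand:

def walk_json_sketch(obj, fn):
    # Recursively apply fn to every string value in nested dicts/lists,
    # preserving the container types (including OrderedDict).
    if isinstance(obj, dict):
        return type(obj)((k, walk_json_sketch(v, fn)) for k, v in obj.items())
    if isinstance(obj, list):
        return [walk_json_sketch(v, fn) for v in obj]
    if isinstance(obj, str):
        return fn(obj)
    return obj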
Example #44
0
def run_align(*data):
    """
    Prepare data to run alignment step, only once for each project
    """
    work_dir = dd.get_work_dir(data[0][0])
    out_dir = op.join(work_dir, "seqcluster", "prepare")
    seq_out = op.join(out_dir, "seqs.fastq")
    bam_dir = op.join(work_dir, "align")
    new_bam_file = op.join(bam_dir, "seqs.bam")
    tools = dd.get_expression_caller(data[0][0])
    if not file_exists(new_bam_file):
        sample = process_alignment(data[0][0], [seq_out, None])
        bam_file = dd.get_work_bam(sample[0][0])
        shutil.move(bam_file, new_bam_file)
        shutil.move(bam_file + ".bai", new_bam_file + ".bai")
        shutil.rmtree(op.join(bam_dir, sample[0][0]["rgnames"]['sample']))
    for sample in data:
        sample[0]["align_bam"] = sample[0]["clean_fastq"]
        sample[0]["work_bam"] = new_bam_file

    if "mirdeep2" in tools:
        novel_db = mirdeep.run(data)
    return data
Example #45
0
def gatk_indel_realignment(runner,
                           align_bam,
                           ref_file,
                           intervals,
                           region=None,
                           out_file=None,
                           deep_coverage=False,
                           config=None):
    """Perform realignment of BAM file in specified regions
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir({"config": config}) as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" %
                            (os.path.basename(align_bam), region))
                cl = gatk_indel_realignment_cl(runner, align_bam, ref_file,
                                               intervals, tmp_dir, region,
                                               deep_coverage)
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK indel realignment", {})
    return out_file
Example #46
0
def clipping_profile(in_file, config, out_prefix=None):
    """
    estimate the clipping profile of the reads
    """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, "clipping")
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    print(clip_plot_file)
    if file_exists(clip_plot_file):
        return clip_plot_file

    clip_run = sh.Command(which(PROGRAM))
    clip_run(i=in_file, o=out_prefix)
    # hack to get around the fact that clipping_profile saves the file in
    # the script execution directory
    #sh.mv("clipping_profile.pdf", clip_plot_file)

    return clip_plot_file
Example #47
0
def kallisto_table(kallisto_dir, index):
    """
    convert kallisto output to a count table where the rows are
    equivalence classes and the columns are cells
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    out_file = os.path.join(quant_dir, "matrix.csv")
    if file_exists(out_file):
        return out_file
    tsvfile = os.path.join(quant_dir, "matrix.tsv")
    ecfile = os.path.join(quant_dir, "matrix.ec")
    cellsfile = os.path.join(quant_dir, "matrix.cells")
    fastafile = os.path.splitext(index)[0] + ".fa"
    fasta_names = fasta.sequence_names(fastafile)
    ec_names = get_ec_names(ecfile, fasta_names)
    df = pd.read_table(tsvfile, header=None, names=["ec", "cell", "count"])
    df["ec"] = [ec_names[x] for x in df["ec"]]
    df = df.pivot(index='ec', columns='cell', values='count')
    cellnames = get_cell_names(cellsfile)
    colnames = [cellnames[x] for x in df.columns]
    df.columns = colnames
    df.to_csv(out_file)
    return out_file
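The reshaping step above is a standard pandas pivot; a toy, self-contained illustration of the same long-to-wide conversion:

import pandas as pd

long_df = pd.DataFrame({"ec": ["ec1", "ec1", "ec2"],
                        "cell": ["A", "B", "A"],
                        "count": [5, 3, 7]})
# Rows become equivalence classes, columns become cells, values the counts.
matrix = long_df.pivot(index="ec", columns="cell", values="count")
print(matrix)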
Example #48
0
def by_regions(items):
    """Plot for a union set of combined ensemble regions across all of the data
       items.
    """
    work_dir = os.path.join(dd.get_work_dir(items[0]), "structural",
                            "coverage")
    safe_makedir(work_dir)
    out_file = os.path.join(work_dir,
                            "%s-coverage.pdf" % (dd.get_sample_name(items[0])))
    if file_exists(out_file):
        items = _add_regional_coverage_plot(items, out_file)
    else:
        bed_files = _get_ensemble_bed_files(items)
        merged = bed.merge(bed_files)
        breakpoints = breakpoints_by_caller(bed_files)
        if merged:
            priority_merged = _prioritize_plot_regions(merged, items[0])
            out_file = plot_multiple_regions_coverage(items, out_file,
                                                      items[0],
                                                      priority_merged,
                                                      breakpoints)
            items = _add_regional_coverage_plot(items, out_file)
    return items
Example #49
0
def _freebayes_custom(in_file, ref_file, data):
    """Custom FreeBayes filtering using bcbio.variation, tuned to human NA12878 results.

    Experimental: for testing new methods.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    bv_ver = programs.get_version("bcbio_variation", config=config)
    if LooseVersion(bv_ver) < LooseVersion("0.1.1"):
        return None
    out_file = "%s-filter%s" % os.path.splitext(in_file)
    if not utils.file_exists(out_file):
        tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
        bv_jar = config_utils.get_jar("bcbio.variation",
                                      config_utils.get_program("bcbio_variation", config, "dir"))
        resources = config_utils.get_resources("bcbio_variation", config)
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
        java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
        cmd = ["java"] + jvm_opts + java_args + ["-jar", bv_jar, "variant-filter", "freebayes",
                                                 in_file, ref_file]
        do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
Example #50
0
def pizzly(pizzly_path, gtf, gtf_fa, fraglength, cachefile, pizzlydir, fusions,
           samplename, data):
    outdir = os.path.join(pizzlydir, samplename)
    out_stem = os.path.join(outdir, samplename)
    pizzly_gtf = make_pizzly_gtf(gtf, os.path.join(pizzlydir, "pizzly.gtf"),
                                 data)
    sentinel = out_stem + "-flat-filtered.tsv"
    pizzlycalls = out_stem + ".json"
    if not file_exists(pizzlycalls):
        with file_transaction(data, outdir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            tx_out_stem = os.path.join(tx_out_dir, samplename)
            cmd = (
                "{pizzly_path} -k 31 --gtf {pizzly_gtf} --cache {cachefile} "
                "--align-score 2 --insert-size {fraglength} --fasta {gtf_fa} "
                "--output {tx_out_stem} {fusions}")
            message = ("Running pizzly on %s." % fusions)
            do.run(cmd.format(**locals()), message)
    flatfile = out_stem + "-flat.tsv"
    filteredfile = out_stem + "-flat-filtered.tsv"
    flatten_pizzly(pizzlycalls, flatfile, data)
    filter_pizzly(flatfile, filteredfile, data)
    return outdir
Example #51
0
def _snpeff_args_from_config(data):
    """Retrieve snpEff arguments supplied through input configuration.
    """
    config = data["config"]
    args = ["-hgvs"]
    # General supplied arguments
    resources = config_utils.get_resources("snpeff", config)
    if resources.get("options"):
        args += [str(x) for x in resources.get("options", [])]
    # cancer specific calling arguments
    if vcfutils.get_paired_phenotype(data):
        args += ["-cancer"]

    effects_transcripts = dd.get_effects_transcripts(data)
    if effects_transcripts in set(["canonical_cancer"]):
        _, snpeff_base_dir = get_db(data)
        canon_list_file = os.path.join(snpeff_base_dir, "transcripts", "%s.txt" % effects_transcripts)
        if not utils.file_exists(canon_list_file):
            raise ValueError("Cannot find expected file for effects_transcripts: %s" % canon_list_file)
        args += ["-canonList", canon_list_file]
    elif effects_transcripts == "canonical" or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
        args += ["-canon"]
    return args
Example #52
0
@contextlib.contextmanager
def _work_handles(in_files, dirs, ext):
    """Create working handles for input files and close on completion.
    """
    out_dir = safe_makedir(os.path.join(dirs["work"], "trim"))
    out_handles = {}
    in_handles = {}
    name_map = {}
    for in_file in in_files:
        base = os.path.splitext(os.path.basename(in_file))[0]
        out_file = os.path.join(out_dir, "{base}{ext}".format(base=base, ext=ext))
        name_map[in_file] = out_file
        if not file_exists(out_file):
            in_handles[in_file] = open(in_file)
            out_handles[in_file] = open(out_file, "w")
    try:
        yield in_handles, out_handles, name_map
    finally:
        for h in in_handles.values():
            h.close()
        for h in out_handles.values():
            h.close()
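Because the function yields inside a try/finally and is wrapped with contextlib.contextmanager, it is used as a context manager. An illustrative use, with placeholder file names (a sketch; real callers do the actual trimming between read and write):

dirs = {"work": "work"}
with _work_handles(["sample_1.fastq", "sample_2.fastq"], dirs,
                   ".trim.fastq") as (in_handles, out_handles, name_map):
    for in_file, in_handle in in_handles.items():
        for line in in_handle:
            out_handles[in_file].write(line)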
Example #53
0
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    if in_file:
        bedtools = config_utils.get_program("bedtools", data["config"])
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
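For intuition, a pure-Python sketch of what the bedtools merge step does to the example in the docstring (after sorting, overlapping or book-ended intervals collapse into one):

def merge_intervals_sketch(intervals):
    # intervals: iterable of (chrom, start, end) tuples.
    merged = []
    for chrom, start, end in sorted(intervals):
        if merged and merged[-1][0] == chrom and start <= merged[-1][2]:
            merged[-1][2] = max(merged[-1][2], end)
        else:
            merged.append([chrom, start, end])
    return merged

print(merge_intervals_sketch([("1", 1, 100), ("1", 90, 100)]))
# -> [['1', 1, 100]]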
Example #54
0
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(data, out_file) as tx_out_file:
        params = [
            "-T", "HaplotypeCaller", "-R", ref_file, "-I", split_bam, "-o",
            tx_out_file, "-nct",
            str(num_cores), "--emitRefConfidence", "GVCF",
            "--variant_index_type", "LINEAR", "--variant_index_parameter",
            "128000", "-dontUseSoftClippedBases"
        ]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #55
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    fastq_cmd = "-1 %s" % fastq_file if pair_file else "-U %s" % fastq_file
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 --no-discordant --no-mixed -x {gtf_index} {fastq_cmd} {pair_cmd} ")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file, name_sort=True)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Example #56
0
def _maybe_add_alignment(algorithm, sample, out):
    if _has_alignment_file(algorithm, sample):
        for (fname, ext, isplus) in [(sample.get("work_bam"), "ready", False),
                                     (dd.get_disc_bam(sample), "disc", True),
                                     (dd.get_sr_bam(sample), "sr", True)]:
            if fname and os.path.exists(fname):
                if fname.endswith("bam"):
                    ftype, fext = "bam", ".bai"
                elif fname.endswith("cram"):
                    ftype, fext = "cram", ".crai"
                else:
                    raise ValueError("Unexpected alignment file type %s" % fname)
                out.append({"path": fname,
                            "type": ftype,
                            "plus": isplus,
                            "ext": ext})
                if utils.file_exists(fname + fext):
                    out.append({"path": fname + fext,
                                "type": ftype + fext,
                                "plus": isplus,
                                "index": True,
                                "ext": ext})
    return out
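For a sample with a BAM and its index, the entries appended above look roughly like this (paths are placeholders):

# Illustrative output entries for a ready BAM plus its .bai index.
example_out = [
    {"path": "sample-ready.bam", "type": "bam", "plus": False, "ext": "ready"},
    {"path": "sample-ready.bam.bai", "type": "bam.bai", "plus": False,
     "index": True, "ext": "ready"},
]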
Example #57
0
def picard_mark_duplicates(picard, align_bam, remove_dups=False):
    base, ext = os.path.splitext(align_bam)
    base = base.replace(".", "-")
    dup_bam = "%s-dup%s" % (base, ext)
    dup_metrics = "%s-dup.dup_metrics" % base
    if not file_exists(dup_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, dup_bam,
                                  dup_metrics) as (tx_dup_bam, tx_dup_metrics):
                opts = [("INPUT", align_bam), ("OUTPUT", tx_dup_bam),
                        ("TMP_DIR", tmp_dir),
                        ("REMOVE_DUPLICATES",
                         "true" if remove_dups else "false"),
                        ("METRICS_FILE", tx_dup_metrics)]
                if picard.get_picard_version("MarkDuplicates") >= 1.82:
                    opts += [("PROGRAM_RECORD_ID", "null")]
                picard.run("MarkDuplicates",
                           opts,
                           memscale={
                               "direction": "decrease",
                               "magnitude": 2
                           })
    return dup_bam, dup_metrics
Example #58
0
def _get_vcf(x, key):
    """Retrieve VCF file with the given key if it exists, handling bgzipped.
    """
    out = []
    fname = utils.get_in(x, key)
    if fname:
        if fname.endswith(".gz"):
            out.append({"path": fname,
                        "type": "vcf.gz",
                        "ext": x["variantcaller"],
                        "variantcaller": x["variantcaller"]})
            if utils.file_exists(fname + ".tbi"):
                out.append({"path": fname + ".tbi",
                            "type": "vcf.gz.tbi",
                            "index": True,
                            "ext": x["variantcaller"],
                            "variantcaller": x["variantcaller"]})
        else:
            out.append({"path": fname,
                        "type": "vcf",
                        "ext": x["variantcaller"],
                        "variantcaller": x["variantcaller"]})
    return out
Example #59
0
def picard_fastq_to_bam(picard,
                        fastq_one,
                        fastq_two,
                        out_dir,
                        names,
                        order="queryname"):
    """Convert fastq file(s) to BAM, adding sample, run group and platform information.
    """
    out_bam = os.path.join(
        out_dir,
        "%s-fastq.bam" % os.path.splitext(os.path.basename(fastq_one))[0])
    if not file_exists(out_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, out_bam) as tx_out_bam:
                opts = [("FASTQ", fastq_one), ("READ_GROUP_NAME", names["rg"]),
                        ("SAMPLE_NAME", names["sample"]),
                        ("PLATFORM_UNIT", names["pu"]),
                        ("PLATFORM", names["pl"]), ("TMP_DIR", tmp_dir),
                        ("OUTPUT", tx_out_bam), ("SORT_ORDER", order)]
                if fastq_two:
                    opts.append(("FASTQ2", fastq_two))
                picard.run("FastqToSam", opts)
    return out_bam
Example #60
0
def _run_snpeff(snp_in, out_format, data):
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file