Example #1
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".vcf"
    bgzipped_file = out_file + ".gz"
    num_cores = dd.get_num_cores(data)
    if file_exists(bgzipped_file):
        data = dd.set_vrn_file(data, bgzipped_file)
        return data
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases"]
        broad_runner.run_gatk(params)
    bgzip_and_index(out_file, dd.get_config(data))
    data = dd.set_vrn_file(data, bgzipped_file)
    return data
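
Every example here follows the same idempotent-transaction pattern: skip the step if the output already exists, write into a transactional location, and move the result into place only on success. A minimal, self-contained sketch of that pattern (a hypothetical stand-in for bcbio's file_transaction, not its actual implementation):

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def simple_file_transaction(out_file):
    """Write to a temporary path; move into place only if the block succeeds."""
    tmp_dir = tempfile.mkdtemp()
    tx_out_file = os.path.join(tmp_dir, os.path.basename(out_file))
    try:
        yield tx_out_file
        if os.path.exists(tx_out_file):
            shutil.move(tx_out_file, out_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

def run_step(out_file):
    if os.path.exists(out_file):
        return out_file  # already done, safe to re-run the pipeline
    with simple_file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            out_handle.write("result\n")
    return out_file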
Example #2
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    shutil.move(fixed_count_file, count_file)

    return count_file
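
A hedged usage sketch for the counts produced above: _format_count_file presumably reduces the raw featureCounts table (a leading "#" comment line plus Geneid/Chr/Start/End/Strand/Length columns) to a two-column gene/count file; the fixed two-column, headerless layout is an assumption here.

import pandas as pd

def read_counts(count_file, fixed=True):
    if fixed:
        # assumed post-_format_count_file layout: "<gene_id>\t<count>", no header
        return pd.read_csv(count_file, sep="\t", header=None,
                           names=["gene_id", "count"], index_col="gene_id")
    # raw featureCounts layout: leading "#" comment line, then a header row
    df = pd.read_csv(count_file, sep="\t", comment="#")
    return df.set_index("Geneid").iloc[:, -1].rename("count").to_frame()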
Example #3
def gatk_filter_rnaseq(data, vrn_file, out_file):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "VariantFiltration",
                  "-R", ref_file,
                  "-V", vrn_file,
                  "--clusterWindowSize", "35",
                  "--clusterSize", "3",
                  "--filterExpression", "\"'FS > 30.0'\"",
                  "--filterName", "FS",
                  "--filterExpression", "\"'QD < 2.0'\"",
                  "--filterName", "QD",
                  "-o", tx_out_file]
        jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data), os.path.dirname(tx_out_file))
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
               "Filter variants.")
    return out_file
Example #4
def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed
    mitochondrial reads should be removed 
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(
        metrics_dir, f"{dd.get_sample_name(data)}-atac-metrics.csv")
    # complexity metrics only make sense for paired-end reads
    if not bam.is_paired(work_bam):
        return data
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'],
                           metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (
            f"{bedtools} bamtobed -bedpe -i {work_bam} | "
            "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
            "sort | "
            "uniq -c | "
            "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
            "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
            f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data
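
The awk pipeline above tallies mt (total read pairs), m0 (distinct alignment positions), m1 (positions seen exactly once) and m2 (positions seen exactly twice). A hedged follow-up sketch turning that CSV into the usual ENCODE complexity summaries (NRF, PBC1, PBC2):

import csv

def complexity_summary(metrics_file):
    with open(metrics_file) as in_handle:
        row = next(csv.DictReader(in_handle))
    mt, m0, m1, m2 = (float(row[key]) for key in ("mt", "m0", "m1", "m2"))
    return {"NRF": m0 / mt if mt else float("nan"),   # non-redundant fraction
            "PBC1": m1 / m0 if m0 else float("nan"),  # PCR bottlenecking coefficient 1
            "PBC2": m1 / m2 if m2 else float("nan")}  # PCR bottlenecking coefficient 2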
Example #5
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(
            fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
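
Note that the <(...) process substitutions in the command require a bash-like shell, which do.run is assumed to provide. Salmon's quant.sf output is a tab-separated table with Name, Length, EffectiveLength, TPM and NumReads columns; a hedged sketch of loading it:

import pandas as pd

def read_salmon_quant(quant_sf):
    df = pd.read_csv(quant_sf, sep="\t", index_col="Name")
    return df[["TPM", "NumReads"]]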
Example #6
def gatk_filter_rnaseq(data, vrn_file, out_file):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "VariantFiltration",
                  "-R", ref_file,
                  "-V", vrn_file,
                  "--clusterWindowSize", "35",
                  "--clusterSize", "3",
                  "--filterExpression", "\"'FS > 30.0'\"",
                  "--filterName", "FS",
                  "--filterExpression", "\"'QD < 2.0'\"",
                  "--filterName", "QD",
                  "-o", tx_out_file]
        jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data))
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
               "Filter variants.")
    return out_file
Example #7
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = "{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {filtered_bam}"

    message = "Count reads in {tx_count_file} mapping to {gtf_file} using " "featureCounts"
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
Example #8
def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    if dd.get_quality_format(data) == "illumina":
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data
Example #9
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
Example #10
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
        #sailfish.sleuthify_sailfish(tx_out_dir)
    return out_file
Example #11
def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        bam.index(out_file, dd.get_config(data))
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data,
                                    rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq2_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with Rapmap. "
                       % (algorithm, fq1, fq2, rapmap_index))
        do.run(cmd.format(**locals()), run_message, None)
    bam.index(out_file, dd.get_config(data))
    return out_file
Example #12
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(
            out_dir,
            "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")

    resources = config_utils.get_resources("featureCounts", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
Example #13
def gatk_joint_calling(data, vrn_files, ref_file):
    joint_file = os.path.join("variation", "joint.vcf")
    out_file = os.path.join("variation", "combined.vcf")
    bgzjoint_file = os.path.join("variation", "joint.vcf.gz")
    bgzout_file = os.path.join("variation", "combined.vcf.gz")
    if not file_exists(bgzout_file):
        joint_file = _run_genotype_gvcfs(data, vrn_files, ref_file, joint_file)
        bgzip_and_index(joint_file, dd.get_config(data))
        out_file = gatk_filter_rnaseq(data, bgzjoint_file, out_file)
        bgzip_and_index(out_file, dd.get_config(data))
    return bgzout_file
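
_run_genotype_gvcfs is referenced but not shown above; a hypothetical sketch of what it might do, joint-genotyping the per-sample gVCFs with GATK3's GenotypeGVCFs (flag names follow the GATK3 convention used elsewhere in these examples, and the bcbio import paths are assumptions):

# presumed bcbio imports, matching how the surrounding examples use these names
from bcbio import broad
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.utils import file_exists

def _run_genotype_gvcfs_sketch(data, vrn_files, ref_file, out_file):
    # hypothetical stand-in for the helper referenced above
    if file_exists(out_file):
        return out_file
    broad_runner = broad.runner_from_config(dd.get_config(data))
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "GenotypeGVCFs",
                  "-R", ref_file,
                  "-o", tx_out_file]
        for vrn_file in vrn_files:
            params += ["--variant", vrn_file]
        broad_runner.run_gatk(params)
    return out_file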
Example #14
def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        in_bam = tz.get_in(("atac", "align", "NF"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts",), count_file)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = (
        "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
        "{paired_flag} {sorted_bam}")

    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts",), count_file)
    return [[data]]
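
The .saf file is derived from the consensus peak file but its construction is not shown; a hedged sketch of the conversion, assuming a BED-like consensus file (featureCounts' SAF format is GeneID/Chr/Start/End/Strand with 1-based inclusive coordinates):

def bed_to_saf(bed_file, saf_file):
    # hypothetical helper: convert 0-based half-open BED peaks to SAF
    with open(bed_file) as in_handle, open(saf_file, "w") as out_handle:
        out_handle.write("GeneID\tChr\tStart\tEnd\tStrand\n")
        for i, line in enumerate(in_handle):
            chrom, start, end = line.strip().split("\t")[:3]
            out_handle.write(f"peak_{i}\t{chrom}\t{int(start) + 1}\t{end}\t.\n")
    return saf_file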
Example #15
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    safe_makedir(salmon_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #16
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #17
def collect_oxog_metrics(data):
    """ extracts 8-oxoguanine (OxoG) artifact metrics from CollectSequencingArtifacts
    output so we don't have to run CollectOxoGMetrics.
    """
    input_base = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data),
                              dd.get_sample_name(data))
    if not utils.file_exists(input_base + ".pre_adapter_detail_metrics"):
        return None
    OUT_SUFFIXES = [".oxog_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "oxog", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    ref_file = dd.get_ref_file(data)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("--INPUT_BASE", input_base),
                  ("--OUTPUT_BASE", out_base),
                  ("--REFERENCE_SEQUENCE", ref_file)]
        picard.run("ConvertSequencingArtifactToOxoG", params)
    return out_files
Example #18
def trim_adapters(data):
    to_trim = [x for x in data["files"] if x is not None]
    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(to_trim)))
    out_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    config = dd.get_config(data)
    return _trim_adapters(to_trim, out_dir, config)
Example #19
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    safe_makedir(salmon_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #20
def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #21
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)

    return count_file
Example #22
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    input_file = datadict.get_work_bam(data)
    input_file = bam.sort(input_file,
                          datadict.get_config(data),
                          order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup',
                              sample_name)
    output_dir = utils.safe_makedir(output_dir)

    input_file_name, input_file_extension = os.path.splitext(
        os.path.basename(input_file))
    output_file = os.path.join(
        output_dir, f'{input_file_name}.deduplicated{input_file_extension}')

    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        return [[data]]

    deduplicate_bismark = config_utils.get_program('deduplicate_bismark',
                                                   data['config'])
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, 'removing duplicate alignments')

    data = datadict.set_work_bam(data, output_file)
    return [[data]]
Example #23
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
Example #24
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
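
A worked example of the k-mer sizing above: the read-length estimate is forced odd (presumably to avoid reverse-complement palindromes) and capped at Salmon's maximum of 31:

def pick_salmon_kmersize(readlength):
    if readlength % 2 == 0:
        readlength -= 1
    return min(readlength, 31)

assert pick_salmon_kmersize(25) == 25   # short reads keep an odd k below the cap
assert pick_salmon_kmersize(36) == 31   # 36 -> 35, then capped at 31
assert pick_salmon_kmersize(151) == 31  # typical read lengths always hit the cap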
Example #25
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data and we set the "
          "estimated fragment length to 200 and the standard "
          "deviation to 25, if these don't reflect your data then "
          "the results may be inaccurate. Use with caution. See "
          "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
          "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
Example #26
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    use picard wrapper rather than gatk - works for gatk4 and gatk3 projects
    refactor - move to broad/picardrun
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("-REFERENCE_SEQUENCE", ref_file),
                  ("-INPUT", bam_file),
                  ("-OUTPUT", out_base)]
        # picard runner sets VALIDATION_STRINGENCY
        picard.run("CollectSequencingArtifactMetrics", params)
    return out_files
Example #27
def starts_by_depth(bam_file, data, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.
    """
    config = dd.get_config(data)
    binsize = (sample_size // 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    downsampled = bam.downsample(bam_file, data, sample_size)
    with bam.open_samfile(downsampled) as samfile:
        for read in samfile:
            if read.is_unmapped:
                continue
            counted += 1
            buffer.append(str(read.tid) + ":" + str(read.pos))
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
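
A hedged usage sketch: the returned DataFrame is a saturation curve, so plotting reads against unique start sites shows whether library complexity is still accumulating at the sampled depth:

import matplotlib.pyplot as plt

def plot_saturation(df, out_png):
    fig, ax = plt.subplots()
    ax.plot(df["reads"], df["starts"])
    ax.set_xlabel("reads sampled")
    ax.set_ylabel("unique start sites")
    fig.savefig(out_png)
    return out_png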
Example #28
def trim_adapters(data):
    to_trim = [x for x in data["files"] if x is not None]
    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(to_trim)))
    out_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    config = dd.get_config(data)
    return _trim_adapters(to_trim, out_dir, config)
Example #29
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    """
    OUT_SUFFIXES = [
        ".bait_bias_detail_metrics", ".error_summary_metrics",
        ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"
    ]
    broad_runner = broad.runner_from_config(dd.get_config(data))
    gatk_type = broad_runner.gatk_type()
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                           dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [
            "-T", "CollectSequencingArtifactMetrics", "-R", ref_file, "-I",
            bam_file, "-O", out_base
        ]
        broad_runner.run_gatk(params, log_error=False, parallel_gc=True)
    return out_files
Example #30
def collect_oxog_metrics(data):
    """
    extracts 8-oxoguanine (OxoG) artifact metrics from CollectSequencingArtifacts
    output so we don't have to run CollectOxoGMetrics.
    """
    input_base = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                              dd.get_sample_name(data),
                              dd.get_sample_name(data))
    if not utils.file_exists(input_base + ".pre_adapter_detail_metrics"):
        return None
    OUT_SUFFIXES = [".oxog_metrics"]
    broad_runner = broad.runner_from_config(dd.get_config(data))
    gatk_type = broad_runner.gatk_type()
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "oxog",
                           dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    ref_file = dd.get_ref_file(data)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [
            "-T", "ConvertSequencingArtifactToOxoG", "--INPUT_BASE",
            input_base, "-O", out_base, "-R", ref_file
        ]
        broad_runner.run_gatk(params, log_error=False, parallel_gc=True)
        # multiqc <= 1.9 looks for INPUT not INPUT_BASE for these files
        # see (https://github.com/ewels/MultiQC/pull/1310)
        cmd = f"sed 's/INPUT_BASE/INPUT/g' {out_base}.oxog_metrics -i"
        do.run(cmd, f"Fixing {out_base}.oxog_metrics to work with MultiQC.")
    return out_files
Example #31
def salmon_decoy_index(gtf_file, data, out_dir):
    input_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    decoy_transcriptome = os.path.join(
        input_dir,
        sailfish.get_build_string(data) + "-decoy.fa")
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    decoy_sequence_file = get_decoy_sequence_file(data)
    decoy_name_file = get_decoy_name_file(data)
    gtf_fa = create_decoy_transcriptome(gtf_fa, get_decoy_sequence_file(data),
                                        decoy_transcriptome)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." %
                    gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = (
            "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa} "
            "--decoys {decoy_name_file} ")
        message = "Creating decoy-aware Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #32
def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} " "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(
            fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #33
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename,
                                    "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. " % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
Example #34
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." %
                    gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #35
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data,
                           os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(
        data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning(
            "kallisto was run on single-end data and we set the "
            "estimated fragment length to 200 and the standard "
            "deviation to 25, if these don't reflect your data then "
            "the results may be inaccurate. Use with caution. See "
            "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
            "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
Example #36
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = HS.get(
        genome_build, bam.fasta.total_sequence_length(dd.get_ref_file(data)))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error.\n"
                "Please, check the message and report "
                "error if it is related to bcbio.\n"
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
Example #37
def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
    on the estimated insert sizes
    uses the current working BAM file if no BAM file is supplied
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd +\
                    f'-F "template_length > {arange.min} and template_length < {arange.max}" ' +\
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
            bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data
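
ATACRanges is referenced but not defined above. A plausible shape, with the insert-size boundaries given here as assumptions rather than the pipeline's exact values, is one label/min/max record per nucleosome fraction:

from collections import namedtuple

ATACRange = namedtuple("ATACRange", ["label", "min", "max"])
ATACRanges = {"NF": ATACRange("NF", -125, 125),   # nucleosome-free
              "MN": ATACRange("MN", 150, 300),    # mono-nucleosome
              "DN": ATACRange("DN", 300, 500),    # di-nucleosome
              "TN": ATACRange("TN", 500, 1000)}   # tri-nucleosome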
Example #38
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
Example #39
def create_ataqv_report(samples):
    """
    make the ataqv report from a set of ATAC-seq samples
    """
    data = samples[0][0]
    new_samples = []
    reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv")
    sentinel = os.path.join(reportdir, "index.html")
    if utils.file_exists(sentinel):
        ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
        new_data = []
        for data in dd.sample_data_iterator(samples):
            data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
            new_data.append(data)
        return dd.get_samples_from_datalist(new_data)
    mkarv = config_utils.get_program("mkarv", dd.get_config(data))
    ataqv_files = []
    for data in dd.sample_data_iterator(samples):
        qc = dd.get_summary_qc(data)
        ataqv_file = tz.get_in(("ataqv", "base"), qc, None)
        if ataqv_file and utils.file_exists(ataqv_file):
            ataqv_files.append(ataqv_file)
    if not ataqv_files:
        return samples
    ataqv_json_file_string = " ".join(ataqv_files)
    with file_transaction(reportdir) as txreportdir:
        cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}"
        message = f"Creating ataqv report from {ataqv_json_file_string}."
        do.run(cmd, message)
    new_data = []
    ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
        new_data.append(data)
    return dd.get_samples_from_datalist(new_data)
Example #40
def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file
Example #41
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #42
def filter_multimappers(align_file, data):
    """
    bowtie2 does not seem to have an equivalent of bowtie's -m 1 flag;
    there are options that come close, but none do quite the same thing.
    bowtie2 sets the XS tag for reads mapping in more than one place, so we
    can simply filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
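To make the sambamba expression concrete, this is what the filter template renders to for the two read layouts (illustrative only, runnable standalone):

# The same template the function formats; [XS] refers to the XS tag
# bowtie2 sets on reads with more than one plausible alignment.
base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
print(base_filter.format(paired_filter="and paired and proper_pair"))
# -F "[XS] == null and not unmapped and paired and proper_pair and not duplicate"
print(base_filter.format(paired_filter=""))
# -F "[XS] == null and not unmapped  and not duplicate"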
Example #43
def starts_by_depth(bam_file, data, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.
    """
    config = dd.get_config(data)
    binsize = (sample_size // 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    downsampled = bam.downsample(bam_file, data, sample_size)
    with bam.open_samfile(downsampled) as samfile:
        for read in samfile:
            if read.is_unmapped:
                continue
            counted += 1
            buffer.append(str(read.tid) + ":" + str(read.pos))
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
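The returned frame plots directly as a saturation curve, which is the usual way to read this output. A sketch assuming matplotlib is available and bam_file/data come from the surrounding pipeline:

import matplotlib.pyplot as plt

# A curve that flattens well before the right edge means new reads mostly
# hit already-seen start sites, i.e. the library is close to saturation.
df = starts_by_depth(bam_file, data, sample_size=1000000)
plt.plot(df["reads"], df["starts"])
plt.xlabel("reads sampled")
plt.ylabel("unique start sites")
plt.savefig("saturation.png")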
Example #44
def gatk_filter_rnaseq(vrn_file, data):
    """
    This incorporates the filters listed here, dropping clusters of variants
    within a 35-nucleotide window, high Fisher strand values and low
    quality by depth:
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "VariantFiltration", "-R", ref_file, "-V", vrn_file,
                "--cluster-window-size", "35", "--cluster-size", "3",
                "--filter-expression", "'FS > 30.0'", "--filter-name", "FS",
                "--filter-expression", "'QD < 2.0'", "--filter-name", "QD",
                "--output", tx_out_file
            ]
            jvm_opts = broad.get_gatk_opts(dd.get_config(data),
                                           os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params),
                   "Filter RNA-seq variants.")
    return out_file
Example #45
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. "
                   % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
Example #46
def filter_multimappers(align_file, data):
    """
    bowtie2 does not seem to have an equivalent of bowtie's -m 1 flag;
    there are options that come close, but none do quite the same thing.
    bowtie2 sets the XS tag for reads mapping in more than one place, so we
    can simply filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    if file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}"'
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    return out_file
Example #47
def filter_multimappers(align_file, data):
    """
    Filter a BWA alignment file down to uniquely mapped reads, following:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
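Unlike the bowtie2 version, this filter keys on BWA's XA (alternative hits) and SA (supplementary alignment) tags. A post-hoc spot check that the output really contains only unique, primary alignments, assuming pysam and a placeholder output path:

import pysam

# Any assertion failure here points at a problem with the filter string.
with pysam.AlignmentFile("input.unique.bam", "rb") as bamfile:
    for read in bamfile:
        assert not read.is_unmapped and not read.is_supplementary
        assert not read.has_tag("XA") and not read.has_tag("SA")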
Example #48
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases",
                  "-stand_call_conf", "20.0",
                  "-stand_emit_conf", "20.0"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
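Spelled out, the params list corresponds to a legacy GATK3 invocation along these lines (a sketch only; the jar path, memory flags, core count and file names are placeholders for whatever broad_runner resolves locally):

# Illustrative stand-alone equivalent of the run_gatk call above.
cmd = ("java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R ref.fasta "
       "-I split.bam -o out.gvcf -nct 8 --emitRefConfidence GVCF "
       "--variant_index_type LINEAR --variant_index_parameter 128000 "
       "-dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0")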
Example #49
def _gtf_to_fasta(gtf_file, ref_file, out_file, data=None):
    gtf_to_fasta = config_utils.get_program("gtf_to_fasta",
                                            dd.get_config(data))
    with file_transaction(data, out_file) as tx_gtf_fa:
        cmd = "{gtf_to_fasta} {gtf_file} {ref_file} {tx_gtf_fa}"
        message = "Extracting genomic sequences of {gtf_file}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
Example #50
def includes_missingalt(data):
    """
    As of GATK 4.1.0.0, variants with missing alts are generated
    (see https://github.com/broadinstitute/gatk/issues/5650)
    """
    MISSINGALT_VERSION = LooseVersion("4.1.0.0")
    version = LooseVersion(broad.get_gatk_version(config=dd.get_config(data)))
    return version >= MISSINGALT_VERSION
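The gate works because LooseVersion compares version components numerically rather than lexically. A quick illustration (note distutils is deprecated on newer Pythons, where packaging.version.Version plays the same role):

from distutils.version import LooseVersion

# Component-wise comparison handles multi-digit parts correctly:
assert LooseVersion("4.10.0.0") > LooseVersion("4.9.0.0")
assert "4.10.0.0" < "4.9.0.0"  # plain string comparison gets this backwards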
Example #51
def get_star_version(data):
    star_path = config_utils.get_program("STAR", dd.get_config(data))
    cmd = "%s --version" % star_path
    subp = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            shell=True, universal_newlines=True)
    version = None
    with contextlib.closing(subp.stdout) as stdout:
        for line in stdout:
            if "STAR_" in line:
                version = line.split("STAR_")[1].strip()
    return version
Example #52
def trim_adapters(data):
    to_trim = [x for x in data["files"] if x is not None and is_fastq(x)]
    if not to_trim:
        return data["files"]

    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(to_trim)))
    out_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "trimmed"))
    config = dd.get_config(data)
    name = dd.get_sample_name(data)
    return _trim_adapters(to_trim, out_dir, name, config)
Example #53
def _index_spikein(fasta, out_dir, data, kmer=31):
    out_dir = safe_makedir(os.path.join(out_dir, "index"))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmer} -p {num_cores} -i {tx_out_dir} -t {fasta}"
        message = "Creating Salmon index for {fasta}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #54
def rapmap_pseudoindex(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "pseudoindex", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap pseudoindex for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #55
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    ### TODO: put memoization here (a sketch follows this example)
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
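The TODO above is answered by the guard used in _index_spikein (Example #53) and inlined in the later salmon_index variant (Example #58): salmon writes versionInfo.json as the last step of indexing, so its presence marks a finished, reusable index. A minimal sketch of that memoization check:

def _salmon_index_exists(out_dir):
    # versionInfo.json is only present once indexing completes, so it
    # doubles as a completion marker for reusing the index on reruns.
    return file_exists(os.path.join(out_dir, "versionInfo.json"))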
Example #56
def run_pizzly(data):
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    samplename = dd.get_sample_name(data)
    gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
Example #57
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap {index_type} for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #58
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir