Example #1
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    report_file = os.path.join(out_dir, "qualimapReport.html")
    raw_file = os.path.join(out_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file, single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average insert size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
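The helper _rnaseq_qualimap_cmd is not included in the snippet. For orientation, here is a minimal sketch of what it might assemble, assuming Qualimap's documented rnaseq flags (-p for the library protocol, -pe for paired-end reads); this is an illustration, not bcbio's exact implementation:

def _rnaseq_qualimap_cmd_sketch(bam_file, out_dir, gtf_file, single_end, library):
    # Hypothetical reconstruction: Qualimap's `rnaseq` mode takes the library
    # protocol via -p (e.g. "strand-specific-reverse") and -pe for paired-end data.
    paired = "" if single_end else "-pe"
    return ("qualimap rnaseq -outdir {out_dir} -a proportional -bam {bam_file} "
            "-gtf {gtf_file} -p {library} {paired}").format(**locals())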
Example #2
def _detect_rRNA(data, out_dir):
    out_file = os.path.join(out_dir, "rRNA_metrics.txt")
    if not utils.file_exists(out_file):
        gtf_file = dd.get_gtf_file(data)
        quant = tz.get_in(["quant", "tsv"], data)
        if not quant:
            salmon_dir = dd.get_salmon_dir(data)
            if salmon_dir:
                quant = os.path.join(salmon_dir, "quant", "quant.sf")
        logger.info("Calculating RNA-seq rRNA metrics for %s." % quant)
        rrna_features = gtf.get_rRNA(gtf_file)
        transcripts = set([x[1] for x in rrna_features if x])
        if not (transcripts and quant and utils.file_exists(quant)):
            return {'rRNA': "NA", "rRNA_rate": "NA"}
        sample_table = pd.read_csv(quant, sep="\t")
        rrna_exp = list(map(float, sample_table[sample_table["Name"].isin(transcripts)]["NumReads"]))
        total_exp = list(map(float, sample_table["NumReads"]))
        rrna = sum(rrna_exp)
        if sum(total_exp) == 0:
            rrna_rate = "NA"
        else:
            rrna_rate = float(rrna) / sum(total_exp)
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(",".join(["rRNA", str(rrna)]) + "\n")
                out_handle.write(",".join(["rRNA_rate", str(rrna_rate)]) + "\n")
    return _read_memoized_rrna(out_file)
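_read_memoized_rrna is not shown in the example. A minimal sketch consistent with the writer above, which emits one key,value pair per line:

def _read_memoized_rrna(out_file):
    # Hypothetical sketch: parse the "rRNA,<count>" / "rRNA_rate,<rate>" lines
    # written in the transaction block back into a metrics dict.
    metrics = {}
    with open(out_file) as in_handle:
        for line in in_handle:
            key, value = line.strip().split(",", 1)
            metrics[key] = value
    return metrics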
Example #3
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
Example #4
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && "
                % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #5
def _create_combined_fasta(data, out_dir):
    """
    if there are genomes to be disambiguated, create a FASTA file of
    all of the transcripts for all genomes
    """
    items = disambiguate.split([data])
    fasta_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if file_exists(out_file):
            fasta_files.append(out_file)
        else:
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
            fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file

    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
Example #6
def _detect_rRNA(data):
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    genes = [x[0] for x in rrna_features if x]
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    # .ix was removed from pandas; select the rRNA genes by id instead.
    return {'rRNA': sum(count_table[count_table["id"].isin(genes)]["counts"])}
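To make the selection above concrete, a toy demonstration (made-up values, not bcbio output) of summing counts for a subset of ids with a boolean isin() mask:

import pandas as pd

# Assumed toy data.
table = pd.DataFrame({"id": ["g1", "g2", "g3"], "counts": [5, 0, 12]})
rrna = table[table["id"].isin(["g1", "g3"])]["counts"].sum()
print(rrna)  # 17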
Example #7
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)

    return count_file
Example #8
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    if data["work_bam_rep"] != "":
        rep_bam = data["work_bam_rep"]
    else:
        rep_bam = ""
    input_bam = data["work_bam_input"]
    caller_fn = get_callers()[data["rmats_fn"]]
    name = dd.get_sample_name(data)
    fastq_file = fastq.get_fastq_files(data)
    read_len = bam.fastq.estimate_read_length(fastq_file[0])
    if read_len <= 50:
        read_len = 50
    elif read_len <= 75:  # the original `> 50 and < 75` sent exactly 50 and 75 to the 100 bin
        read_len = 75
    else:
        read_len = 100
    if len(fastq_file) > 1:
        read_pair = "paired"
    else:
        read_pair = "single"
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["rmats_fn"], name ))
    out_file = caller_fn(name, chip_bam, rep_bam, input_bam, dd.get_gtf_file(data), out_dir, read_len, read_pair, data["config"])
    data["rmats_file"] = out_file
    return [[data]]
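The read-length bucketing above is easy to get wrong at the boundaries (the original sent exactly 50 to the 100 bin). An equivalent sketch with bisect that makes the bin edges explicit, offered as an alternative rather than the project's code:

import bisect

def bin_read_length(read_len, bins=(50, 75, 100)):
    # Map a measured read length to the smallest supported bin that can hold
    # it, capping at the largest bin (50 -> 50, 60 -> 75, 120 -> 100).
    idx = bisect.bisect_left(bins, read_len)
    return bins[min(idx, len(bins) - 1)]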
Example #9
def _get_ericscript_db(self, data):
    transcript_file = dd.get_gtf_file(data)
    if transcript_file and os.path.exists(transcript_file):
        transcript_dir = os.path.dirname(transcript_file)
        ericscript_dirs = glob.glob(os.path.join(transcript_dir, "ericscript", "ericscript_db*"))
        if ericscript_dirs:
            return sorted(ericscript_dirs)[-1]
Example #10
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (for parsing additional data) infers the sample name from that
    # directory, as in:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        with file_transaction(data, results_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            raw_file = os.path.join(tx_out_dir, "rnaseq_qc_results.txt")
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_out_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #11
def get_known_splicesites_file(align_dir, data):
    gtf_file = dd.get_gtf_file(data)
    splicesites = os.path.join(os.path.dirname(gtf_file),
                               "ref-transcripts-splicesites.txt")
    if not file_exists(splicesites):
        splicesites = create_splicesites_file(gtf_file, align_dir, data)
    return splicesites
Example #12
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = "{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {filtered_bam}"

    message = "Count reads in {tx_count_file} mapping to {gtf_file} using " "featureCounts"
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
Example #13
def run_stringtie_expression(data):
    """
    estimate expression from Stringtie, using the bcbio datadict
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_cufflinks_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        exon_file, transcript_file = _stringtie_expression(bam, gtf_file, num_cores, tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    return data
Example #14
def run(data):
    """Quantitaive isoforms expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}")
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
Example #15
File: bwa.py Project: Kisun/bcbio-nextgen
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, " "converting %s and %s to phred+33.")
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )

    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Example #16
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." %(fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
Example #17
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
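count.combine_count_files is not shown in the snippet; conceptually it joins per-sample two-column count files on the feature id. A minimal pandas sketch under that assumption (the ext handling mirrors the ext= argument used above; the real helper may differ):

import os
import pandas as pd

def combine_count_files_sketch(count_files, out_file, ext=".counts"):
    # Assumed layout: each file is tab-separated with columns <feature id, count>.
    frames = []
    for count_file in count_files:
        sample = os.path.basename(count_file).replace(ext, "")
        frames.append(pd.read_csv(count_file, sep="\t", header=None,
                                  names=["id", sample], index_col="id"))
    combined = pd.concat(frames, axis=1)
    combined.to_csv(out_file, sep="\t")
    return out_file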
Example #18
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd SAM {srna_opts} "
            "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
        print("hello")

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
Example #19
def run_salmon_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        salmon_dir = os.path.join(work_dir, "salmon")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        salmon_index(gtf_file, fasta_file, data, salmon_dir)
    return samples
Example #20
def run_rapmap_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        rapmap_dir = os.path.join(work_dir, "rapmap")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        rapmap_index(gtf_file, fasta_file, "quasi", data, rapmap_dir)
    return samples
Example #21
def tagcount(data):
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        gene_map_file = os.path.join(dd.get_work_dir(data), "annotation",
                                     os.path.splitext(gtf_file)[0] + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
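The .mtx output plus its .rownames/.colnames sidecars can be read back with scipy and pandas. A sketch assuming the matrix is in Matrix Market format (which the .mtx extension suggests):

import pandas as pd
from scipy.io import mmread

def load_tagcounts(mtx_file):
    # Assumption: standard Matrix Market matrix with one gene per .rownames
    # line and one cell barcode per .colnames line.
    counts = mmread(mtx_file).tocsr()
    genes = pd.read_csv(mtx_file + ".rownames", header=None)[0].tolist()
    barcodes = pd.read_csv(mtx_file + ".colnames", header=None)[0].tolist()
    return counts, genes, barcodes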
Example #22
def run_kallisto_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        kallisto_dir = os.path.join(work_dir, "kallisto")
        gtf_file = dd.get_gtf_file(data)
        assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    return samples
Example #23
def _get_files(data):
    mapped = bam.mapped(data["work_bam"], data["config"])
    in_file = bam.sort(mapped, data["config"], order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(out_dir, sample_name + ".counts")
    stats_file = os.path.join(out_dir, sample_name + ".stats")
    return in_file, gtf_file, out_file, stats_file
Example #24
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
Example #25
def _detect_rRNA(data):
    gtf_file = dd.get_gtf_file(data)
    count_file = dd.get_count_file(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    genes = [x[0] for x in rrna_features if x]
    if not genes:
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    count_table = pd.read_csv(count_file, sep="\t", names=["id", "counts"])
    rrna = sum(count_table[count_table["id"].isin(genes)]["counts"])
    total = sum(count_table["counts"])
    # Guard against an all-zero count table to avoid ZeroDivisionError.
    rrna_rate = float(rrna) / total if total else "NA"
    return {'rRNA': str(rrna), 'rRNA_rate': str(rrna_rate)}
Example #26
def run_sailfish_index(*samples):
    samples = [utils.to_single_data(x) for x in samples]
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in samples}
    data = samples[0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build)
    return [[x] for x in samples]
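Because namedtuple instances compare and hash by value, the set comprehension above collapses samples that share the same (build, ref, gtf) triple, so each index is built only once. A toy demonstration with assumed values:

from collections import namedtuple

Build = namedtuple('Build', ['build', 'ref', 'gtf'])
builds = {Build("hg38", "hg38.fa", "ref.gtf"),
          Build("hg38", "hg38.fa", "ref.gtf")}
assert len(builds) == 1  # value-equal tuples deduplicate to a single index job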
Example #27
def run_salmon_bam(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    bam_file = dd.get_transcriptome_bam(data)
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_bam(bam_file, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    return [[data]]
Example #28
def run_sailfish_index(*samples):
    fq1, _ = dd.get_input_sequence_files(samples[0][0])
    kmer_size = estimate_kmer_size(fq1)
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in dd.sample_data_iterator(samples)}
    data = samples[0][0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build, kmer_size)
    return samples
Example #29
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #30
def run_salmon_bam(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    bam_file = dd.get_transcriptome_bam(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = salmon_quant_bam(bam_file, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
Example #31
def index(ref_file, out_dir, data):
    """Create a bismark index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_gtf_file(data)
    bismark = config_utils.find_program("bismark", data["config"])
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a bismark index." %
                         (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            cmd = "{bismark} --bowtie2 -p {num_cores} -n 1 -o {tx_out_dir} --basename {sample} --unmapped {ref_file} {in_fastq}"
            do.run(cmd.format(**locals()), "Index bismark")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
Example #32
def run_rapmap_pseudoalign(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    rapmap_dir = os.path.join(work_dir, "rapmap", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, fasta_file,
                                  data)
    data = dd.set_work_bam(data, out_file)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Example #33
def run_sailfish(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
Example #34
def _detect_rRNA(data):
    sample = dd.get_sample_name(data)
    gtf_file = dd.get_gtf_file(data)
    tidy_file = dd.get_sailfish_tidy(data)
    rrna_features = gtf.get_rRNA(gtf_file)
    transcripts = set([x[1] for x in rrna_features if x])
    if not (transcripts and tidy_file):
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    count_table = pd.read_csv(tidy_file, sep="\t")
    sample_table = count_table[count_table["sample"].isin([sample])]
    # materialize the generators: in Python 3 map() would be exhausted by the
    # first sum() below, making the second sum(total_exp) silently return 0
    rrna_exp = list(map(
        float, sample_table[sample_table["id"].isin(transcripts)]["numreads"]))
    total_exp = list(map(float, sample_table["numreads"]))
    rrna = sum(rrna_exp)
    if sum(total_exp) == 0:
        return {'rRNA': str(rrna), 'rRNA_rate': "NA"}
    rrna_rate = float(rrna) / sum(total_exp)
    return {'rRNA': str(rrna), 'rRNA_rate': str(rrna_rate)}
Example #35
def cufflinks_merge(*samples):
    to_merge = set(
        filter_missing(
            flatten([
                dd.get_assembled_gtf(data)
                for data in dd.sample_data_iterator(samples)
            ])))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores,
                                 samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #36
def index(ref_file, out_dir, data):
    """Create a STAR index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_gtf_file(data)
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a star index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            cmd = ("STAR --genomeDir {tx_out_dir} --genomeFastaFiles {ref_file} "
                   "--runThreadN {num_cores} "
                   "--runMode genomeGenerate --sjdbOverhang 99 --sjdbGTFfile {gtf_file}")
            do.run(cmd.format(**locals()), "Index STAR")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
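A note on the hard-coded --sjdbOverhang 99: the STAR manual recommends setting it to read length minus 1 (with a default of 100), so 99 assumes 100 bp reads. A hypothetical helper to derive it, not part of bcbio:

def sjdb_overhang(read_length):
    # STAR's manual recommends sjdbOverhang = max(read length) - 1;
    # 99 therefore corresponds to the common 100 bp reads.
    return max(read_length - 1, 1)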
Example #37
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, salmon_dir)
    return [[data]]
Example #38
def run_ataqv(data):
    if dd.get_chip_method(data) != "atac":
        return None
    work_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join(work_dir, "qc", sample_name, "ataqv")
    peak_file = get_full_peaks(data)
    bam_file = get_unfiltered_bam(data)
    out_file = os.path.join(out_dir, sample_name + ".ataqv.json.gz")
    if not peak_file:
        logger.info(f"Full peak file for {sample_name} not found, skipping ataqv")
        return None
    if not bam_file:
        logger.info(f"Unfiltered BAM file for {sample_name} not found, skipping ataqv")
        return None
    if utils.file_exists(out_file):
        return out_file
    tss_bed_file = os.path.join(out_dir, "TSS.bed")
    tss_bed_file = gtf.get_tss_bed(dd.get_gtf_file(data), tss_bed_file, data, padding=0)
    if chromhacks.is_human(data):
        organism = "human"
        autosomal_reference_flag = ""
    elif chromhacks.is_mouse(data):
        organism = "mouse"
        autosomal_reference_flag = ""
    else:
        autosomal_reference = os.path.join(out_dir, "autosomal.txt")
        autosomal_reference = _make_autosomal_reference_file(autosomal_reference, data)
        organism = "None"
        autosomal_reference_flag = f"--autosomal-reference-file {autosomal_reference} "
    ataqv = config_utils.get_program("ataqv", data)
    mitoname = chromhacks.get_mitochondrial_chroms(data)[0]
    if not ataqv:
        logger.info("ataqv executable not found, skipping ataqv.")
        return None
    with file_transaction(out_file) as tx_out_file:
        cmd = (f"{ataqv} --peak-file {peak_file} --name {sample_name} --metrics-file {tx_out_file} "
               f"--tss-file {tss_bed_file} {autosomal_reference_flag} "
               f"--ignore-read-groups --mitochondrial-reference-name {mitoname} "
               f"--tss-extension 1000 "
               f"{organism} {bam_file}")
        message = f"Running ataqv on {sample_name}."
        do.run(cmd, message)
    return out_file
Example #39
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if analysis.startswith(("standard", "variant", "variant2")):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if peddy.is_human(data):
            to_run += ["peddy"]
        if vcfutils.get_paired([data]):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    return to_run
Example #40
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("bcbio doesn't support kallisto for single-end reads, we can "
                 "add support for this if you open up an issue about it here: "
                 "https://github.com/bcbio/bcbio-nextgen/issues")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #41
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("We don't support kallisto for single-end reads and fusion "
                 "calling with pizzly does not accept single end reads.")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file,
                               data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #42
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = (
        "{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
        "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks/stringtie can use
    if dd.get_transcript_assembler(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        if not file_exists(splicesites):
            splicesites = create_splicesites_file(gtf_file, align_dir, data)
        # empty splicesite files means there is no splicing, so skip this option
        # if there is no splicing for this organism
        if file_exists(splicesites):
            cmd += "--known-splicesite-infile {splicesites} "

    # apply additional hisat2 options
    cmd += " ".join(_get_options_from_config(data))

    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(data, out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
Example #43
def _detect_rRNA(data):
    sample = dd.get_sample_name(data)
    gtf_file = dd.get_gtf_file(data)
    salmon_dir = dd.get_salmon_dir(data)
    quant = os.path.join(salmon_dir, "quant", "quant.sf")
    rrna_features = gtf.get_rRNA(gtf_file)
    transcripts = set([x[1] for x in rrna_features if x])
    if not (transcripts and utils.file_exists(quant)):
        return {'rRNA': "NA", "rRNA_rate": "NA"}
    sample_table = pd.read_csv(quant, sep="\t")
    # materialize the generators so total_exp can be summed twice below (Python 3)
    rrna_exp = list(map(
        float,
        sample_table[sample_table["Name"].isin(transcripts)]["NumReads"]))
    total_exp = list(map(float, sample_table["NumReads"]))
    rrna = sum(rrna_exp)
    if sum(total_exp) == 0:
        return {'rRNA': str(rrna), 'rRNA_rate': "NA"}
    rrna_rate = float(rrna) / sum(total_exp)
    return {'rRNA': str(rrna), 'rRNA_rate': str(rrna_rate)}
Example #44
def index(ref_file, out_dir, data):
    """Create a bismark index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    bismark = config_utils.find_program("bismark", data["config"])
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a bismark index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            other_opts = config_utils.get_resources("bismark", data["config"]).get("options", [])
            other_opts = " ".join([str(x) for x in other_opts]).strip()
            cmd = "{bismark} {other_opts} --bowtie2 -p {num_cores} -n 1 -o {tx_out_dir} --basename {sample} --unmapped {ref_file} {in_fastq}"
            do.run(cmd.format(**locals()), "Index bismark")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
Example #45
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results"""
    if not combined:
        return None
    to_combine = [
        dd.get_express_counts(x) for x in dd.sample_data_iterator(samples)
        if dd.get_express_counts(x)
    ]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(
        gtf_file, isoform_to_gene_file,
        next(dd.sample_data_iterator(samples)))
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(
            combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(
            to_combine, eff_counts_combined_file, ext=".counts")
        to_combine = [
            dd.get_express_tpm(x) for x in dd.sample_data_iterator(samples)
            if dd.get_express_tpm(x)
        ]
        tpm_counts_combined_file = os.path.splitext(
            combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(
            to_combine, tpm_counts_combined_file)
        to_combine = [
            dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
            if dd.get_express_fpkm(x)
        ]
        fpkm_counts_combined_file = os.path.splitext(
            combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(
            to_combine, fpkm_counts_combined_file, ext=".fpkm")
        return {
            'counts': eff_counts_combined,
            'tpm': tpm_counts_combined,
            'fpkm': fpkm_counts_combined,
            'isoform_to_gene': isoform_to_gene_file
        }
    return {}
Example #46
File: qc.py Project: vhuarui/bcbio-nextgen
def sample_summary(bam_file, data, out_dir):
    """Run RNA-SeQC on a single RNAseq sample, writing to specified output directory.
    """
    metrics_file = os.path.join(out_dir, "metrics.tsv")
    if not file_exists(metrics_file):
        with file_transaction(data, out_dir) as tx_out_dir:
            config = data["config"]
            ref_file = data["sam_ref"]
            genome_dir = os.path.dirname(os.path.dirname(ref_file))
            gtf_file = dd.get_gtf_file(data)
            sample_file = os.path.join(safe_makedir(tx_out_dir), "sample_file.txt")
            _write_sample_id_file(data, bam_file, sample_file)
            runner = rnaseqc_runner_from_config(config)
            rna_file = config_utils.get_rRNA_sequence(genome_dir)
            bam.index(bam_file, config)
            single_end = not bam.is_paired(bam_file)
            runner.run(sample_file, ref_file, rna_file, gtf_file, tx_out_dir, single_end)
            # we don't need this large directory for just the report
            shutil.rmtree(os.path.join(tx_out_dir, data["description"]))
    return _parse_rnaseqc_metrics(metrics_file, data["name"][-1])
Example #47
def run_pizzly(data):
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    samplename = dd.get_sample_name(data)
    gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    stripped_fa = os.path.splitext(
        os.path.basename(gtf_fa))[0] + "-noversions.fa"
    stripped_fa = os.path.join(pizzlydir, stripped_fa)
    gtf_fa = fasta.strip_transcript_versions(gtf_fa, stripped_fa)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
Example #48
def _rnaseq_qualimap(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    ref_file = dd.get_ref_file(data)
    single_end = not bam.is_paired(bam_file)
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file, single_end)
        do.run(cmd, "Qualimap for {}".format(data["name"][-1]))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Fragment Length Mean": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #49
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file,
                                  data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
Example #50
def run(data):
    """Quantitaive isoforms expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info(
            "Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data),
                                     dd.get_ref_file(data))
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = (
                    "{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}"
                )
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam,
                       {})
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file,
                                 out_file.replace(".xprs", "_eff.counts"),
                                 7,
                                 data=data)
    tpm_file = _get_column(out_file,
                           out_file.replace("xprs", "tpm"),
                           14,
                           data=data)
    fpkm_file = _get_column(out_file,
                            out_file.replace("xprs", "fpkm"),
                            10,
                            data=data)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
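results.xprs is a tab-separated table with the transcript ID in the second column; the 0-based column indices above (7, 14, 10) select eff_counts, tpm and fpkm in the usual eXpress column layout. A minimal sketch of a column-subsetting helper like _get_column (hypothetical; bcbio's version may round counts or differ in details):

def _get_column_sketch(in_file, out_file, column, data=None):
    # write "<target_id><TAB><value>" for one 0-based column of the
    # eXpress results table, skipping the header line
    with file_transaction(data, out_file) as tx_out_file:
        with open(in_file) as in_handle, open(tx_out_file, "w") as out_handle:
            for line in in_handle:
                cols = line.rstrip().split("\t")
                if cols[0] == "bundle_id":  # header row
                    continue
                out_handle.write("%s\t%s\n" % (cols[1], cols[column]))
    return out_file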
Example #51
def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    combined = count.combine_count_files(
        [x[0]["count_file"] for x in samples if "count_file" in x[0]])
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)
    samples = run_parallel("run_cufflinks", samples)
    #gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]]
    fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file)
    #isoform
    fpkm_isoform_combined_file = os.path.splitext(
        combined)[0] + ".isoform.fpkm"
    to_combine_isoform = [
        x[0]["fpkm_isoform"] for x in samples if "fpkm_isoform" in x[0]
    ]
    fpkm_isoform_combined = count.combine_count_files(
        to_combine_isoform, fpkm_isoform_combined_file, ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = [dd.get_dexseq_counts(data[0]) for data in samples]
    # use a list, not a lazy filter object, so the truthiness check below works
    to_combine_dexseq = [x for x in to_combine_dexseq if x]
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
    else:
        dexseq_combined = None

    for x in samples:
        x[0]["combined_counts"] = combined
        if annotated:
            x[0]["annotated_combined_counts"] = annotated
        if fpkm_combined:
            x[0]["combined_fpkm"] = fpkm_combined
        if fpkm_isoform_combined:
            x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined
        if dexseq_combined:
            x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file)

    return samples
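count.combine_count_files merges the per-sample count files on the feature ID. A minimal pandas sketch, assuming headerless two-column "<id><TAB><count>" inputs and sample names derived from the file basenames (hypothetical implementation):

import os
import pandas as pd

def combine_count_files_sketch(files, out_file=None, ext=".counts"):
    # outer-join the per-sample tables on feature id, one column per sample
    if not files:
        return None
    if not out_file:
        out_file = os.path.splitext(files[0])[0] + ".combined.counts"
    dfs = [pd.read_csv(f, sep="\t", header=None,
                       names=["id", os.path.basename(f).replace(ext, "")],
                       index_col="id")
           for f in files]
    pd.concat(dfs, axis=1).to_csv(out_file, sep="\t")
    return out_file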
Example #52
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 --no-discordant --no-mixed -x {gtf_index} -1 {fastq_file} {pair_cmd} | samtools view -hbS - > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
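index_transcriptome is expected to build a bowtie2 index over the transcript sequences. A sketch under the assumption that gtf.gtf_to_fasta (used in the eXpress example above) produces the transcript FASTA; the _sketch name and layout are hypothetical:

def index_transcriptome_sketch(gtf_file, ref_file, data):
    # pull transcript sequences out of the GTF, then build a bowtie2
    # index next to the FASTA
    gtf_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    index_base = os.path.splitext(gtf_fasta)[0]
    bowtie2_build = config_utils.get_program("bowtie2-build", data["config"])
    if not file_exists(index_base + ".1.bt2"):
        cmd = "{bowtie2_build} {gtf_fasta} {index_base}".format(**locals())
        do.run(cmd, "Building bowtie2 index of the transcriptome.")
    return index_base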
Example #53
def run_salmon_decoy(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
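salmon_decoy_index is not shown; the usual decoy-aware recipe concatenates the genome behind the transcript FASTA and passes the genome sequence names via salmon's -d/--decoys option. A sketch of that recipe (hypothetical helper; bcbio's implementation may differ):

def salmon_decoy_index_sketch(gtf_file, data, out_dir):
    # append the genome behind the transcriptome and list its sequence
    # names as decoys, then build the decoy-aware salmon index
    index_dir = os.path.join(out_dir, "index")
    if utils.file_exists(os.path.join(index_dir, "info.json")):
        return index_dir
    utils.safe_makedir(out_dir)
    genome_fa = dd.get_ref_file(data)
    transcriptome_fa = gtf.gtf_to_fasta(gtf_file, genome_fa)
    gentrome = os.path.join(out_dir, "gentrome.fa")
    decoys = os.path.join(out_dir, "decoys.txt")
    with open(genome_fa) as in_handle, open(decoys, "w") as out_handle:
        for line in in_handle:
            if line.startswith(">"):
                out_handle.write(line[1:].split()[0] + "\n")
    do.run("cat {transcriptome_fa} {genome_fa} > {gentrome}".format(**locals()),
           "Concatenating transcriptome and genome for salmon.")
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    do.run("{salmon} index -t {gentrome} -d {decoys} -i {index_dir} "
           "-p {num_cores}".format(**locals()),
           "Building salmon decoy-aware index.")
    return index_dir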
Example #54
def count(data):
    """
    count reads mapping to genes using featureCounts;
    falls back to the htseq-count method if featureCounts
    is not found
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file

    config = data["config"]

    try:
        featureCounts = config_utils.get_program("featureCounts", config)
    except config_utils.CmdNotFound:
        logger.info("featureCounts not found, falling back to htseq-count "
                    "for feature counting. You can upgrade the tools to "
                    "install featureCount with bcbio_nextgen.py upgrade "
                    "--tools.")
        return htseq_count(data)

    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(config)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)

    return count_file
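_strand_flag translates the configured strandedness into featureCounts' -s argument (0 = unstranded, 1 = stranded, 2 = reversely stranded). A plausible sketch of the config-based variant used here (hypothetical):

def _strand_flag_sketch(config):
    # featureCounts -s: 0 = unstranded, 1 = stranded, 2 = reversely stranded
    strand_flag = {"unstranded": "0",
                   "firststrand": "2",
                   "secondstrand": "1"}
    strandedness = config["algorithm"].get("strandedness",
                                           "unstranded").lower()
    assert strandedness in strand_flag, \
        "strandedness %s not supported, use one of %s" % (
            strandedness, list(strand_flag.keys()))
    return strand_flag[strandedness]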
Example #55
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
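_paired_flag only needs to switch featureCounts into fragment-counting mode for paired BAMs; -B (require both ends mapped) and -C (skip chimeric fragments) are common companions of -p. A one-line sketch (hypothetical):

def _paired_flag_sketch(bam_file):
    # count fragments (-p), require both ends mapped (-B) and skip
    # chimeric fragments (-C) for paired-end data
    return "-p -B -C" if bam.is_paired(bam_file) else ""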
Example #56
def run_rnaseq(bam_file, data, out_dir):
    """
    Run Qualimap on an RNA-seq BAM file and parse the results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC picks up the sample name from that directory when parsing
    # additional data, e.g.:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    # The Qualimap output folder (results_dir) needs to be named after the sample
    # (see the comment above). However, to keep its name after upload, we need to
    # put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
            "metrics": metrics}
Example #57
def create_combined_tx2gene(data):
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    items = disambiguate.split([data])
    tx2gene_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        if file_exists(out_file):
            tx2gene_files.append(out_file)
        else:
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
            tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file

    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
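gtf.tx2genefile writes one transcript-to-gene pair per line; with tsv=False the output is comma-separated, which is why the per-genome files can simply be concatenated with cat. A rough stdlib-only sketch of such a converter (hypothetical; bcbio parses the GTF with its own library):

import re

def tx2genefile_sketch(gtf_file, out_file, tsv=True):
    # pull transcript_id/gene_id pairs out of the 9th GTF column
    sep = "\t" if tsv else ","
    pairs = set()
    with open(gtf_file) as in_handle:
        for line in in_handle:
            fields = line.split("\t")
            if line.startswith("#") or len(fields) < 9:
                continue
            tx = re.search(r'transcript_id "([^"]+)"', fields[8])
            gene = re.search(r'gene_id "([^"]+)"', fields[8])
            if tx and gene:
                pairs.add((tx.group(1), gene.group(1)))
    with open(out_file, "w") as out_handle:
        for tx_id, gene_id in sorted(pairs):
            out_handle.write(sep.join([tx_id, gene_id]) + "\n")
    return out_file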
Example #58
def _stringtie_expression(bam, data, out_dir="."):
    """
    only estimate expression the Stringtie, do not assemble new transcripts
    """
    gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
    num_cores = dd.get_num_cores(data)
    error_message = "The %s file for %s is missing; the StringTie run likely failed."
    stringtie = config_utils.get_program("stringtie", data, default="stringtie")
    # don't assemble transcripts unless asked
    exp_flag = ("-e" if "stringtie" not in dd.get_transcript_assembler(data)
                else "")
    base_cmd = ("{stringtie} {exp_flag} -b {out_dir} -p {num_cores} -G {gtf_file} "
                "-o {out_gtf} {bam}")
    transcript_file = os.path.join(out_dir, "t_data.ctab")
    exon_file = os.path.join(out_dir, "e_data.ctab")
    out_gtf = os.path.join(out_dir, "stringtie-assembly.gtf")
    if file_exists(transcript_file):
        return exon_file, transcript_file, out_gtf
    cmd = base_cmd.format(**locals())
    do.run(cmd, "Running Stringtie on %s." % bam)
    assert file_exists(exon_file), error_message % ("exon", exon_file)
    assert file_exists(transcript_file), error_message % ("transcript", transcript_file)
    return exon_file, transcript_file, out_gtf
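The -b flag makes StringTie write Ballgown tables; t_data.ctab is the per-transcript one. A quick usage sketch for pulling abundances out of it, assuming the documented Ballgown column names (t_name, gene_id, FPKM):

import pandas as pd

# read the Ballgown transcript table written by "stringtie -b"
t_data = pd.read_csv("t_data.ctab", sep="\t")
fpkm = t_data[["t_name", "gene_id", "FPKM"]]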
Example #59
def run_rnaseq(bam_file, data, out_dir):
    """
    Run Qualimap on an RNA-seq BAM file and parse the results
    """
    strandedness = {
        "firststrand": "strand-specific-reverse",
        "secondstrand": "strand-specific-forward",
        "unstranded": "non-strand-specific"
    }

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC picks up the sample name from that directory when parsing
    # additional data, e.g.:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    raw_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(results_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, results_dir, gtf_file,
                                   single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
            dd.get_sample_name(data), raw_file)
        do.run(cmd,
               "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update(
        {"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #60
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    resources = config_utils.get_resources("vardict", data)
    if resources.get("options"):
        opts += " ".join([str(x) for x in resources["options"]])
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
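The -c 1 -S 2 -E 3 -g 4 options tell VarDict which BED columns hold the chromosome, start, end and gene name, so gtf.gtf_to_bed only has to emit those four columns per transcript. A rough sketch (hypothetical; bcbio's converter handles sorting and naming more carefully):

import os
import re

def gtf_to_bed_sketch(gtf_file, out_file=None):
    # one BED line per transcript record, shifting the GTF 1-based start
    # to the BED 0-based convention
    out_file = out_file or os.path.splitext(gtf_file)[0] + ".bed"
    with open(gtf_file) as in_handle, open(out_file, "w") as out_handle:
        for line in in_handle:
            fields = line.split("\t")
            if len(fields) < 9 or fields[2] != "transcript":
                continue
            gene = re.search(r'gene_id "([^"]+)"', fields[8])
            name = gene.group(1) if gene else "NA"
            bed_start = str(int(fields[3]) - 1)
            out_handle.write("\t".join([fields[0], bed_start, fields[4],
                                        name]) + "\n")
    return out_file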