Python get_transcriptome_bam 예제들, bcbio.pipeline.datadict.get_transcriptome_bam Python 예제들

예제 #1

0

파일 보기

def _maybe_add_transcriptome_alignment(sample, out):
    """Append the sample's transcriptome BAM to ``out`` when it exists.

    ``out`` is extended in place with a dict describing the file (its path,
    type ``"bam"`` and ext ``"transcriptome"``) and is returned either way.
    """
    bam_path = dd.get_transcriptome_bam(sample)
    # Nothing to register when no BAM is recorded or it is missing on disk.
    if not bam_path or not utils.file_exists(bam_path):
        return out
    out.append({"path": bam_path, "type": "bam", "ext": "transcriptome"})
    return out

예제 #2

0

파일 보기

파일: __init__.py 프로젝트: csardas/bcbio-nextgen

def _maybe_add_transcriptome_alignment(sample, out):
    """Register the transcriptome BAM of ``sample`` in ``out`` if present.

    Returns ``out``; an entry is appended only when a BAM path is recorded
    for the sample and ``utils.file_exists`` confirms it.
    """
    candidate = dd.get_transcriptome_bam(sample)
    is_present = bool(candidate) and utils.file_exists(candidate)
    if is_present:
        entry = dict(path=candidate, type="bam", ext="transcriptome")
        out.append(entry)
    return out

예제 #3

0

파일 보기

파일: rnaseq.py 프로젝트: senhongying/bcbio-nextgen

def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]``, runs
    oncofuse when fusion mode is on and no fusion caller is configured,
    aligns to the transcriptome with bowtie2 when transcriptome alignment is
    requested, and finally passes the sample through
    ``spikein.counts_spikein``.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    wants_oncofuse = (dd.get_fusion_mode(data, False)
                      and not dd.get_fusion_caller(data))
    if wants_oncofuse:
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    if dd.get_transcriptome_align(data):
        if dd.get_disambiguate(data):
            # Disambiguated samples are realigned with bowtie2 from reads
            # extracted out of the work BAM.
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            fastqs = alignprep._bgzip_from_bam(
                data["work_bam"], data["dirs"], data,
                is_retry=False, output_infix='-transcriptome')
            if len(fastqs) == 2:
                file1, file2 = fastqs
            else:
                # Anything other than a pair is treated as single-end.
                file1, file2 = fastqs[0], None
            data = bowtie2.align_transcriptome(
                file1, file2, dd.get_ref_file(data), data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            reference = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, reference, data)
    data = spikein.counts_spikein(data)
    return [[data]]

예제 #4

0

파일 보기

파일: express.py 프로젝트: Tmacme/bcbio-nextgen

def run(data):
    """Quantitative isoform expression by eXpress.

    Returns ``data`` unchanged when the sample has no transcriptome BAM.
    Otherwise runs ``express`` into ``<work_dir>/express/<sample>/`` unless
    ``<sample>.xprs`` already exists there, derives three per-sample files
    from the result via ``_get_column`` and records them on the sample with
    the ``set_express_counts`` / ``set_express_tpm`` / ``set_express_fpkm``
    setters.

    Returns the updated ``data``.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", config)
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}")
                do.run(cmd.format(express=express, tx_out_dir=tx_out_dir,
                                  strand=strand, gtf_fasta=gtf_fasta,
                                  bam_file=bam_file),
                       "Run express on %s." % in_bam, {})
            # The results file is renamed after the sample once the
            # transactional output directory is in place.
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    # Build sibling paths from the stem: str.replace("xprs", ...) would also
    # rewrite any "xprs" occurring in the directory or sample name.
    out_stem = os.path.splitext(out_file)[0]
    eff_count_file = _get_column(out_file, out_stem + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_stem + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_stem + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data

예제 #5

0

파일 보기

파일: rnaseq.py 프로젝트: elkingtonmcb/bcbio-nextgen

def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]``, runs
    oncofuse in fusion mode, and aligns to the transcriptome with bowtie2
    when transcriptome alignment is requested but no transcriptome BAM is
    recorded yet.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    must_align = (dd.get_transcriptome_align(data)
                  and not dd.get_transcriptome_bam(data))
    if not must_align:
        return [[data]]

    if dd.get_disambiguate(data):
        # Disambiguated samples align reads extracted from the work BAM.
        reads = alignprep._bgzip_from_bam(
            data["work_bam"], data["dirs"], data["config"],
            is_retry=False, output_infix='-transcriptome')
        if len(reads) == 2:
            file1, file2 = reads
        else:
            file1, file2 = reads[0], None
    else:
        file1, file2 = dd.get_input_sequence_files(data)

    reference = dd.get_ref_file(data)
    logger.info("Transcriptome alignment was flagged to run, but the "
                "transcriptome BAM file was not found. Aligning to the "
                "transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, reference, data)
    return [[data]]

예제 #6

0

파일 보기

def tagcount(data):
    """Count transcriptome alignments per tag with ``umis fasttagcount``.

    Writes ``<work_dir>/umis/<sample>/<sample>.mtx`` (plus ``.rownames`` and
    ``.colnames`` companions, and the ``-dupes.mtx`` set when
    ``has_umi_matrix(data)`` is true), records the matrix as the sample's
    count file and returns ``[[data]]``. If the matrix already exists the
    counting is skipped.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    umis = _umis_cmd(data)
    with_umi_matrix = has_umi_matrix(data)
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    if with_umi_matrix:
        cmd += " --umi_matrix {tx_umi_matrix_full} "
    cmd += " {bam} {tx_out_file_full}"
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [
        umi_matrix_file, umi_matrix_file + ".rownames",
        umi_matrix_file + ".colnames"
    ]
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        # Index 3 is the "-dupes.mtx" entry of out_files.
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_umi_matrix + ".full"
        fields = {
            "umis": umis, "cutoff": cutoff, "gene_map_flag": gene_map_flag,
            "positional": positional, "cb_histogram": cb_histogram,
            "bam": bam, "tx_out_file": tx_out_file,
            "tx_out_file_full": tx_out_file_full,
            "tx_umi_matrix": tx_umi_matrix,
            "tx_umi_matrix_full": tx_umi_matrix_full,
        }
        do.run(cmd.format(**fields), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**fields), message)
        if with_umi_matrix:
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            # Only run inside this branch: running unconditionally would
            # repeat the previous conversion when there is no UMI matrix.
            do.run(cmd.format(**fields), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]

예제 #7

0

파일 보기

파일: express.py 프로젝트: kern3020/bcbio-nextgen

def run(data):
    """Quantitative isoform expression by eXpress.

    Returns ``None`` when the tophat transcriptome index or the
    transcriptome BAM is missing. Otherwise runs ``express`` (unless
    ``<work_dir>/express/<sample>/<sample>.xprs`` already exists) and returns
    a tuple ``(eff_count_file, tpm_file, fpkm_file)`` produced by
    ``_get_column`` from the result.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    tophat_index = get_in(data, ('genome_resources', 'rnaseq', 'transcriptome_index', 'tophat'))
    if not tophat_index:
        logger.info("Tophat index not found, skipping running eXpress.")
        return None
    # NOTE(review): this replaces every "ver" in the path, not only a
    # trailing ".ver" extension -- confirm the index path layout.
    tophat_fa = tophat_index.replace("ver", "fa")
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    safe_makedir(out_dir)
    express = config_utils.get_program("express", data['config'])
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return None
    if not file_exists(out_file):
        with tx_tmpdir() as tmp_dir:
            # NOTE(review): the relative "results.xprs" below relies on the
            # working directory being tmp_dir. If ``chdir`` is a context
            # manager rather than os.chdir, this bare call has no effect --
            # verify against the file's imports.
            chdir(tmp_dir)
            ref_transcript = _do_fasta(tophat_fa)
            cmd = ("{express} {ref_transcript} {in_bam}")
            do.run(cmd.format(express=express, ref_transcript=ref_transcript,
                              in_bam=in_bam),
                   "Run express", {})
            shutil.move("results.xprs", out_file)
    # Build sibling paths from the stem: str.replace("xprs", ...) would also
    # rewrite any "xprs" occurring in the directory or sample name.
    out_stem = os.path.splitext(out_file)[0]
    eff_count_file = _get_column(out_file, out_stem + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_stem + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_stem + ".fpkm", 10)
    return (eff_count_file, tpm_file, fpkm_file)

예제 #8

0

파일 보기

파일: express.py 프로젝트: hshujia/bcbio-nextgen

def run(data):
    """Quantitative isoform expression by eXpress.

    Returns ``data`` unchanged when the sample has no transcriptome BAM.
    Otherwise runs ``express`` on that BAM into
    ``<work_dir>/express/<sample>/`` unless ``<sample>.xprs`` already exists,
    derives three per-sample files from the result via ``_get_column`` and
    records them on the sample with the ``set_express_*`` setters.

    Returns the updated ``data``.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    if not in_bam:
        logger.info(
            "Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            cmd = (
                "{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {in_bam}"
            )
            do.run(cmd.format(express=express, tx_out_dir=tx_out_dir,
                              strand=strand, gtf_fasta=gtf_fasta,
                              in_bam=in_bam),
                   "Run express on %s." % in_bam, {})
        # The results file is renamed after the sample once the
        # transactional output directory is in place.
        shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    # Build sibling paths from the stem: str.replace("xprs", ...) would also
    # rewrite any "xprs" occurring in the directory or sample name.
    out_stem = os.path.splitext(out_file)[0]
    eff_count_file = _get_column(out_file, out_stem + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_stem + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_stem + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data

예제 #9

0

파일 보기

파일: umi.py 프로젝트: vladsaveliev/bcbio-nextgen

def tagcount(data):
    """Count transcriptome alignments per tag with ``umis fasttagcount``.

    Writes ``<work_dir>/umis/<sample>/<sample>.mtx`` (plus ``.rownames`` and
    ``.colnames`` companions, and the ``-dupes.mtx`` set when
    ``has_umi_matrix(data)`` is true), records the matrix as the sample's
    count file and returns ``[[data]]``. If the matrix already exists the
    counting is skipped.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        # Use only the GTF's base name: os.path.join drops every earlier
        # component when a later one is absolute, which would place the map
        # next to the GTF instead of under the work directory.
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    with_umi_matrix = has_umi_matrix(data)
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    if with_umi_matrix:
        cmd += " --umi_matrix {tx_umi_matrix_full} "
    cmd += " {bam} {tx_out_file_full}"
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        # Index 3 is the "-dupes.mtx" entry of out_files.
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_umi_matrix + ".full"
        fields = {
            "umis": umis, "cutoff": cutoff, "gene_map_flag": gene_map_flag,
            "positional": positional, "cb_histogram": cb_histogram,
            "bam": bam, "tx_out_file": tx_out_file,
            "tx_out_file_full": tx_out_file_full,
            "tx_umi_matrix": tx_umi_matrix,
            "tx_umi_matrix_full": tx_umi_matrix_full,
        }
        do.run(cmd.format(**fields), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**fields), message)
        if with_umi_matrix:
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            # Only run inside this branch: running unconditionally would
            # repeat the previous conversion when there is no UMI matrix.
            do.run(cmd.format(**fields), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]

예제 #10

0

파일 보기

파일: salmon.py 프로젝트: yodeng/bcbio-nextgen

def run_salmon_bam(data):
    """Run salmon quantification on the sample's transcriptome BAM.

    Output goes under ``<work_dir>/salmon/<sample>``; the quantification
    result and that directory are recorded on ``data``, which is returned
    wrapped as ``[[data]]``.
    """
    sample = dd.get_sample_name(data)
    base_dir = dd.get_work_dir(data)
    quant_dir = os.path.join(base_dir, "salmon", sample)
    annotation = dd.get_gtf_file(data)
    tx_bam = dd.get_transcriptome_bam(data)
    reference = dd.get_ref_file(data)
    quant_file = salmon_quant_bam(tx_bam, quant_dir, annotation, reference, data)
    data = dd.set_salmon_dir(dd.set_salmon(data, quant_file), quant_dir)
    return [[data]]

예제 #11

0

파일 보기

파일: salmon.py 프로젝트: vladsaveliev/bcbio-nextgen

def run_salmon_bam(data):
    """Quantify the transcriptome BAM with salmon and record the outputs.

    Sets the salmon result, the salmon directory
    (``<work_dir>/salmon/<sample>``) and the fragment-length file on the
    sample, then returns it as ``[[data]]``.
    """
    name = dd.get_sample_name(data)
    root = dd.get_work_dir(data)
    target_dir = os.path.join(root, "salmon", name)
    gtf_path = dd.get_gtf_file(data)
    bam_path = dd.get_transcriptome_bam(data)
    fasta_path = dd.get_ref_file(data)
    result = salmon_quant_bam(bam_path, target_dir, gtf_path, fasta_path, data)
    updated = dd.set_salmon(data, result)
    updated = dd.set_salmon_dir(updated, target_dir)
    updated = dd.set_salmon_fraglen_file(updated, _get_fraglen_file(target_dir))
    return [[updated]]

예제 #12

0

파일 보기

def run_salmon_bam(data):
    """Quantify the transcriptome BAM with salmon and update the QC summary.

    Records the salmon result, the salmon directory
    (``<work_dir>/salmon/<sample>``) and the fragment-length file on the
    sample, then adds the fragment-length file to the summary QC under
    ``"salmon"``. Returns ``[[data]]``.
    """
    sample_id = dd.get_sample_name(data)
    workspace = dd.get_work_dir(data)
    salmon_out_dir = os.path.join(workspace, "salmon", sample_id)
    gtf_path = dd.get_gtf_file(data)
    aligned_bam = dd.get_transcriptome_bam(data)
    quant = salmon_quant_bam(aligned_bam, salmon_out_dir, gtf_path, data)
    data = dd.set_salmon(data, quant)
    data = dd.set_salmon_dir(data, salmon_out_dir)
    fraglen_file = _get_fraglen_file(salmon_out_dir)
    data = dd.set_salmon_fraglen_file(data, fraglen_file)
    qc_base = dd.get_salmon_fraglen_file(data)
    data = dd.update_summary_qc(data, "salmon", base=qc_base)
    return [[data]]

예제 #13

0

파일 보기

파일: salmon.py 프로젝트: hliang/bcbio-nextgen

def run_salmon_bam(data):
    """Run salmon on the transcriptome BAM after checking its inputs exist.

    Asserts that both the GTF and the reference FASTA are present, then
    records the salmon result and output directory
    (``<work_dir>/salmon/<sample>``) on the sample. Returns ``[[data]]``.
    """
    sample = dd.get_sample_name(data)
    quant_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation = dd.get_gtf_file(data)
    tx_bam = dd.get_transcriptome_bam(data)
    assert file_exists(annotation), "%s was not found, exiting." % annotation
    reference = dd.get_ref_file(data)
    assert file_exists(reference), "%s was not found, exiting." % reference
    quant_file = salmon_quant_bam(tx_bam, quant_dir, annotation, reference, data)
    data = dd.set_salmon(data, quant_file)
    return [[dd.set_salmon_dir(data, quant_dir)]]

예제 #14

0

파일 보기

def run_salmon_bam(data):
    """Quantify the sample's transcriptome BAM with salmon.

    The GTF and reference FASTA must exist (checked with ``assert``). The
    result file and the output directory ``<work_dir>/salmon/<sample>`` are
    stored on the sample, which is returned as ``[[data]]``.
    """
    name = dd.get_sample_name(data)
    root = dd.get_work_dir(data)
    out_dir = os.path.join(root, "salmon", name)

    gtf_path = dd.get_gtf_file(data)
    bam_path = dd.get_transcriptome_bam(data)
    assert file_exists(gtf_path), "%s was not found, exiting." % gtf_path

    fasta_path = dd.get_ref_file(data)
    assert file_exists(fasta_path), "%s was not found, exiting." % fasta_path

    result = salmon_quant_bam(bam_path, out_dir, gtf_path, fasta_path, data)
    with_result = dd.set_salmon(data, result)
    with_dir = dd.set_salmon_dir(with_result, out_dir)
    return [[with_dir]]

예제 #15

0

파일 보기

파일: rnaseq.py 프로젝트: curoverse/bcbio-nextgen

def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]``, runs
    oncofuse in fusion mode, and creates a transcriptome BAM with bowtie2
    when RSEM is requested but no such BAM is recorded.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        fusion_calls = oncofuse.run(data)
        if fusion_calls:
            data = dd.set_oncofuse_file(data, fusion_calls)
    # RSEM needs a transcriptome BAM; if the aligner did not produce one,
    # build it with bowtie2.
    rsem_lacks_bam = dd.get_rsem(data) and not dd.get_transcriptome_bam(data)
    if not rsem_lacks_bam:
        return [[data]]
    file1, file2 = dd.get_input_sequence_files(data)
    reference = dd.get_ref_file(data)
    logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                "was not found. Aligning to the transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, reference, data)
    return [[data]]

예제 #16

0

파일 보기

def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Records the featureCounts output as ``data["count_file"]``, optionally
    runs oncofuse (fusion mode), and aligns to the transcriptome with
    bowtie2 when RSEM is enabled without an existing transcriptome BAM.

    Returns ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    fusion_result = oncofuse.run(data) if dd.get_fusion_mode(data, False) else None
    if fusion_result:
        data = dd.set_oncofuse_file(data, fusion_result)

    # When RSEM is on and the aligner produced no transcriptome BAM,
    # generate one with bowtie2.
    if dd.get_rsem(data) and not dd.get_transcriptome_bam(data):
        first_reads, second_reads = dd.get_input_sequence_files(data)
        genome = dd.get_ref_file(data)
        logger.info(
            "RSEM was flagged to run, but the transcriptome BAM file "
            "was not found. Aligning to the transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(first_reads, second_reads, genome, data)
    return [[data]]

예제 #17

0

파일 보기

def tagcount(data):
    """Count transcriptome alignments per tag with ``umis tagcount``.

    Writes ``<work_dir>/umis/<sample>/<sample>.counts`` unless it already
    exists, records it as the sample's count file and returns ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "umis",
                              dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".counts")
    if not file_exists(out_file):
        umis_exe = config_utils.get_program("umis", data, default="umis")
        safe_makedir(sample_dir)
        min_depth = dd.get_minimum_barcode_depth(data)
        histogram = os.path.join(sample_dir, "cb-histogram.txt")
        description = "Counting alignments of transcripts in %s." % bam
        template = ("{umis} tagcount --positional --cb_cutoff {cutoff} --cb_histogram "
                    "{cb_histogram} {bam} {tx_out_file}")
        with file_transaction(out_file) as tx_path:
            command = template.format(umis=umis_exe, cutoff=min_depth,
                                      cb_histogram=histogram, bam=bam,
                                      tx_out_file=tx_path)
            do.run(command, description)
    data = dd.set_count_file(data, out_file)
    return [[data]]

예제 #18

0

파일 보기

파일: umi.py 프로젝트: hliang/bcbio-nextgen

def tagcount(data):
    """Run ``umis tagcount`` on the sample's transcriptome BAM.

    The counts land in ``<work_dir>/umis/<sample>/<sample>.counts``; an
    existing file short-circuits the run. The file is set as the sample's
    count file and the sample is returned as ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_root = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_root, dd.get_sample_name(data))
    counts_name = dd.get_sample_name(data) + ".counts"
    out_file = os.path.join(sample_dir, counts_name)
    if file_exists(out_file):
        return [[dd.set_count_file(data, out_file)]]

    fields = {"umis": config_utils.get_program("umis", data, default="umis")}
    safe_makedir(sample_dir)
    fields["cutoff"] = dd.get_minimum_barcode_depth(data)
    fields["cb_histogram"] = os.path.join(sample_dir, "cb-histogram.txt")
    fields["bam"] = bam
    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} tagcount --positional --cb_cutoff {cutoff} --cb_histogram "
           "{cb_histogram} {bam} {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(tx_out_file=tx_out_file, **fields), message)
    return [[dd.set_count_file(data, out_file)]]

예제 #19

0

파일 보기

파일: rnaseq.py 프로젝트: dh10/bcbio-nextgen

def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]`` and runs
    oncofuse in fusion mode. Disambiguated samples stop there; otherwise a
    transcriptome BAM is built with bowtie2 when RSEM is requested and no
    such BAM is recorded.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)
    if dd.get_disambiguate(data):
        # No transcriptome alignment is attempted for disambiguated samples.
        logger.info("RSEM is not supported yet for disambiguation protocols. "
                    "See https://github.com/chapmanb/bcbio-nextgen/issues/859")
        return [[data]]
    # RSEM requested but the aligner left no transcriptome BAM: make one
    # with bowtie2.
    rsem_lacks_bam = dd.get_rsem(data) and not dd.get_transcriptome_bam(data)
    if rsem_lacks_bam:
        reads1, reads2 = dd.get_input_sequence_files(data)
        reference = dd.get_ref_file(data)
        logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                    "was not found. Aligning to the transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(reads1, reads2, reference, data)
    return [[data]]

예제 #20

0

파일 보기

파일: umi.py 프로젝트: DoaneAS/bcbio-nextgen

def tagcount(data):
    """Count transcriptome alignments per tag into a sparse matrix.

    Runs ``umis tagcount --sparse`` to write
    ``<work_dir>/umis/<sample>/<sample>.mtx`` with its ``.rownames`` and
    ``.colnames`` companions, unless the matrix already exists. The matrix
    is recorded as the sample's count file; returns ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "umis",
                              dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    if not file_exists(out_file):
        umis_exe = config_utils.get_program("umis", data, default="umis")
        safe_makedir(sample_dir)
        min_depth = dd.get_minimum_barcode_depth(data)
        histogram = os.path.join(sample_dir, "cb-histogram.txt")
        if dd.get_positional_umi(data, False):
            positional_flag = "--positional"
        else:
            positional_flag = ""
        description = "Counting alignments of transcripts in %s." % bam
        template = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
                    "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
        targets = [out_file + suffix for suffix in ("", ".rownames", ".colnames")]
        with file_transaction(targets) as tx_targets:
            command = template.format(umis=umis_exe, positional=positional_flag,
                                      cutoff=min_depth, cb_histogram=histogram,
                                      bam=bam, tx_out_file=tx_targets[0])
            do.run(command, description)
    data = dd.set_count_file(data, out_file)
    return [[data]]

예제 #21

0

파일 보기

파일: umi.py 프로젝트: sdwfrost/bcbio-nextgen

def tagcount(data):
    """Run ``umis tagcount --sparse`` on the sample's transcriptome BAM.

    Produces ``<work_dir>/umis/<sample>/<sample>.mtx`` together with
    ``.rownames`` / ``.colnames`` files; an existing matrix short-circuits
    the run. The matrix is set as the sample's count file and the sample is
    returned as ``[[data]]``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_root = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_root, dd.get_sample_name(data))
    matrix_name = dd.get_sample_name(data) + ".mtx"
    out_file = os.path.join(sample_dir, matrix_name)
    if file_exists(out_file):
        return [[dd.set_count_file(data, out_file)]]

    fields = {"umis": config_utils.get_program("umis", data, default="umis")}
    safe_makedir(sample_dir)
    fields["cutoff"] = dd.get_minimum_barcode_depth(data)
    fields["cb_histogram"] = os.path.join(sample_dir, "cb-histogram.txt")
    fields["positional"] = "--positional" if dd.get_positional_umi(data, False) else ""
    fields["bam"] = bam
    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
           "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        do.run(cmd.format(tx_out_file=tx_out_files[0], **fields), message)
    return [[dd.set_count_file(data, out_file)]]

예제 #22

0

파일 보기

파일: umi.py 프로젝트: vamst/bcbio-nextgen

def tagcount(data):
    """Count transcriptome alignments per tag with ``umis fasttagcount``.

    Writes ``<work_dir>/umis/<sample>/<sample>.mtx`` (plus ``.rownames`` and
    ``.colnames`` companions) by running ``fasttagcount`` and then
    ``umis sparse`` on its output, records the matrix as the sample's count
    file and returns ``[[data]]``. If the matrix already exists the counting
    is skipped. When a transcriptome GTF is configured, a transcript-to-gene
    map is passed via ``--genemap``.
    """
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        # Use only the GTF's base name: os.path.join drops every earlier
        # component when a later one is absolute, which would place the map
        # next to the GTF instead of under the work directory.
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag}"
           "--cb_histogram {cb_histogram} {bam} {tx_out_file_full}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        do.run(cmd.format(umis=umis, cutoff=cutoff,
                          gene_map_flag=gene_map_flag,
                          cb_histogram=cb_histogram, bam=bam,
                          tx_out_file_full=tx_out_file_full),
               message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(umis=umis, tx_out_file_full=tx_out_file_full,
                          tx_out_file=tx_out_file),
               message)
    data = dd.set_count_file(data, out_file)
    return [[data]]