Example #1
0
def tagcount(data):
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_file = os.path.join(sample_dir, dd.get_sample_name(data) + ".mtx")
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.splitext(gtf_file)[0] + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} tagcount {positional} --cb_cutoff {cutoff} --sparse "
           "{gene_map_flag}"
           "--cb_histogram {cb_histogram} {bam} {tx_out_file}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
Example #2
0
def create_combined_tx2gene(data):
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    items = disambiguate.split([data])
    tx2gene_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_transcriptome_gtf(odata)
        if not gtf_file:
            gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir,
                                dd.get_genome_build(odata) + "-tx2gene.csv")
        if file_exists(out_file):
            tx2gene_files.append(out_file)
        else:
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
            tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file

    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
Example #3
0
def tagcount(data):
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        gene_map_file = os.path.join(
            dd.get_work_dir(data), "annotation",
            os.path.basename(os.path.splitext(gtf_file)[0]) + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [
        umi_matrix_file, umi_matrix_file + ".rownames",
        umi_matrix_file + ".colnames"
    ]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
        do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
Example #4
0
def tagcount(data):
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file  = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        gene_map_file = os.path.join(dd.get_work_dir(data), "annotation",
                                     os.path.splitext(gtf_file)[0] + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
        do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
Example #5
0
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([
            file_exists(x)
            for x in [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]
    ]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
Example #6
0
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as  tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as  tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
Example #7
0
def create_combined_tx2gene(data):
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    items = disambiguate.split([data])
    tx2gene_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        if file_exists(out_file):
            tx2gene_files.append(out_file)
        else:
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
            tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file

    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
Example #8
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation",
                                "tx2gene.csv")
    if gtf_file:
        tx2gene_file = tx2genefile(gtf_file, tx2gene_file, tsv=False)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files,
                                                  fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(
        combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing(
        [dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing(
        [dd.get_dexseq_counts(data[0]) for data in samples])

    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data,
                                         express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
Example #9
0
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    data = samples[0][0]
    gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = tx2genefile(gtf_file, tx2gene_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data
                                        in samples])

    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples