Example #1
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak/broadPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                    break
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
                    break
        elif dd.get_chip_method(data) == "atac":
            for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    for data in dd.sample_data_iterator(samples):
        new_samples.append([
            tz.assoc_in(data, ("peaks_files", "consensus"),
                        {"main": consensusfile})
        ])
    return new_samples
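
The toolz calls above do all the nested-dict plumbing. A minimal standalone sketch (toy data, not part of bcbio) of how tz.get_in and tz.assoc_in behave:

import toolz as tz

data = {"peaks_files": {"macs2": ["sample1_peaks.narrowPeak"]}}
# get_in walks a key path and falls back to the default when it is missing
print(tz.get_in(("peaks_files", "macs2"), data, []))        # the peak list
print(tz.get_in(("peaks_files", "NF", "macs2"), data, []))  # [] (no NF key)
# assoc_in returns a new dict with a value set at a nested path
updated = tz.assoc_in(data, ("peaks_files", "consensus"), {"main": "consensus.bed"})
print(updated["peaks_files"]["consensus"])                  # {'main': 'consensus.bed'}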
Example #2
def combine_spikein(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
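
The combining step is plain long-format pandas; rbind is row concatenation. A self-contained sketch with toy frames standing in for the sailfish parser output:

import pandas as pd

# two per-sample frames indexed by transcript id, as the parser is assumed to return
a = pd.DataFrame({"sample": "s1", "tpm": [1.0, 2.0]}, index=["tx1", "tx2"])
b = pd.DataFrame({"sample": "s2", "tpm": [3.0, 4.0]}, index=["tx1", "tx2"])
df = pd.concat([a, b])          # rbind: stack rows, keeping the index
df["id"] = df.index
# duplicated annotation entries collapse to one row per (id, sample) pair
df = df.drop_duplicates(["id", "sample"])
df.to_csv("spikein.sf", sep="\t", index_label="name")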
Example #3
def create_ataqv_report(samples):
    """
    make the ataqv report from a set of ATAC-seq samples
    """
    data = samples[0][0]
    new_samples = []
    reportdir = os.path.join(dd.get_work_dir(data), "qc", "ataqv")
    sentinel = os.path.join(reportdir, "index.html")
    if utils.file_exists(sentinel):
        ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
        new_data = []
        for data in dd.sample_data_iterator(samples):
            data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
            new_data.append(data)
        return dd.get_samples_from_datalist(new_data)
    mkarv = config_utils.get_program("mkarv", dd.get_config(data))
    ataqv_files = []
    for data in dd.sample_data_iterator(samples):
        qc = dd.get_summary_qc(data)
        ataqv_file = tz.get_in(("ataqv", "base"), qc, None)
        if ataqv_file and utils.file_exists(ataqv_file):
            ataqv_files.append(ataqv_file)
    if not ataqv_files:
        return samples
    ataqv_json_file_string = " ".join(ataqv_files)
    with file_transaction(reportdir) as txreportdir:
        cmd = f"{mkarv} {txreportdir} {ataqv_json_file_string}"
        message = f"Creating ataqv report from {ataqv_json_file_string}."
        do.run(cmd, message)
    new_data = []
    ataqv_output = {"base": sentinel, "secondary": get_ataqv_report_files(reportdir)}
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ["ataqv_report"], ataqv_output)
        new_data.append(data)
    return dd.get_samples_from_datalist(new_data)
Example #4
def concatenate_sparse_counts(*samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")
    if file_exists(out_file):
        return out_file
    files = [
        dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    descriptions = [
        dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_counts(data, out_file)])
    return newsamples
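
SparseMatrix is a bcbio helper; a rough scipy-only equivalent of the read/cat/write loop might look like this (hypothetical .mtx filenames, and all matrices are assumed to share the same row dimension):

import scipy.io
import scipy.sparse as sp

matrices = [scipy.io.mmread(fn).tocsr() for fn in ["s1.mtx", "s2.mtx"]]
combined = sp.hstack(matrices)            # one block of columns per sample
scipy.io.mmwrite("tagcounts.mtx", combined)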
Example #5
def combine_spikein(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(
                sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
Example #6
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts"), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
Example #7
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    out_file = os.path.join(work_dir, "sailfish", "combined.sf")
    if not file_exists(out_file):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(out_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_combined(data, out_file)
        updated_samples.append([data])
    return updated_samples
Example #8
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([
            file_exists(x)
            for x in [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]
    ]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
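
A compact pandas illustration of the pivot-and-aggregate step above, with a toy mapping standing in for gtf.transcript_to_gene (newer pandas requires keyword arguments to pivot, so the keyword form is used here):

import pandas as pd

df = pd.DataFrame({"id": ["tx1", "tx2", "tx1", "tx2"],
                   "sample": ["s1", "s1", "s2", "s2"],
                   "tpm": [1.0, 2.0, 3.0, 4.0]})
pivot = df.pivot(index="id", columns="sample", values="tpm")
tx2gene = {"tx1": "geneA", "tx2": "geneA"}      # toy transcript->gene map
tdf = pd.DataFrame.from_dict(tx2gene, orient="index", columns=["gene_id"])
# join the gene ids onto the transcript rows, then sum TPMs per gene
gene_tpm = pivot.join(tdf).groupby("gene_id").sum()
print(gene_tpm)   # geneA: s1=3.0, s2=7.0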
Example #9
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #10
def cufflinks_merge(*samples):
    to_merge = filter_missing([dd.get_assembled_gtf(data) for data in
                            dd.sample_data_iterator(samples)])
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #11
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #12
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
Example #13
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    ref_file = dd.get_ref_file(data)
    out_file = os.path.join(dd.get_work_dir(data, "."), "variation", "combined.vcf")
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file, out_file)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, out_file)
            updated_samples.append([data])
        return updated_samples
    return samples
Example #14
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    ref_file = dd.get_ref_file(data)
    out_file = os.path.join(dd.get_work_dir(data, "."), "variation", "combined.vcf")
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file, out_file)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, out_file)
            updated_samples.append([data])
        return updated_samples
    return samples
Example #15
def cufflinks_merge(*samples):
    to_merge = filter_missing([dd.get_assembled_gtf(data) for data in
                            dd.sample_data_iterator(samples)])
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #16
def detect_fusions(samples):
    """Run fusion with a standalone tool, specified in config
    as fusion_caller.
    If fusion_mode is True, and no fusion_caller is specified,
    or fusion_caller == 'aligner', it is assumed that gene fusion
    detection was run on the alignment step.
    """
    fusion_mode = dd.get_in_samples(samples, dd.get_fusion_mode)
    if not fusion_mode:
        return samples

    caller = dd.get_in_samples(samples, dd.get_fusion_caller)
    if not caller or caller == 'aligner':
        logger.info("No standalone fusion caller specified in the config.")
        return samples

    STANDALONE_CALLERS = {
        'ericscript': ericscript.run,
    }
    caller_fn = STANDALONE_CALLERS.get(caller)
    if not caller_fn:
        logger.warning("Gene fusion detection with %s is not supported."
                       "Supported callers:\n%s" %
                       ', '.join(STANDALONE_CALLERS.keys()))
        return samples

    logger.info("Running gene fusion detection with  %s" % caller)
    return [[caller_fn(s)] for s in dd.sample_data_iterator(samples)]
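
The caller lookup is a standard dispatch-table idiom. In isolation, with a toy callable and a hypothetical dispatch helper:

STANDALONE_CALLERS = {"ericscript": lambda sample: sample}

def dispatch(caller, sample):
    caller_fn = STANDALONE_CALLERS.get(caller)
    if caller_fn is None:
        # unknown names fail loudly instead of KeyError-ing deep in the run
        raise ValueError("unsupported caller %r; supported: %s"
                         % (caller, ", ".join(STANDALONE_CALLERS)))
    return caller_fn(sample)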
Example #17
def scrnaseq_concatenate_metadata(samples):
    """
    Create a file with the same dimensions as mtx.colnames, containing
    the metadata and sample name, to help in the creation of the SC
    object.
    """
    barcodes = {}
    counts = ""
    metadata = {}
    for sample in dd.sample_data_iterator(samples):
        with open(dd.get_sample_barcodes(sample)) as inh:
            for line in inh:
                cols = line.strip().split(",")
                if len(cols) == 1:
                    # Assign sample name in case of missing in barcodes
                    cols.append("NaN")
                barcodes[cols[0]] = cols[1:]

        counts = dd.get_combined_counts(sample)
        meta = map(str, list(sample["metadata"].values()))
        meta_cols = list(sample["metadata"].keys())
        meta = ["NaN" if not v else v for v in meta]
        metadata[dd.get_sample_name(sample)] = meta

    metadata_fn = counts + ".metadata"
    if not file_exists(metadata_fn):
        with open(metadata_fn, 'w') as outh:
            outh.write(",".join(["sample"] + meta_cols) + '\n')
            with open(counts + ".colnames") as inh:
                for line in inh:
                    sample = line.split(":")[0]
                    barcode = sample.split("-")[1]
                    outh.write(",".join(barcodes[barcode] + metadata[sample]) + '\n')
    return samples
Example #18
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    organizes RNA-seq and small RNA-seq samples, converting from BAM and
    trimming reads as necessary
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = any([tz.get_in(["algorithm", "trim_reads"], d) for d in dd.sample_data_iterator(samples)])
    resources = ["picard"]
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    if needs_trimming:
        resources.append("atropos")
    with prun.start(_wres(parallel, resources),
                    samples, config, dirs, "trimming",
                    max_multicore=1 if not needs_trimming else None) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                            [x[0]["description"] for x in samples]]])
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    samples = run_parallel("trim_srna_sample", samples)
                else:
                    samples = run_parallel("trim_sample", samples)
    return samples
Example #19
def load_summarizedexperiment(samples):
    """ create summarizedexperiment rds object
    fails with n_samples = 1 """
    # using r36 (4.0) - will eventually drop R3.5
    rcmd = Rscript_cmd("r36")
    se_script = os.path.join(os.path.dirname(__file__), os.pardir, "scripts",
                             "R", "bcbio2se.R")
    data = samples[0][0]
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "salmon")
    summarized_experiment = os.path.join(out_dir, "bcbio-se.rds")
    if not file_exists(summarized_experiment):
        with file_transaction(summarized_experiment) as tx_out_file:
            cmd = f"{rcmd} --vanilla {se_script} {work_dir} {tx_out_file}"
            message = f"Loading SummarizedExperiment."
            try:
                do.run(cmd, message)
            except Exception:
                logger.error("SE creation failed")
    if file_exists(summarized_experiment):
        try:
            se_qc_report = generate_se_qc_report(work_dir)
        except Exception:
            se_qc_report = None
            logger.error("SE QC failed")
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_summarized_experiment(data, summarized_experiment)
            updated_samples.append([data])
        return updated_samples
    else:
        return samples
Example #20
def run_salmon_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        salmon_dir = os.path.join(work_dir, "salmon")
        gtf_file = dd.get_transcriptome_gtf(data, dd.get_gtf_file(data))
        salmon_index(gtf_file, data, salmon_dir)
    return samples
Example #21
def scrnaseq_concatenate_metadata(samples):
    """
    Create a file with the same dimensions as mtx.colnames, containing
    the metadata and sample name, to help in the creation of the SC
    object.
    """
    barcodes = {}
    counts = ""
    metadata = {}
    for sample in dd.sample_data_iterator(samples):
        with open(dd.get_sample_barcodes(sample)) as inh:
            for line in inh:
                cols = line.strip().split(",")
                if len(cols) == 1:
                    # Assign sample name in case of missing in barcodes
                    cols.append("NaN")
                barcodes[(dd.get_sample_name(sample), cols[0])] = cols[1:]

        counts = dd.get_combined_counts(sample)
        meta = map(str, list(sample["metadata"].values()))
        meta_cols = list(sample["metadata"].keys())
        meta = ["NaN" if not v else v for v in meta]
        metadata[dd.get_sample_name(sample)] = meta

    metadata_fn = counts + ".metadata"
    if not file_exists(metadata_fn):
        with open(metadata_fn, 'w') as outh:
            outh.write(",".join(["sample"] + meta_cols) + '\n')
            with open(counts + ".colnames") as inh:
                for line in inh:
                    sample = line.split(":")[0]
                    barcode = sample.split("-")[1]
                    outh.write(",".join(barcodes[(sample, barcode)] + metadata[sample]) + '\n')
    return samples
Example #22
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    organizes RNA-seq and small RNA-seq samples, converting from BAM and
    trimming reads as necessary
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = any([
        tz.get_in(["algorithm", "trim_reads"], d)
        for d in dd.sample_data_iterator(samples)
    ])
    resources = ["picard"]
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    if needs_trimming:
        resources.append("atropos")
    with prun.start(
            _wres(parallel, resources),
            samples,
            config,
            dirs,
            "trimming",
            max_multicore=1 if not needs_trimming else None) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[
                dirs, config, run_info_yaml,
                [x[0]["description"] for x in samples]
            ]])
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    samples = run_parallel("trim_srna_sample", samples)
                else:
                    samples = run_parallel("trim_sample", samples)
    return samples
Example #23
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
Example #24
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc into a single table with
    all samples
    """
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    dexseq_gff = dd.get_dexseq_gff(samples[0][0])

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files:
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files:
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
Example #25
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(
                    f"Using peaks from full fraction since {dd.get_work_bam(data)} is single-ended."
                )
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data,
                                    []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"),
                           {"main": consensusfile})
        new_samples.append([data])
    return new_samples
Example #26
def get_samples_by_batch(samples):
    batch_samples = defaultdict(list)
    for data in dd.sample_data_iterator(samples):
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if isinstance(batch, list):
            batch = tuple(batch)
        batch_samples[batch].append(data)
    return batch_samples
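
The same batching pattern on plain dicts, showing both fallbacks: samples without a batch group under their own name, and list-valued batches become hashable tuples:

from collections import defaultdict

datas = [{"name": "a", "batch": "b1"},
         {"name": "b", "batch": "b1"},
         {"name": "c", "batch": None},
         {"name": "d", "batch": ["b1", "b2"]}]
batch_samples = defaultdict(list)
for d in datas:
    batch = d["batch"] or d["name"]
    if isinstance(batch, list):      # lists cannot be dict keys
        batch = tuple(batch)
    batch_samples[batch].append(d["name"])
print(dict(batch_samples))  # {'b1': ['a', 'b'], 'c': ['c'], ('b1', 'b2'): ['d']}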
Example #27
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller:
        return samples
    if "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
        vrn_file = vcfanno.run_vcfanno(out_file, "rnaedit", data)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, vrn_file)
            updated_samples.append([data])
        return updated_samples
    return samples
Example #28
def run_salmon_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        salmon_dir = os.path.join(work_dir, "salmon")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        salmon_index(gtf_file, fasta_file, data, salmon_dir)
    return samples
Example #29
def run_salmon_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        salmon_dir = os.path.join(work_dir, "salmon")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        salmon_index(gtf_file, fasta_file, data, salmon_dir)
    return samples
Example #30
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")

    files = [dd.get_histogram_counts(data) for data in
            dd.sample_data_iterator(samples)
            if dd.get_histogram_counts(data)]
    files = " ".join(files)
    cmd = "cat {files} > {out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concay cb histograms."
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples
Example #31
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller:
        return samples
    if "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
        vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, vrn_file)
            updated_samples.append([data])
        return updated_samples
    return samples
Example #32
def run_rapmap_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        rapmap_dir = os.path.join(work_dir, "rapmap")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        rapmap_index(gtf_file, fasta_file, "quasi", data, rapmap_dir)
    return samples
Example #33
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish",
                                 "combined.gene.sf.tpm")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(None, "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(None, "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
Example #34
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish",
                                 "combined.gene.sf.tpm")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(None, "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(None, "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
Example #35
def run_rapmap_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        rapmap_dir = os.path.join(work_dir, "rapmap")
        gtf_file = dd.get_gtf_file(data)
        fasta_file = dd.get_ref_file(data)
        assert file_exists(
            fasta_file), "%s was not found, exiting." % fasta_file
        rapmap_index(gtf_file, fasta_file, "quasi", data, rapmap_dir)
    return samples
Example #36
def run_kallisto_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        kallisto_dir = os.path.join(work_dir, "kallisto")
        gtf_file = dd.get_gtf_file(data)
        assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    return samples
Example #37
def run_kallisto_index(*samples):
    for data in dd.sample_data_iterator(samples):
        work_dir = dd.get_work_dir(data)
        kallisto_dir = os.path.join(work_dir, "kallisto")
        gtf_file = dd.get_gtf_file(data)
        assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
        fasta_file = dd.get_ref_file(data)
        assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
        kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    return samples
Example #38
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results"""
    if not combined:
        return None
    to_combine = [
        dd.get_express_counts(x) for x in dd.sample_data_iterator(samples)
        if dd.get_express_counts(x)
    ]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined),
                                        "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(
        gtf_file, isoform_to_gene_file,
        next(dd.sample_data_iterator(samples)))
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(
            combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(
            to_combine, eff_counts_combined_file, ext=".counts")
        to_combine = [
            dd.get_express_tpm(x) for x in dd.sample_data_iterator(samples)
            if dd.get_express_tpm(x)
        ]
        tpm_counts_combined_file = os.path.splitext(
            combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(
            to_combine, tpm_counts_combined_file)
        to_combine = [
            dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
            if dd.get_express_fpkm(x)
        ]
        fpkm_counts_combined_file = os.path.splitext(
            combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(
            to_combine, fpkm_counts_combined_file, ext=".fpkm")
        return {
            'counts': eff_counts_combined,
            'tpm': tpm_counts_combined,
            'fpkm': fpkm_counts_combined,
            'isoform_to_gene': isoform_to_gene_file
        }
    return {}
Example #39
def concatenate_sparse_matrices(samples, deduped=True):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")
    if file_exists(out_file):
        if deduped:
            newsamples = []
            for data in dd.sample_data_iterator(samples):
                newsamples.append([dd.set_combined_counts(data, out_file)])
            return newsamples
        else:
            return samples
    files = [
        dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    if not deduped:
        files = [os.path.splitext(x)[0] + "-dupes.mtx" for x in files]

    files = [fn for fn in files if file_exists(fn)]
    descriptions = [
        dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    if deduped:
        for data in dd.sample_data_iterator(samples):
            newsamples.append([dd.set_combined_counts(data, out_file)])
        return newsamples
    return samples
Example #40
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")

    files = [
        dd.get_histogram_counts(data)
        for data in dd.sample_data_iterator(samples)
        if dd.get_histogram_counts(data)
    ]
    files = " ".join(files)
    cmd = "cat {files} > {out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concat cellular barcode histograms: %s." % files
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples
Example #41
def run_sailfish_index(*samples):
    fq1, _ = dd.get_input_sequence_files(samples[0][0])
    kmer_size = estimate_kmer_size(fq1)
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in dd.sample_data_iterator(samples)}
    data = samples[0][0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build, kmer_size)
    return samples
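
Why the set of namedtuples works: namedtuples hash by value, so samples that share the same (build, ref, gtf) triple collapse to a single index job. A toy demonstration:

from collections import namedtuple

Build = namedtuple('Build', ['build', 'ref', 'gtf'])
builds = {Build("hg38", "hg38.fa", "hg38.gtf"),
          Build("hg38", "hg38.fa", "hg38.gtf"),   # duplicate, deduplicated
          Build("mm10", "mm10.fa", "mm10.gtf")}
print(len(builds))  # 2 -> only two distinct indices get built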
Example #42
def run_sailfish_index(*samples):
    fq1, _ = dd.get_input_sequence_files(samples[0][0])
    kmer_size = estimate_kmer_size(fq1)
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in dd.sample_data_iterator(samples)}
    data = samples[0][0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build, kmer_size)
    return samples
Example #43
def combine_express(samples, combined):
    """Combine tpm, effective counts and fpkm from express results"""
    to_combine = [dd.get_express_counts(x) for x in
                  dd.sample_data_iterator(samples) if dd.get_express_counts(x)]
    gtf_file = dd.get_gtf_file(samples[0][0])
    isoform_to_gene_file = os.path.join(os.path.dirname(combined), "isoform_to_gene.txt")
    isoform_to_gene_file = express.isoform_to_gene_name(gtf_file, isoform_to_gene_file)
    if len(to_combine) > 0:
        eff_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_counts"
        eff_counts_combined = count.combine_count_files(to_combine, eff_counts_combined_file)
        to_combine = [dd.get_express_tpm(x) for x in
                      dd.sample_data_iterator(samples) if dd.get_express_tpm(x)]
        tpm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_tpm"
        tpm_counts_combined = count.combine_count_files(to_combine, tpm_counts_combined_file)
        to_combine = [dd.get_express_fpkm(x) for x in dd.sample_data_iterator(samples)
                      if dd.get_express_fpkm(x)]
        fpkm_counts_combined_file = os.path.splitext(combined)[0] + ".isoform.express_fpkm"
        fpkm_counts_combined = count.combine_count_files(to_combine, fpkm_counts_combined_file)
        return {'counts': eff_counts_combined, 'tpm': tpm_counts_combined,
                'fpkm': fpkm_counts_combined, 'isoform_to_gene': isoform_to_gene_file}
    return {}
Example #44
def concatenate_sparse_matrices(samples, deduped=True):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")
    if file_exists(out_file):
        if deduped:
            newsamples = []
            for data in dd.sample_data_iterator(samples):
                newsamples.append([dd.set_combined_counts(data, out_file)])
            return newsamples
        else:
            return samples
    files = [dd.get_count_file(data) for data in
            dd.sample_data_iterator(samples)
            if dd.get_count_file(data)]
    if not deduped:
        files = [os.path.splitext(x)[0] + "-dupes.mtx" for x in files]

    files = [fn for fn in files if file_exists(fn)]
    descriptions = [dd.get_sample_name(data) for data in
                    dd.sample_data_iterator(samples) if dd.get_count_file(data)]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    if deduped:
        for data in dd.sample_data_iterator(samples):
            newsamples.append([dd.set_combined_counts(data, out_file)])
        return newsamples
    return samples
Example #45
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts"), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
Example #46
def concatenate_sparse_counts(*samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")
    if file_exists(out_file):
        return out_file
    files = [dd.get_count_file(data) for data in
             dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    descriptions = [dd.get_sample_name(data) for data in
                    dd.sample_data_iterator(samples) if dd.get_count_file(data)]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_counts(data, out_file)])
    return newsamples
Example #47
def estimate_expression(samples, run_parallel):
    samples = run_parallel("generate_transcript_counts", samples)
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files)
    gtf_file = dd.get_gtf_file(samples[0][0], None)
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    samples = run_parallel("run_express", samples)
    express_counts_combined = combine_express(samples, combined)

    samples = run_parallel("run_cufflinks", samples)
    #gene
    fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    #isoform
    fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                      fpkm_isoform_combined_file,
                                                      ".isoform.fpkm")
    dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])
    if to_combine_dexseq:
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
    else:
        dexseq_combined = None

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        updated_samples.append([data])
    return updated_samples
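filter_missing is referenced throughout but not defined in this excerpt. Judging only from how it is called, a minimal version would drop unset entries and paths missing from disk; this is an assumption, not the real helper:

import os

def filter_missing(files):
    # keep only paths that are set and actually exist on disk (assumed)
    return [fn for fn in files if fn and os.path.exists(fn)]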
Example #48
def combine_files(samples):
    """
    after quantitation, combine the counts/FPKM/TPM/etc. into a single
    table across all samples
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(isoform_files,
                                                          fpkm_isoform_combined_file,
                                                          ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data
                                        in samples])
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file, ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data, express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        updated_samples.append([data])
    return updated_samples
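count.combine_count_files is likewise external to this excerpt. Conceptually it outer-joins one count column per sample on the feature id; a pandas sketch of that idea, assuming two-column tab-separated feature-id/count inputs (not the bcbio implementation):

import os
import pandas as pd

def combine_count_files_sketch(count_files, out_file=None, ext=".counts"):
    # derive a sample name from each filename, then outer-join the
    # per-sample count columns on the shared feature id index
    if not count_files:
        return None
    if out_file is None:
        out_file = os.path.splitext(count_files[0])[0] + ".combined.counts"
    df = None
    for fn in count_files:
        samplename = os.path.basename(fn).replace(ext, "")
        col = pd.read_csv(fn, sep="\t", header=None, index_col=0,
                          names=["id", samplename])
        df = col if df is None else df.join(col, how="outer")
    df.to_csv(out_file, sep="\t", index_label="id")
    return out_file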
Example #50
def _is_trim_set(samples):
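    """
    return the trim_reads setting from the first sample; assumes the
    setting is uniform across the run
    """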
    for sample in dd.sample_data_iterator(samples):
        return utils.get_in(sample, ["algorithm", "trim_reads"])
    return None