Example #1
def make_scrnaseq_object(samples):
    """
    load the initial se.rda object using SingleCellExperiment
    """
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(
        dd.get_in_samples(samples, dd.get_combined_counts))
    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
    if not gtf_file:
        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    rda_file = os.path.join(counts_dir, "se.rda")
    if not file_exists(rda_file):
        with file_transaction(rda_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(**locals()))
            rscript = Rscript_cmd()
            try:
                # do.run([rscript, "--vanilla", rcode],
                #        "SingleCellExperiment",
                #        log_error=False)
                rda_file = rcode
            except subprocess.CalledProcessError:
                logger.exception("SingleCellExperiment creation failed")
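Note: the function above (duplicated in Example #4 with a different Rscript flag) renders an R script from a Python template string and then shells out to Rscript. A minimal, self-contained sketch of that render-and-run pattern, with a placeholder template rather than bcbio's actual _script:

import subprocess

# hypothetical stand-in for bcbio's _script template
_sketch_script = """
library(SingleCellExperiment)
counts <- readRDS("{counts_file}")
se <- SingleCellExperiment(assays = list(counts = counts))
save(se, file = "{rda_file}")
"""

def render_and_run(counts_file, rda_file, rcode="se-run.R"):
    # write the filled-in template to disk, then execute it with Rscript
    with open(rcode, "w") as out_handle:
        out_handle.write(_sketch_script.format(counts_file=counts_file,
                                               rda_file=rda_file))
    subprocess.check_call(["Rscript", "--vanilla", rcode])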
Example #2
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    organizes RNA-seq and small-RNAseq samples, converting from BAM and
    trimming reads if necessary
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = dd.get_in_samples(samples, dd.get_trim_reads)
    resources = ["picard"]
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    if needs_trimming:
        resources.append("cutadapt")
    with prun.start(_wres(parallel, resources),
                    samples,
                    config,
                    dirs,
                    "trimming",
                    max_multicore=1) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[
                dirs, config, run_info_yaml,
                [x[0]["description"] for x in samples]
            ]])
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    samples = run_parallel("trim_srna_sample", samples)
                else:
                    samples = run_parallel("trim_sample", samples)
    return samples
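Note: both rnaseq_prep_samples variants (here and Example #6) lean on an _is_smallrnaseq helper; in bcbio this is essentially a one-line check on the analysis name. A sketch, reproduced from memory rather than verbatim:

def _is_smallrnaseq(pipeline):
    # small RNA-seq pipelines are labeled "smallRNA-seq" in the analysis field
    return pipeline.lower() == "smallrna-seq"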
Example #3
def detect_fusions(samples):
    """Run fusion with a standalone tool, specified in config
    as fusion_caller.
    If fusion_mode is True, and no fusion_caller is specified,
    or fusion_caller == 'aligner', it is assumed that gene fusion
    detection was run on the alignment step.
    """
    fusion_mode = dd.get_in_samples(samples, dd.get_fusion_mode)
    if not fusion_mode:
        return samples

    caller = dd.get_in_samples(samples, dd.get_fusion_caller)
    if not caller or caller == 'aligner':
        logger.info("No standalone fusion caller specified in the config.")
        return samples

    STANDALONE_CALLERS = {
        'ericscript': ericscript.run,
    }
    caller_fn = STANDALONE_CALLERS.get(caller)
    if not caller_fn:
        logger.warning("Gene fusion detection with %s is not supported. "
                       "Supported callers:\n%s" %
                       (caller, ', '.join(STANDALONE_CALLERS.keys())))
        return samples

    logger.info("Running gene fusion detection with  %s" % caller)
    return [[caller_fn(s)] for s in dd.sample_data_iterator(samples)]
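Note: the lookup-table dispatch above is a pattern worth isolating. A minimal standalone sketch (with a hypothetical caller function standing in for ericscript.run) shows why unsupported names fall through to the warning branch:

def _ericscript_run(sample):
    # hypothetical stand-in for ericscript.run
    return sample

STANDALONE_CALLERS = {
    'ericscript': _ericscript_run,
}

def dispatch(sample, caller):
    # .get returns None for unknown callers, which the real code reports
    caller_fn = STANDALONE_CALLERS.get(caller)
    if caller_fn is None:
        raise ValueError("unsupported fusion caller: %s" % caller)
    return caller_fn(sample)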
Example #4
def make_scrnaseq_object(samples):
    """
    load the initial se.rda object using SingleCellExperiment
    """
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(dd.get_in_samples(samples, dd.get_combined_counts))
    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
    if not gtf_file:
        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    rda_file = os.path.join(counts_dir, "se.rda")
    if not file_exists(rda_file):
        with file_transaction(rda_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(**locals()))
            rscript = Rscript_cmd()
            try:
                # do.run([rscript, "--no-environ", rcode],
                #        "SingleCellExperiment",
                #        log_error=False)
                rda_file = rcode
            except subprocess.CalledProcessError:
                logger.exception("SingleCellExperiment creation failed")
Example #5
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([
            file_exists(x)
            for x in [gene_tpm_file, tidy_file, transcript_tpm_file, tx2gene]
    ]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(index="id", columns="sample",
                     values="tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(index="id", columns="sample", values="tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
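Note: the tidy-to-wide reshaping above is easier to see on a toy frame. This sketch mirrors the pivot and the gene-level groupby-sum; the column names match the code above, but the values and tx/gene ids are made up:

import pandas as pd

df = pd.DataFrame({"sample": ["s1", "s1", "s2", "s2"],
                   "tpm": [1.0, 2.0, 3.0, 4.0]},
                  index=["tx1", "tx2", "tx1", "tx2"])
df["id"] = df.index
# one row per (transcript, sample) -> one column per sample
wide = df.pivot(index="id", columns="sample", values="tpm")
# transcript -> gene mapping, as gtf.transcript_to_gene would provide
tx2gene = pd.DataFrame({"gene_id": ["g1", "g1"]}, index=["tx1", "tx2"])
gene_tpm = wide.join(tx2gene).groupby("gene_id").sum()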
Example #6
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    organizes RNA-seq and small-RNAseq samples, converting from BAM and
    trimming reads if necessary
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = any([tz.get_in(["algorithm", "trim_reads"], d)
                          for d in dd.sample_data_iterator(samples)])
    resources = ["picard"]
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    if needs_trimming:
        resources.append("atropos")
    with prun.start(_wres(parallel, resources),
                    samples, config, dirs, "trimming",
                    max_multicore=1 if not needs_trimming else None) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                            [x[0]["description"] for x in samples]]])
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    samples = run_parallel("trim_srna_sample", samples)
                else:
                    samples = run_parallel("trim_sample", samples)
    return samples
Example #7
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
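Note: the nested loop that flattens demultiplexed lanes also documents a pipeline convention: each sample travels as a one-element list. An equivalent comprehension, for reference:

samples = [[index] for lane in demultiplexed for index in lane]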
Example #8
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    if not samples:
        logger.error("No samples were found matching the supplied sample barcodes. See "
                     "https://github.com/bcbio/bcbio-nextgen/issues/3428#issuecomment-772609904 "
                     "for how to debug this issue.")
        sys.exit(1)
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
Example #9
def combine_spikein(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
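Note: the grow-a-DataFrame-in-a-loop pattern shared by the combine_* functions (df = rbind([df, new_df])) can be written more idiomatically by collecting the per-sample frames and concatenating once. A sketch under the assumption that the expression parser yields one tidy DataFrame per sample:

import pandas as pd

def combine_counts(parsed_frames):
    # parsed_frames: one tidy DataFrame per sample, as produced by
    # _sailfish_expression_parser in the examples above
    return pd.concat(parsed_frames) if parsed_frames else pd.DataFrame()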
Example #10
def concatenate_sparse_counts(*samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")
    if file_exists(out_file):
        return out_file
    files = [
        dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    descriptions = [
        dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_counts(data, out_file)])
    return newsamples
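Note: SparseMatrix here is bcbio's own wrapper (read/cat/write over MatrixMarket count files), not a scipy type. For orientation only, a rough scipy equivalent of the accumulate-and-write loop, assuming every matrix shares the same row (gene) order:

import scipy.io
import scipy.sparse as sp

def concatenate_mtx(files, out_file):
    # column-concatenate MatrixMarket count files, one column block per sample
    mats = [scipy.io.mmread(fn).tocsc() for fn in files]
    scipy.io.mmwrite(out_file, sp.hstack(mats))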
Example #11
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples),
                                         True)
    if not to_combine:
        return samples

    out_file = os.path.join(work_dir, "sailfish", "combined.sf")
    if not file_exists(out_file):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(out_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_combined(data, out_file)
        updated_samples.append([data])
    return updated_samples
Example #12
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish",
                                       "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish",
                                 "combined.gene.sf.tpm")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(index=None, columns="sample",
                     values="tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(index=None, columns="sample", values="tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
Example #13
def initialize_watcher(samples):
    """
    check to see if cwl_reporting is set for any samples,
    and if so, initialize a WorldWatcher object from a set of samples,
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    ww = WorldWatcher(work_dir,
                      is_on=any([dd.get_cwl_reporting(d[0]) for d in samples]))
    ww.initialize(samples)
    return ww
Example #14
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["aligner", "picard", "samtools"],
                            ensure_mem={"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8}),
                    samples, config, dirs, "alignment",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)

    with prun.start(_wres(parallel, ["dexseq", "express"]), samples, config,
                    dirs, "rnaseqcount-singlethread", max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)

    samples = rnaseq.combine_files(samples)
    with prun.start(_wres(parallel, ["gatk", "vardict"]), samples, config,
                    dirs, "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)

    with prun.start(_wres(parallel, ["samtools", "fastqc", "qualimap",
                                     "kraken", "gatk", "preseq"], ensure_mem={"qualimap": 4}),
                    samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
        with profile.report("bcbioRNAseq loading", dirs):
            tools_on = dd.get_in_samples(samples, dd.get_tools_on)
            bcbiornaseq_on = tools_on and "bcbiornaseq" in tools_on
            if bcbiornaseq_on:
                if len(samples) < 3:
                    logger.warning("bcbioRNASeq needs at least three samples total, skipping.")
                elif len(samples) > 100:
                    logger.warning("Over 100 samples, skipping bcbioRNASeq.")
                else:
                    # reuses the last `sample` from the upload loop above; the
                    # bcbioRNAseq load appears to run once at the project level
                    run_parallel("run_bcbiornaseqload", [sample])
    logger.info("Timing: finished")
    return samples
Example #15
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    samples = run_parallel("run_filter_barcodes", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    return samples
Example #16
def assemble_transcripts(run_parallel, samples):
    """
    assembly strategy rationale implemented as suggested in
    http://www.nature.com/nprot/journal/v7/n3/full/nprot.2012.016.html

    run Cufflinks without a reference GTF for each individual sample
    merge the assemblies with Cuffmerge using a reference GTF
    """
    assembler = dd.get_in_samples(samples, dd.get_transcript_assembler)
    if assembler:
        if "cufflinks" in assembler:
            samples = run_parallel("cufflinks_assemble", samples)
        if "stringtie" in assembler:
            samples = run_parallel("run_stringtie_expression", samples)
        samples = run_parallel("cufflinks_merge", [samples])
    return samples
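Note: transcript_assembler can come back from the config as either a single string or a list, and the `in` checks above handle both, since `in` does substring matching on strings and membership testing on lists:

assert "cufflinks" in "cufflinks"                 # substring match
assert "cufflinks" in ["cufflinks", "stringtie"]  # list membership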
Example #17
def _vardict_options_from_config(items,
                                 config,
                                 out_file,
                                 target=None,
                                 is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    # use tools_on: vardict_sv to turn sv calling in vardict on (experimental)
    tools_on = dd.get_in_samples(items, dd.get_tools_on)
    vardict_sv_on = tools_on and "vardict_sv" in tools_on
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    # turn off structural variants
    if ((vardict_cl and version and
         ((vardict_cl == "vardict-java"
           and LooseVersion(version) >= LooseVersion("1.5.5")) or
          (vardict_cl == "vardict"))) and not vardict_sv_on):
        opts += ["--nosv"]
    if (vardict_cl and version
            and (vardict_cl == "vardict-java"
                 and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    _add_freq_options(config, opts, var2vcf_opts)
    return " ".join(opts), " ".join(var2vcf_opts)
Example #18
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")

    files = [dd.get_histogram_counts(data) for data in
            dd.sample_data_iterator(samples)
            if dd.get_histogram_counts(data)]
    files = " ".join(files)
    cmd = "cat {files} > {out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concay cb histograms."
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples
Example #19
def concatenate_sparse_matrices(samples, deduped=True):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")
    if file_exists(out_file):
        if deduped:
            newsamples = []
            for data in dd.sample_data_iterator(samples):
                newsamples.append([dd.set_combined_counts(data, out_file)])
            return newsamples
        else:
            return samples
    files = [
        dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    if not deduped:
        files = [os.path.splitext(x)[0] + "-dupes.mtx" for x in files]

    files = [fn for fn in files if file_exists(fn)]
    descriptions = [
        dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
        if dd.get_count_file(data)
    ]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    if deduped:
        for data in dd.sample_data_iterator(samples):
            newsamples.append([dd.set_combined_counts(data, out_file)])
        return newsamples
    return samples
Example #20
def assemble_transcripts(run_parallel, samples):
    """
    assembly strategy rationale implemented as suggested in
    http://www.nature.com/nprot/journal/v7/n3/full/nprot.2012.016.html

    run Cufflinks without a reference GTF for each individual sample
    merge the assemblies with Cuffmerge using a reference GTF
    """
    assembler = dd.get_in_samples(samples, dd.get_transcript_assembler)
    data = samples[0][0]
    if assembler:
        if "cufflinks" in assembler:
            samples = run_parallel("cufflinks_assemble", samples)
        if "stringtie" in assembler:
            samples = run_parallel("run_stringtie_expression", samples)
        if "stringtie" in assembler and stringtie.supports_merge(data):
            samples = run_parallel("stringtie_merge", [samples])
        else:
            samples = run_parallel("cufflinks_merge", [samples])
    return samples
Example #21
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")

    files = [
        dd.get_histogram_counts(data)
        for data in dd.sample_data_iterator(samples)
        if dd.get_histogram_counts(data)
    ]
    files = " ".join(files)
    cmd = "cat {files} > {out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concat cellular barcode histograms: %s." % files
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples