コード例 #1
0
def should_run_fusion(with_caller, config):
    fusion_mode = dd.get_fusion_mode(config) or \
        utils.get_in(config, ("algorithm", "fusion_mode"), False)
    fusion_caller = dd.get_fusion_caller(config) or \
        utils.get_in(config, ("algorithm", "fusion_caller"), None)

    return fusion_mode and fusion_caller in (None, with_caller)
コード例 #2
0
def detect_fusions(data):
    data = to_single_data(data)
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning(
            "``fusion_mode`` is deprecated in favor of turning on "
            "callers with ``fusion_caller``. It will run pizzly and "
            "oncofuse for now, but will eventually have support "
            "dropped.")
    fusion_caller = dd.get_fusion_caller(data, [])
    if "oncofuse" in fusion_caller:
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in fusion_caller:
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
            data["fusion"] = {
                "fasta":
                os.path.join(pizzly_dir,
                             "%s.fusions.fasta" % dd.get_sample_name(data)),
                "json":
                os.path.join(pizzly_dir, "%s.json" % dd.get_sample_name(data))
            }
    if "ericscript" in fusion_caller:
        ericscript_dir = ericscript.run(data)
    return [[data]]
コード例 #3
0
ファイル: rnaseq.py プロジェクト: chapmanb/bcbio-nextgen
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data) or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"], "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"], "abundance.h5")
    if (os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt"))):
        data["quant"]["fusion"] = os.path.join(data["kallisto_quant"], "fusion.txt")
    else:
        data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]
コード例 #4
0
def should_run_fusion(with_caller, config):
    fusion_mode = dd.get_fusion_mode(config) or \
        utils.get_in(config, ("algorithm", "fusion_mode"), False)
    fusion_caller = dd.get_fusion_caller(config) or \
        utils.get_in(config, ("algorithm", "fusion_caller"), None)

    return fusion_mode and fusion_caller in (None, with_caller)
コード例 #5
0
ファイル: kallisto.py プロジェクト: skanwal/bcbio-nextgen
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data,
                           os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(
        data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning(
            "kallisto was run on single-end data and we set the "
            "estimated fragment length to 200 and the standard "
            "deviation to 25, if these don't reflect your data then "
            "the results may be inaccurate. Use with caution. See "
            "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
            "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
コード例 #6
0
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]),
                                             "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"],
                                            "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"],
                                             "abundance.h5")
    if (os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt"))):
        data["quant"]["fusion"] = os.path.join(data["kallisto_quant"],
                                               "fusion.txt")
    else:
        data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]),
                                             "abundance.h5")
    return [[data]]
コード例 #7
0
ファイル: kallisto.py プロジェクト: biocyberman/bcbio-nextgen
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data and we set the "
          "estimated fragment length to 200 and the standard "
          "deviation to 25, if these don't reflect your data then "
          "the results may be inaccurate. Use with caution. See "
          "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
          "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
コード例 #8
0
ファイル: rnaseq.py プロジェクト: chapmanb/bcbio-nextgen
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False) and not dd.get_fusion_caller(data):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)

    if dd.get_transcriptome_align(data):
        # to create a disambiguated transcriptome file realign with bowtie2
        if dd.get_disambiguate(data):
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data, is_retry=False, output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
            ref_file = dd.get_ref_file(data)
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            ref_file = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    data = spikein.counts_spikein(data)
    return [[data]]
コード例 #9
0
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data)
            or dd.get_fusion_mode(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        samples = run_parallel("run_kallisto_index", [samples])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        samples = run_parallel("run_sailfish_index", [samples])
        samples = run_parallel("run_sailfish", samples)
    # always run salmon
    samples = run_parallel("run_salmon_index", [samples])
    samples = run_parallel("run_salmon_reads", samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
コード例 #10
0
ファイル: rnaseq.py プロジェクト: chapmanb/bcbio-nextgen
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data) or
        dd.get_fusion_mode(data) or
        "pizzly" in dd.get_fusion_caller(data, [])):
        samples = run_parallel("run_kallisto_index", [samples])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        samples = run_parallel("run_sailfish_index", [samples])
        samples = run_parallel("run_sailfish", samples)
    # always run salmon
    samples = run_parallel("run_salmon_index", [samples])
    samples = run_parallel("run_salmon_reads", samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
コード例 #11
0
ファイル: rnaseq.py プロジェクト: chmodawk/bcbio-nextgen
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False) and not dd.get_fusion_caller(data):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)

    if dd.get_transcriptome_align(data):
        # to create a disambiguated transcriptome file realign with bowtie2
        if dd.get_disambiguate(data):
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data, is_retry=False, output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
            ref_file = dd.get_ref_file(data)
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            ref_file = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    data = spikein.counts_spikein(data)
    return [[data]]
コード例 #12
0
ファイル: rnaseq.py プロジェクト: gberriz/bcbio-nextgen
def detect_fusions(data):
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    if "oncofuse" in dd.get_fusion_caller(data, []):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in dd.get_fusion_caller(data, []):
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
    return [[data]]
コード例 #13
0
def test_get_fusion_caller():
    data = {
        'config': {
            'algorithm': {
                'fusion_caller': 'FUSION_CALLER',
            },
        },
    }

    result = dd.get_fusion_caller(data)
    assert result == 'FUSION_CALLER'
コード例 #14
0
ファイル: rnaseq.py プロジェクト: WimSpee/bcbio-nextgen
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]),
                                             "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"],
                                            "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"],
                                             "abundance.h5")
    if (os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt"))):
        data["quant"]["fusion"] = os.path.join(data["kallisto_quant"],
                                               "fusion.txt")
    else:
        data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        if dd.get_quantify_genome_alignments(data):
            if dd.get_aligner(data).lower() != "star":
                if dd.get_genome_build(data) == "hg38":
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Since this is hg38 we will fall "
                        "back to the decoy method")
                    data = to_single_data(salmon.run_salmon_decoy(data)[0])
                else:
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Falling back to the "
                        "transcriptome-only method.")
                    data = to_single_data(salmon.run_salmon_reads(data)[0])
            else:
                data = to_single_data(salmon.run_salmon_bam(data)[0])
        else:
            data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]),
                                             "abundance.h5")
    return [[data]]
コード例 #15
0
def _set_transcriptome_option(options, data, ref_file):
    # prefer transcriptome-index vs a GTF file if available
    transcriptome_index = get_in(data, ("genome_resources", "rnaseq",
                                        "transcriptome_index", "tophat"))
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    fusion_mode = fusion_mode or dd.get_fusion_caller(data)
    if transcriptome_index and file_exists(transcriptome_index) and not fusion_mode:
        options["transcriptome-index"] = os.path.splitext(transcriptome_index)[0]
        return options

    gtf_file = dd.get_gtf_file(data)
    if gtf_file:
        options["GTF"] = gtf_file
        return options

    return options
コード例 #16
0
ファイル: rnaseq.py プロジェクト: WimSpee/bcbio-nextgen
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data)
            or dd.get_fusion_mode(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if dd.get_quantify_genome_alignments(data):
        if dd.get_aligner(data).lower() != "star":
            if dd.get_genome_build(data) == "hg38":
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Since this is hg38 we will fall "
                    "back to the decoy method")
                samples = run_parallel("run_salmon_decoy", samples)
            else:
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Falling back to the "
                    "transcriptome-only method.")
                samples = run_parallel("run_salmon_reads", samples)
        else:
            samples = run_parallel("run_salmon_bam", samples)
    else:
        samples = run_parallel("run_salmon_reads", samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
コード例 #17
0
ファイル: rnaseq.py プロジェクト: chapmanb/bcbio-nextgen
def detect_fusions(data):
    data = to_single_data(data)
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    fusion_caller = dd.get_fusion_caller(data, [])
    if "oncofuse" in fusion_caller:
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in fusion_caller:
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
            data["fusion"] = {"fasta": os.path.join(pizzly_dir, "%s.fusions.fasta" % dd.get_sample_name(data)),
                              "json": os.path.join(pizzly_dir, "%s.json" % dd.get_sample_name(data))}
    if "ericscript" in fusion_caller:
        ericscript_dir = ericscript.run(data)
    return [[data]]
コード例 #18
0
ファイル: star.py プロジェクト: chapmanb/bcbio-nextgen
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f
    fastq_files = (" ".join([_unpack_fastq(fastq_file), _unpack_fastq(pair_file)])
                   if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd BAM_Unsorted {srna_opts} "
               "--limitOutSJcollapsed 2000000 "
               "--outSAMtype BAM Unsorted "
               "--outSAMmapqUnique 60 "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                "--chimScoreSeparation 5 ")
            if "oncofuse" in dd.get_fusion_caller(data):
                cmd += "--chimOutType Junctions "
            else:
                cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
コード例 #19
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        return data

    star_path = config_utils.get_program("STAR", config)

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

    fastq_files = (" ".join([
        _unpack_fastq(fastq_file),
        _unpack_fastq(pair_file)
    ]) if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 ")
            if "oncofuse" in dd.get_fusion_caller(data):
                cmd += "--chimOutType Junctions "
            else:
                cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
コード例 #20
0
ファイル: tophat.py プロジェクト: zhangyupisa/bcbio-nextgen
def _should_run_fusion(data):
    return dd.get_fusion_caller(data)
コード例 #21
0
ファイル: star.py プロジェクト: gberriz/bcbio-nextgen
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data,
                                   ("config", "algorithm", "fusion_mode"),
                                   False)
        fusion_mode = fusion_mode or dd.get_fusion_caller(data)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinBAM ")
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data