def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." %(fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
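The aligner examples above call helpers such as _get_rg_flags(names) that are not shown on this page. Below is a minimal sketch of what such a helper could look like, assuming names is a dict carrying read-group metadata (the 'rg', 'sample', 'pl', 'pu' and 'lb' keys are an assumption, not taken from these examples); the --rg-id and --rg flags themselves are standard hisat2 options.

def _get_rg_flags(names):
    # Hypothetical sketch, not the bcbio implementation: build hisat2
    # read-group flags from a metadata dict. Assumes keys like 'rg',
    # 'sample', 'pl', 'pu' and 'lb'; adapt to whatever the caller passes.
    flags = ["--rg-id", str(names.get("rg", "rg1"))]
    for tag, key in [("SM", "sample"), ("PL", "pl"), ("PU", "pu"), ("LB", "lb")]:
        value = names.get(key)
        if value:
            # hisat2 accepts repeated --rg TAG:VALUE pairs for the @RG header
            flags += ["--rg", "%s:%s" % (tag, value)]
    return " ".join(flags)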
Example #2
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = (
        "{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
        "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks/stringtie can use
    if dd.get_transcript_assembler(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
Example #3
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None
                                      or not file_exists(final_file)):
        cmd = (
            "{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
            "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        novel_splicesite_file = os.path.join(
            align_dir,
            "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file,
                                pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data
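get_known_splicesites_file is also not shown on this page. A plausible sketch follows, assuming the file is generated with HISAT2's bundled hisat2_extract_splice_sites.py script and stored next to the annotation GTF; both the location and the file name are assumptions, and the sketch relies on the same module-level imports as the surrounding examples.

def get_known_splicesites_file(align_dir, data):
    # Hypothetical sketch, not the bcbio implementation: derive the known
    # splice sites file from the annotation GTF using the script that ships
    # with HISAT2.
    gtf_file = dd.get_gtf_file(data)
    splicesites = os.path.join(os.path.dirname(gtf_file),
                               "ref-transcripts-splicesites.txt")
    if not file_exists(splicesites):
        with file_transaction(splicesites) as tx_out_file:
            cmd = "hisat2_extract_splice_sites.py {gtf_file} > {tx_out_file}"
            do.run(cmd.format(**locals()), "Preparing known splice sites for hisat2")
    return splicesites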
Example #4
def _maybe_add_sailfish_files(algorithm, sample, out):
    analysis = dd.get_analysis(sample)
    if dd.get_sailfish_dir(sample) and analysis != "fastrna-seq":
        out.append({"path": dd.get_sailfish_dir(sample),
                    "type": "directory",
                    "ext": "sailfish"})
    return out
Example #5
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    readlength = fastq.estimate_read_length(fq1)
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" %
                kmersize)
    kmersize = kmersize if dd.get_analysis(data).lower() != "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
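The read-length trimming above exists because salmon requires an odd k-mer size no larger than 31. _index_spikein is not shown on this page; a minimal sketch of what it might do with standard salmon index flags follows (the directory layout and the info.json existence check are assumptions).

def _index_spikein(fasta_file, salmon_dir, data, kmer=31):
    # Hypothetical sketch, not the bcbio implementation: build a salmon index
    # over the spike-in FASTA with the k-mer size chosen by the caller.
    salmon = config_utils.get_program("salmon", data)
    out_dir = os.path.join(salmon_dir, "index")
    if not os.path.exists(os.path.join(out_dir, "info.json")):
        cmd = "{salmon} index -k {kmer} -t {fasta_file} -i {out_dir}"
        do.run(cmd.format(**locals()), "Indexing %s with salmon." % fasta_file)
    return out_dir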
Example #6
def _use_spark(num_cores, gatk_type, items, opts):
    data = items[0]
    use_spark = False
    if dd.get_analysis(data).lower() != "rna-seq":
        use_spark = (len(items) == 1 and num_cores > 1
                     and gatk_type == "gatk4") or "--spark-master" in opts
    return use_spark
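For context, a hedged illustration of how the _use_spark decision could feed a GATK4 command line; HaplotypeCallerSpark and --spark-master local[N] are real GATK4 options, but this wiring is an assumption rather than the surrounding code.

def _pick_gatk_tool(num_cores, gatk_type, items, opts):
    # Hypothetical illustration only: choose the Spark variant of the tool
    # when _use_spark() says multicore Spark execution is worthwhile.
    if _use_spark(num_cores, gatk_type, items, opts):
        return ["HaplotypeCallerSpark", "--spark-master", "local[%s]" % num_cores]
    return ["HaplotypeCaller"]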
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data,
                               ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data),
                                                orig_items)
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                           data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(
            data[vrn_key], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}), data,
            orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data,
                                                    orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
Example #8
def _maybe_add_sailfish_files(algorithm, sample, out):
    analysis = dd.get_analysis(sample)
    sailfish_dir = os.path.join(dd.get_work_dir(sample), "sailfish",
                                dd.get_sample_name(sample), "quant")
    if os.path.exists(sailfish_dir):
        out.append({"path": dd.get_sailfish_dir(sample),
                    "type": "directory",
                    "ext": "sailfish"})
    return out
Example #9
def _maybe_add_sailfish_files(algorithm, sample, out):
    analysis = dd.get_analysis(sample)
    sailfish_dir = os.path.join(dd.get_work_dir(sample), "sailfish",
                                dd.get_sample_name(sample), "quant")
    if os.path.exists(sailfish_dir):
        out.append({"path": sailfish_dir,
                    "type": "directory",
                    "ext": "sailfish"})
    return out
Example #10
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        novel_splicesite_file = os.path.join(align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data
Example #11
def _default_conf_files(data, retriever):
    conf_files = []
    if dd.get_variantcaller(data) or dd.get_vrn_file(data):
        if annotate_gemini(data, retriever):
            conf_files.append("gemini")
        if _annotate_somatic(data, retriever):
            conf_files.append("somatic")
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            conf_files.append("rnaedit")
    return conf_files
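The returned list lines up with the second argument of vcfanno.run_vcfanno as used in postprocess_variants above; a minimal, assumed illustration of the connection (inferred from that call, not taken from the bcbio code):

# Hypothetical wiring, inferred from the run_vcfanno(dd.get_vrn_file(data),
# ["rnaedit"], data) call in postprocess_variants above:
conf_files = _default_conf_files(data, retriever)
if conf_files:
    ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), conf_files, data)
    if ann_file:
        data["vrn_file"] = ann_file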
Example #12
def _default_conf_files(data, retriever):
    conf_files = []
    if dd.get_variantcaller(data) or dd.get_vrn_file(data):
        if annotate_gemini(data, retriever):
            conf_files.append("gemini")
        if _annotate_somatic(data, retriever):
            conf_files.append("somatic")
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            conf_files.append("rnaedit")
    return conf_files
Example #13
def _check_dedup(data):
    """Check configuration for de-duplication.

    Defaults to no de-duplication for RNA-seq and small RNA, the
    back compatible default. Allow overwriting with explicit
    `mark_duplicates: true` setting.
    """
    if dd.get_analysis(data).lower() in ["rna-seq", "smallrna-seq"]:
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), False)
    else:
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    if dup_param and isinstance(dup_param, six.string_types):
        logger.info("Warning: bcbio no longer supports explicit setting of mark_duplicates algorithm. "
                    "Using best-practice choice based on input data.")
        dup_param = True
    return dup_param
Example #14
def _check_dedup(data):
    """Check configuration for de-duplication.

    Defaults to no de-duplication for RNA-seq and small RNA, the
    back compatible default. Allow overwriting with explicit
    `mark_duplicates: true` setting.
    Also defaults to false for no alignment inputs.
    """
    if dd.get_analysis(data).lower() in ["rna-seq", "smallrna-seq"] or not dd.get_aligner(data):
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), False)
    else:
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    if dup_param and isinstance(dup_param, six.string_types):
        logger.info("Warning: bcbio no longer support explicit setting of mark_duplicate algorithm. "
                    "Using best-practice choice based on input data.")
        dup_param = True
    return dup_param
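As the docstring notes, an explicit mark_duplicates setting under config/algorithm overrides the analysis-based default; a small illustration of the lookup _check_dedup performs (the dict shape is implied by the get_in calls above):

# Explicit override wins regardless of analysis type:
data_override = {"config": {"algorithm": {"mark_duplicates": True}}}
utils.get_in(data_override, ("config", "algorithm", "mark_duplicates"), False)  # -> True
# Without the key, the default passed to get_in (False for RNA-seq /
# small RNA, True otherwise) is returned instead.
utils.get_in({}, ("config", "algorithm", "mark_duplicates"), False)  # -> False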
Example #15
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    kmer = 31 if not dd.get_analysis(data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
Example #16
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    kmer = 31 if not dd.get_analysis(data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
def variant_filtration(call_file, ref_file, vrn_files, data, items):
    """Filter variant calls using Variant Quality Score Recalibration.

    Newer GATK with Haplotype calling has combined SNP/indel filtering.
    """
    caller = data["config"]["algorithm"].get("variantcaller")
    if "gvcf" not in dd.get_tools_on(data):
        call_file = ploidy.filter_vcf_by_sex(call_file, items)
    if caller in ["freebayes"]:
        return vfilter.freebayes(call_file, ref_file, vrn_files, data)
    elif caller in ["platypus"]:
        return vfilter.platypus(call_file, data)
    elif caller in ["samtools"]:
        return vfilter.samtools(call_file, data)
    elif caller in ["gatk", "gatk-haplotype", "haplotyper"]:
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            from bcbio.rnaseq import variation as rnaseq_variation
            return rnaseq_variation.gatk_filter_rnaseq(call_file, data)
        else:
            return gatkfilter.run(call_file, ref_file, vrn_files, data)
    # no additional filtration for callers that filter as part of call process
    else:
        return call_file
Example #18
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    readlength = fastq.estimate_read_length(fq1)
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    kmersize = kmersize if not dd.get_analysis(data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data