Example #1
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
        return data

    bismark = config_utils.get_program("bismark", config)
    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overriden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
    logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")
    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} --unmapped {ref_file} {fastq_file} "
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam[0], final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data
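Note: Example #1 calls calculate_bismark_instances(max_cores, max_mem * max_cores), which is not shown on this page. Below is a minimal sketch of what such a helper might compute, assuming the heuristic from the inline comment (roughly 5 threads and ~12 GB of RAM per Bismark instance on hg38) and that total memory arrives in GB; the caller's actual units depend on convert_to_bytes, so treat the thresholds as illustrative.

def calculate_bismark_instances(cores, total_mem_gb):
    # Hypothetical sketch, not bcbio's actual implementation.
    # Assumes ~5 cores and ~12 GB RAM per Bismark instance (hg38),
    # matching the inline comment in Example #1.
    CORES_PER_INSTANCE = 5
    GB_PER_INSTANCE = 12
    by_cores = max(1, cores // CORES_PER_INSTANCE)
    by_mem = max(1, int(total_mem_gb // GB_PER_INSTANCE))
    # never launch more instances than either budget allows
    return min(by_cores, by_mem)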
Example #2
def _bismark_calling(data):
    workdir = safe_makedir(os.path.join(dd.get_work_dir(data), "cpg"))
    config = data["config"]
    sample = dd.get_sample_name(data)
    index_dir = get_aligner_index("bismark", data)
    biasm_file = _run_meth_extractor(data["work_bam"], sample, workdir,
                                     index_dir, config)
    data["bismark_report"] = _run_report(data["work_bam"], data["bam_report"],
                                         sample, biasm_file, workdir, config)
    splitting_report = biasm_file.replace(".M-bias", "_splitting_report")
    data = dd.update_summary_qc(data, "bismark", base=biasm_file)
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    data = dd.update_summary_qc(data, "bismark", base=splitting_report)
    return data
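Note: _run_meth_extractor in Example #2 is a private wrapper that is not shown here. A plausible sketch of such a wrapper around Bismark's real bismark_methylation_extractor tool follows; the output layout is an assumption, and config_utils, do, and utils are the same bcbio modules used throughout these examples.

import os

def _run_meth_extractor(bam_in, sample, workdir, index_dir, config):
    # Hypothetical sketch: wraps bismark_methylation_extractor and returns
    # the M-bias report consumed by _bismark_calling. Paths are assumed.
    tool = config_utils.get_program("bismark_methylation_extractor", config)
    out_dir = utils.safe_makedir(os.path.join(workdir, sample))
    cmd = ("{tool} --comprehensive --cytosine_report --genome_folder {index_dir} "
           "--bedGraph --gzip -o {out_dir} {bam_in}")
    do.run(cmd.format(**locals()), "bismark methylation extractor for %s" % sample)
    # Bismark names the M-bias report after the input BAM
    return os.path.join(out_dir,
                        os.path.basename(bam_in).replace(".bam", ".M-bias.txt"))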
Example #3
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    artifacts = gatk.collect_artifact_metrics(data)
    if artifacts:
        data = dd.update_summary_qc(data, "picard", artifacts.pop(), artifacts)
        oxog = gatk.collect_oxog_metrics(data)
        data = dd.update_summary_qc(data, "picard", oxog.pop(), oxog)
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": covinfo.raw_callable,
            "sample_callable": covinfo.callable,
            "mapped_stats": readstats.get_cache_file(data)
        }
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        callable_region_bed, nblock_bed = \
            callable.block_regions(dd.get_variant_regions(data), bam_file, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": dd.get_variant_regions(data),
            "sample_callable": dd.get_variant_regions(data)
        }
    return [[data]]
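Note: callable.block_regions in Example #3 splits the genome into callable and nblock (non-callable) regions. Conceptually, the nblock set is the per-chromosome complement of the callable intervals; the snippet below is a minimal illustration of that complement step only, since bcbio's real implementation also pads regions, filters by minimum size, and reads and writes BED files.

def complement_regions(callable_regions, chrom_sizes):
    """callable_regions: {chrom: sorted list of (start, end)}; returns nblock gaps."""
    nblock = {}
    for chrom, size in chrom_sizes.items():
        gaps, pos = [], 0
        for start, end in callable_regions.get(chrom, []):
            if start > pos:
                gaps.append((pos, start))
            pos = max(pos, end)
        if pos < size:
            gaps.append((pos, size))
        nblock[chrom] = gaps
    return nblock

# e.g. complement_regions({"chr1": [(10, 50)]}, {"chr1": 100})
# -> {"chr1": [(0, 10), (50, 100)]}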
Example #4
def run_salmon_bam(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    bam_file = dd.get_transcriptome_bam(data)
    out_file = salmon_quant_bam(bam_file, salmon_dir, gtf_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
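Note: _get_fraglen_file is not shown on this page. A plausible sketch, assuming it points at Salmon's fragment length distribution output under libParams/; the exact path is an assumption, not confirmed bcbio code.

import os

def _get_fraglen_file(salmon_dir):
    # Assumed location of Salmon's observed fragment length distribution
    return os.path.join(salmon_dir, "libParams", "flenDist.txt")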
Example #5
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    config = data["config"]
    input_file = datadict.get_work_bam(data)
    # don't sort even by read names
    # input_file = bam.sort(input_file, config, order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup',
                              sample_name)
    output_dir = utils.safe_makedir(output_dir)

    input_file_name, input_file_extension = os.path.splitext(
        os.path.basename(input_file))
    output_file = os.path.join(
        output_dir, f'{input_file_name}.deduplicated{input_file_extension}')

    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        data["deduplication_report"] = output_file.replace(
            "deduplicated.bam", "deduplication_report.txt")
        data = dd.update_summary_qc(data,
                                    "bismark",
                                    base=data["deduplication_report"])
        return [[data]]

    deduplicate_bismark = config_utils.get_program('deduplicate_bismark',
                                                   config)
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, "remove duplicate alignments")

    data = datadict.set_work_bam(data, output_file)
    data["deduplication_report"] = output_file.replace(
        "deduplicated.bam", "deduplication_report.txt")
    data = dd.update_summary_qc(data,
                                "bismark",
                                base=data["deduplication_report"])
    return [[data]]
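Note: for intuition, position-based deduplication of the kind deduplicate_bismark performs can be sketched in a few lines of pysam. This is a conceptual illustration only; the real tool also understands paired-end reads and Bismark-specific tags.

import pysam

def dedup_by_position(in_bam, out_bam):
    # Keep the first alignment seen at each (chromosome, start, strand)
    # and drop the rest.
    seen = set()
    with pysam.AlignmentFile(in_bam, "rb") as bam_in, \
         pysam.AlignmentFile(out_bam, "wb", template=bam_in) as bam_out:
        for read in bam_in:
            key = (read.reference_name, read.reference_start, read.is_reverse)
            if key not in seen:
                seen.add(key)
                bam_out.write(read)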
Example #6
def run_salmon_bam(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    bam_file = dd.get_transcriptome_bam(data)
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_bam(bam_file, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
Example #7
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
Example #8
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        out_log_file = os.path.join(align_dir,
                                    dd.get_lane(data) + "Log.final.out")
        data = dd.update_summary_qc(data, "star", base=out_log_file)
        return data

    star_path = config_utils.get_program("STAR", config)

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

    fastq_files = (" ".join([
        _unpack_fastq(fastq_file),
        _unpack_fastq(pair_file)
    ]) if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_transcriptome_gtf(data)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    if index_has_alts(ref_file):
        logger.error(
            "STAR is being run on an index with ALTs which STAR is not "
            "designed for. Please remake your STAR index or use an ALT-aware "
            "aligner like hisat2")
        sys.exit(1)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_1pass_dir = tx_align_dir + "1pass"
        tx_star_dirnames = _get_star_dirnames(tx_1pass_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_1pass_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            if "arriba" in dd.get_fusion_caller(data):
                cmd += (
                    "--chimSegmentMin 10 --chimOutType WithinBAM "
                    "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
                    "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
                    "--alignSJstitchMismatchNmax 5 -1 5 5 "
                    "--chimSegmentReadGapMax 3 "
                    "--peOverlapNbasesMin 10 "
                    "--alignSplicedMateMapLminOverLmate 0.5 ")
            else:
                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                        "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                        "--chimScoreSeparation 5 ")
                if "oncofuse" in dd.get_fusion_caller(data):
                    cmd += "--chimOutType Junctions "
                else:
                    cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running 1st pass of STAR aligner on %s and %s" % (
            fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

        sjfile = get_splicejunction_file(tx_out_dir, data)
        sjflag = f"--sjdbFileChrStartEnd {sjfile}" if sjfile else ""
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "{sjflag} "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            if "arriba" in dd.get_fusion_caller(data):
                cmd += (
                    "--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions "
                    "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
                    "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
                    "--alignSJstitchMismatchNmax 5 -1 5 5 "
                    "--chimSegmentReadGapMax 3 ")
            else:
                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                        "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                        "--chimScoreSeparation 5 ")
                if "oncofuse" in dd.get_fusion_caller(data):
                    cmd += "--chimOutType Junctions "
                else:
                    cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running 2nd pass of STAR aligner on %s and %s" % (
            fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    out_log_file = os.path.join(align_dir, dd.get_lane(data) + "Log.final.out")
    data = dd.update_summary_qc(data, "star", base=out_log_file)
    return data
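Note: _get_star_dirnames is not shown on this page. Below is a hypothetical sketch inferred from how its result is unpacked in Example #8 (out_dir, out_file, out_prefix, final_out) and from STAR's default output naming; it is not necessarily bcbio's exact implementation.

import os
from collections import namedtuple

StarOutDirs = namedtuple("StarOutDirs",
                         ["out_dir", "out_file", "out_prefix", "final_out"])

def _get_star_dirnames(align_dir, data, names):
    # Assumed layout: STAR writes files under a per-lane prefix, and the
    # final coordinate-sorted BAM lives in a per-lane "_star" directory.
    lane = dd.get_lane(data)
    out_prefix = os.path.join(align_dir, lane)        # STAR --outFileNamePrefix
    out_file = out_prefix + "Aligned.out.sam"         # STAR's default SAM name
    out_dir = os.path.join(align_dir, "%s_star" % lane)
    final_out = os.path.join(out_dir, "%s.bam" % names["sample"])
    return StarOutDirs(out_dir, out_file, out_prefix, final_out)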