Beispiel #1
0
def main(config_file,
         align_sam,
         ref_file,
         fastq_one,
         fastq_pair=None,
         sample_name="",
         rg_name="",
         pu_name=""):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    picard = BroadRunner(config["program"]["picard"],
                         max_memory=config["algorithm"].get("java_memory", ""))
    platform = config["algorithm"]["platform"]
    if platform.lower() == "illumina":
        qual_format = "Illumina"
    else:
        raise ValueError("Need to specify quality format for %s" % platform)
    index_ref_file(picard, ref_file)
    base_dir = os.path.split(align_sam)[0]
    with curdir_tmpdir() as tmp_dir:
        out_fastq_bam = picard_fastq_to_bam(picard, fastq_one, fastq_pair,
                                            base_dir, platform, qual_format,
                                            sample_name, rg_name, pu_name,
                                            tmp_dir)
        out_bam = picard_merge_bam(picard, align_sam, out_fastq_bam, ref_file,
                                   tmp_dir, fastq_pair is not None)
        sort_bam = picard_sort(picard, out_bam, tmp_dir)
    save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam,
                   config)
    save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
Beispiel #2
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    max_mem = resources.get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data) as work_dir:
                if fastq_file.endswith(".bam"):
                    cmd_name = "paired" if bam.is_paired(fastq_file) else "single"
                else:
                    cmd_name = "single" if not pair_file else "paired"
                cmd = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                       "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Beispiel #3
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Beispiel #4
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Beispiel #5
0
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samblaster = config_utils.get_program("samblaster", data["config"])
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(sr_file) as tx_sr_file:
            with file_transaction(disc_file) as tx_disc_file:
                with utils.curdir_tmpdir() as tmpdir:
                    tobam_cmd = ("{sambamba} view -S -f bam -l 0 /dev/stdin | "
                                 "{sambamba} sort -t {cores} -m {mem} --tmpdir {tmpdir} "
                                 "-o {out_file} /dev/stdin")
                    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, **locals())
                    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, **locals())
                    cmd = ("{sambamba} sort -t {cores} -m {mem} --tmpdir={tmpdir} "
                           "-n -o /dev/stdout -l 0 {in_bam} | "
                           "{sambamba} view -h /dev/stdin | "
                           "{samblaster} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                           "-o /dev/null")
                    do.run(cmd.format(**locals()), "samblaster: split and discordant reads", data)
    return sr_file, disc_file
Beispiel #6
0
def _run_recal_bam(dup_align_bam, recal_file, region, ref_file, out_file,
                   config):
    """Run BAM recalibration with the given input
    """
    if not file_exists(out_file):
        if _recal_available(recal_file):
            broad_runner = broad.runner_from_config(config)
            intervals = config["algorithm"].get("variant_regions", None)
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = [
                        "-T",
                        "PrintReads",
                        "-BQSR",
                        recal_file,
                        "-R",
                        ref_file,
                        "-I",
                        dup_align_bam,
                        "--out",
                        tx_out_file,
                    ]
                    if region:
                        params += ["-L", region]
                    if intervals:
                        params += ["-L", intervals]
                    if params and intervals:
                        params += ["--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        elif region:
            subset_bam_by_region(dup_align_bam, region, out_file)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
Beispiel #7
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    qual_format = config["algorithm"].get("quality_format", "").lower()
    qual_flag = "ILMFQ" if qual_format == "illumina" else "STDFQ"
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -F {qual_flag} -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                subprocess.check_call(cmd, shell=True)
    return out_file
def _run_recal_bam(dup_align_bam, recal_file, region, ref_file, out_file, config):
    """Run BAM recalibration with the given input
    """
    if not file_exists(out_file):
        if _recal_available(recal_file):
            broad_runner = broad.runner_from_config(config)
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "PrintReads",
                              "-BQSR", recal_file,
                              "-R", ref_file,
                              "-I", dup_align_bam,
                              "--out", tx_out_file,
                              ]
                    base_bed = config["algorithm"].get("variant_regions", None)
                    region_bed = subset_variant_regions(base_bed, region, tx_out_file)
                    if region_bed:
                        params += ["-L", region_bed, "--interval_set_rule", "INTERSECTION"]
                    elif region:
                        params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        elif region:
            subset_bam_by_region(dup_align_bam, region, out_file)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
Beispiel #9
0
 def run_gatk(self, params, tmp_dir=None):
     #support_nt = set(["UnifiedGenotyper", "VariantEval"])
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
     local_args = []
     cores = self._resources.get("cores", None)
     if cores and cores > 1:
         atype_index = params.index("-T") if params.count("-T") > 0 \
                       else params.index("--analysis_type")
         prog = params[atype_index + 1]
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
     if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
         params.extend(["-U", "LENIENT_VCF_PROCESSING"])
     with curdir_tmpdir() as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         local_args.append("-Djava.io.tmpdir=%s" % tmp_dir)
         cl = ["java"] + self._jvm_opts + local_args + \
                 ["-jar", gatk_jar] + [str(x) for x in params]
         #print " ".join(cl)
         subprocess.check_call(cl)
Beispiel #10
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes for avoid IO writing between steps:
      - novosort of input BAM to coordinates
      - alignment with novoalign
      - conversion to BAM with samtools
      - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       "  -o {tx_out_file} /dev/stdin")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Beispiel #11
0
 def run_gatk(self, params, tmp_dir=None):
     #support_nt = set(["UnifiedGenotyper", "VariantEval"])
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
     local_args = []
     cores = self._resources.get("cores", None)
     if cores and cores > 1:
         atype_index = params.index("-T") if params.count("-T") > 0 \
                       else params.index("--analysis_type")
         prog = params[atype_index + 1]
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
     if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
         params.extend(["-U", "LENIENT_VCF_PROCESSING"])
     with curdir_tmpdir() as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         local_args.append("-Djava.io.tmpdir=%s" % tmp_dir)
         cl = ["java"] + self._jvm_opts + local_args + \
                 ["-jar", gatk_jar] + [str(x) for x in params]
         #print " ".join(cl)
         subprocess.check_call(cl)
Beispiel #12
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = (
                    "{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                    "  -c {num_cores} {extra_novo_args} "
                    "| {samtools} view -b -S -u - "
                    "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                )
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Beispiel #13
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes for avoid IO writing between steps:
      - novosort of input BAM to coordinates
      - alignment with novoalign
      - conversion to BAM with samtools
      - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = (
                    "{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                    " -n -t {work_dir} {in_bam} "
                    "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                    "  -F BAMPE -c {num_cores} {extra_novo_args} "
                    "| {samtools} view -b -S -u - "
                    "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                    "  -o {tx_out_file} /dev/stdin")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Beispiel #14
0
def _gatk_count_covariates(broad_runner, dup_align_bam, ref_file, platform,
        snp_file, intervals):
    """Step 1 of GATK recalibration process -- counting covariates.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "CountCovariates",
                              "-cov", "ReadGroupCovariate",
                              "-cov", "QualityScoreCovariate",
                              "-cov", "CycleCovariate",
                              "-cov", "DinucCovariate",
                              "-recalFile", tx_out_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              "-l", "INFO",
                              "-U",
                              "-OQ",
                              "--default_platform", platform,
                              ]
                    if snp_file:
                        params += ["--knownSites", snp_file]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
Beispiel #15
0
def _gatk_table_recalibrate(broad_runner, dup_align_bam, ref_file, recal_file,
                            platform, intervals):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    out_file = "%s-gatkrecal.bam" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if _recal_available(recal_file):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "TableRecalibration",
                              "-recalFile", recal_file,
                              "-R", ref_file,
                              "-I", dup_align_bam,
                              "--out", tx_out_file,
                              "-baq",  "RECALCULATE",
                              "-l", "INFO",
                              "-U",
                              "-OQ",
                              "--default_platform", platform,
                              ]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
Beispiel #16
0
def _run_toplevel(config,
                  config_file,
                  work_dir,
                  parallel,
                  fc_dir=None,
                  run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir({"config": config}) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel,
                                             config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs,
                                   pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Beispiel #17
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes for avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Beispiel #18
0
 def run_gatk(self,
              params,
              tmp_dir=None,
              log_error=True,
              memory_retry=False,
              data=None,
              region=None):
     with curdir_tmpdir() as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         cl = self.cl_gatk(params, tmp_dir)
         atype_index = cl.index("-T") if cl.count("-T") > 0 \
                       else cl.index("--analysis_type")
         prog = cl[atype_index + 1]
         if memory_retry:
             do.run_memory_retry(cl,
                                 "GATK: {0}".format(prog),
                                 data,
                                 region=region)
         else:
             do.run(cl,
                    "GATK: {0}".format(prog),
                    data,
                    region=region,
                    log_error=log_error)
Beispiel #19
0
 def run_mutect(self, params, tmp_dir=None):
     with curdir_tmpdir() as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         cl = self.cl_mutect(params, tmp_dir)
         prog = "MuTect"
         do.run(cl, "MuTect: {0}".format(prog), None)
Beispiel #20
0
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7) if data.get(
            "analysis", "").lower() not in ["standard"] else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [
                    config_utils.get_program("fastqc", data["config"]), "-t",
                    str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file
                ]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(
                    tx_tmp_dir, "%s_fastqc" %
                    os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
Beispiel #21
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes for avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file,
                                    bam.is_paired(in_bam)) as (tobam_cl,
                                                               tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = (
                    "{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                    "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                    "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None, [
                    do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, in_bam)
                ])
    return out_file
Beispiel #22
0
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, run_parallel, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Beispiel #23
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        _check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                logger.info(cmd.format(**locals()))
                subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
Beispiel #24
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                       "{fastq_file} {pair_file} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Beispiel #25
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes for avoid IO writing between steps:
      - novosort of input BAM to coordinates
      - alignment with novoalign
      - conversion to BAM with samtools
      - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                resources = config_utils.get_resources("novoalign", config)
                num_cores = resources["cores"]
                max_mem = resources.get("memory", "4G")
                read_sort = sh.novosort.bake(in_bam, c=num_cores, m=max_mem,
                                             compression=0, n=True, t=work_dir,
                                             _piped=True)
                rg_info = r"SAM '@RG\tID:{rg}\tPL:{pl}\tPU:{pu}\tSM:{sample}'".format(**names)
                align = sh.novoalign.bake(o=rg_info, d=ref_file, f="/dev/stdin", F="BAMPE",
                                          c=num_cores, _piped=True)
                to_bam = sh.samtools.view.bake(b=True, S=True, u=True,
                                               _piped=True).bake("-")
                coord_sort = sh.novosort.bake("/dev/stdin", c=num_cores, m=max_mem,
                                              o=tx_out_file, t=work_dir)
                subprocess.check_call("%s | %s | %s | %s" % (read_sort, align, to_bam, coord_sort),
                                      shell=True)
    return out_file
Beispiel #26
0
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True

    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []

    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(
                out_dir,
                "{base}_{pair}_{num}.fastq".format(
                    base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num
                ),
            )
            out += [fname, open(fname, "w")]
        return out

    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file, compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)

        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
Beispiel #27
0
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False):
    """Perform realignment of BAM file in specified regions
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" %
                            (os.path.basename(align_bam), region))
                params = ["-T", "IndelRealigner",
                          "-I", align_bam,
                          "-R", ref_file,
                          "-targetIntervals", intervals,
                          "-o", tx_out_file,
                          "-l", "INFO",
                          ]
                if region:
                    params += ["-L", region]
                if deep_coverage:
                    params += ["--maxReadsInMemory", "300000",
                               "--maxReadsForRealignment", str(int(5e5)),
                               "--maxReadsForConsensuses", "500",
                               "--maxConsensuses", "100"]
                try:
                    runner.run_gatk(params, tmp_dir)
                except:
                    logger.exception("Running GATK IndelRealigner failed: {} {}".format(
                        os.path.basename(align_bam), region))
                    raise
    return out_file
Beispiel #28
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
        dbsnp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    plot_file = "%s-plots.pdf" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "BaseRecalibrator",
                              "-o", tx_out_file,
                              "--plot_pdf_file", plot_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              ]
                    downsample_pct = _get_downsample_pct(broad_runner, dup_align_bam)
                    if downsample_pct:
                        params += ["--downsample_to_fraction", str(downsample_pct),
                                   "--downsampling_type", "ALL_READS"]
                    # GATK-lite does not have support for
                    # insertion/deletion quality modeling
                    if broad_runner.gatk_type() == "lite":
                        params += ["--disable_indel_quals"]
                    if dbsnp_file:
                        params += ["--knownSites", dbsnp_file]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
Beispiel #29
0
def _gatk_recalibrate(broad_runner, dup_align_bam, ref_file, recal_file,
                      platform, intervals):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    out_file = "%s-gatkrecal.bam" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if _recal_available(recal_file):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = [
                        "-T",
                        "PrintReads",
                        "-BQSR",
                        recal_file,
                        "-R",
                        ref_file,
                        "-I",
                        dup_align_bam,
                        "--out",
                        tx_out_file,
                    ]
                    if intervals:
                        params += [
                            "-L", intervals, "--interval_set_rule",
                            "INTERSECTION"
                        ]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
Beispiel #30
0
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease").upper()
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or utils.file_exists(dedup_file):
        with utils.curdir_tmpdir(data) as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Beispiel #31
0
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Beispiel #32
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Beispiel #33
0
def _gatk_count_covariates(picard, dup_align_bam, ref_file, platform,
        snp_file):
    """Step 1 of GATK recalibration process -- counting covariates.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    params = ["-T", "CountCovariates",
              "-cov", "ReadGroupCovariate",
              "-cov", "QualityScoreCovariate",
              "-cov", "CycleCovariate",
              "-cov", "DinucCovariate",
              "-cov", "TileCovariate",
              "-recalFile", out_file,
              "-I", dup_align_bam,
              "-R", ref_file,
              "-l", "INFO",
              "-U",
              "-OQ",
              "--default_platform", platform,
              ]
    if snp_file:
        params += ["-B:dbsnp,VCF", snp_file]
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file):
                picard.run_gatk(params, tmp_dir)
    return out_file
Beispiel #34
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Beispiel #35
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes for avoid IO writing between steps:
      - novosort of input BAM to coordinates
      - alignment with novoalign
      - conversion to BAM with samtools
      - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")

    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = r"@RG\tID:{rg}\tPL:{pl}\tPU:{pu}\tSM:{sample}".format(**names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       "  -o {tx_out_file} /dev/stdin")
                subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
def _run_kraken(data,ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]),ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data['config']["algorithm"]["kraken"] 
    if db == "minikraken":
        db = os.path.join(_get_data_dir(),"genome","kraken","minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report" : "null"}
    if not os.path.exists(os.path.join(kraken_out,"kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]        
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir,"kraken_out")
                out_stats = os.path.join(tx_tmp_dir,"kraken_stats")
                cl = (" ").join([config_utils.get_program("kraken", data["config"]),
                      "--db",db,"--quick",
                      "--preload","--min-hits","2","--threads",str(num_cores), 
                      "--out", out, files[0]," 2>",out_stats])
                do.run(cl,"kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out,db,data)
    return metrics
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
Beispiel #38
0
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Beispiel #39
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get(
        "align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    max_mem = resources.get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data) as work_dir:
                if fastq_file.endswith(".bam"):
                    cmd_name = "paired" if bam.is_paired(
                        fastq_file) else "single"
                else:
                    cmd_name = "single" if not pair_file else "paired"
                cmd = (
                    "{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                    "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}"
                )
                do.run(cmd.format(**locals()),
                       "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Beispiel #40
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Beispiel #41
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        _check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Beispiel #42
0
def gatk_recalibrate(picard, dup_align_bam, ref_file, recal_file, platform):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    out_file = "%s-gatkrecal.bam" % os.path.splitext(dup_align_bam)[0]
    params = [
        "-T",
        "TableRecalibration",
        "-recalFile",
        recal_file,
        "-R",
        ref_file,
        "-I",
        dup_align_bam,
        "--out",
        out_file,
        "-baq",
        "RECALCULATE",
        "-l",
        "INFO",
        "-U",
        "-OQ",
        "--default_platform",
        platform,
    ]
    if not os.path.exists(out_file):
        if _recal_available(recal_file):
            with curdir_tmpdir() as tmp_dir:
                picard.run_gatk(params, tmp_dir)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
Beispiel #43
0
def gatk_indel_realignment(runner, align_bam, ref_file, intervals, deep_coverage=False):
    """Perform realignment of BAM file in specified regions
    """
    out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    params = [
        "-T",
        "IndelRealigner",
        "-I",
        align_bam,
        "-R",
        ref_file,
        "-targetIntervals",
        intervals,
        "-o",
        out_file,
        "-l",
        "INFO",
    ]
    if deep_coverage:
        params += [
            "--maxReadsInMemory",
            "300000",
            "--maxReadsForRealignment",
            str(int(5e5)),
            "--maxReadsForConsensuses",
            "500",
            "--maxConsensuses",
            "100",
        ]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file):
                runner.run_gatk(params, tmp_dir)
    return out_file
Beispiel #44
0
 def run_mutect(self, params, tmp_dir=None):
     with curdir_tmpdir() as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         cl = self.cl_mutect(params, tmp_dir)
         prog = "MuTect"
         do.run(cl, "MuTect: {0}".format(prog), None)
Beispiel #45
0
def generate_align_summary(bam_file, is_paired, sam_ref, sample_name, config, dirs):
    """Run alignment summarizing script to produce a pdf with align details.
    """
    with utils.chdir(dirs["work"]):
        with utils.curdir_tmpdir() as tmp_dir:
            graphs, summary, overrep = _graphs_and_summary(bam_file, sam_ref, is_paired, tmp_dir, config)
        return _generate_pdf(graphs, summary, overrep, bam_file, sample_name, dirs, config)
Beispiel #46
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
        snp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "BaseRecalibrator",
                              "-o", tx_out_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              ]
                    # GATK-lite does not have support for
                    # insertion/deletion quality modeling
                    if not broad_runner.has_gatk_full():
                        params += ["--disable_indel_quals"]
                    if snp_file:
                        params += ["--knownSites", snp_file]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
Beispiel #47
0
def picard_fastq_to_bam(picard, fastq_one, fastq_two, out_dir,
                        platform, sample_name="", rg_name="", pu_name="",
                        qual_format=None):
    """Convert fastq file(s) to BAM, adding sample, run group and platform information.
    """
    qual_formats = {"illumina": "Illumina"}
    if qual_format is None:
        try:
            qual_format = qual_formats[platform.lower()]
        except KeyError:
            raise ValueError("Need to specify quality format for %s" % platform)
    out_bam = os.path.join(out_dir, "%s-fastq.bam" %
                           os.path.splitext(os.path.basename(fastq_one))[0])
    if not file_exists(out_bam):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_bam) as tx_out_bam:
                opts = [("FASTQ", fastq_one),
                        ("QUALITY_FORMAT", qual_format),
                        ("READ_GROUP_NAME", rg_name),
                        ("SAMPLE_NAME", sample_name),
                        ("PLATFORM_UNIT", pu_name),
                        ("PLATFORM", platform),
                        ("TMP_DIR", tmp_dir),
                        ("OUTPUT", tx_out_bam)]
                if fastq_two:
                    opts.append(("FASTQ2", fastq_two))
                picard.run("FastqToSam", opts)
    return out_bam
Beispiel #48
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            snp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = [
                        "-T",
                        "BaseRecalibrator",
                        "-o",
                        tx_out_file,
                        "-I",
                        dup_align_bam,
                        "-R",
                        ref_file,
                    ]
                    # GATK-lite does not have support for
                    # insertion/deletion quality modeling
                    if not broad_runner.has_gatk_full():
                        params += ["--disable_indel_quals"]
                    if snp_file:
                        params += ["--knownSites", snp_file]
                    if intervals:
                        params += [
                            "-L", intervals, "--interval_set_rule",
                            "INTERSECTION"
                        ]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
Beispiel #49
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.

    Large whole genome BAM files take an excessively long time to recalibrate and
    the extra inputs don't help much beyond a certain point. See the 'Downsampling analysis'
    plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = [
                        "-T",
                        "BaseRecalibrator",
                        "-o",
                        tx_out_file,
                        "-I",
                        dup_align_bam,
                        "-R",
                        ref_file,
                    ]
                    downsample_pct = bam.get_downsample_pct(
                        broad_runner, dup_align_bam, target_counts)
                    if downsample_pct:
                        params += [
                            "--downsample_to_fraction",
                            str(downsample_pct), "--downsampling_type",
                            "ALL_READS"
                        ]
                    if platform.lower() == "solid":
                        params += [
                            "--solid_nocall_strategy", "PURGE_READ",
                            "--solid_recal_mode", "SET_Q_ZERO_BASE_N"
                        ]
                    # GATK-lite does not have support for
                    # insertion/deletion quality modeling
                    if broad_runner.gatk_type() == "lite":
                        params += ["--disable_indel_quals"]
                    if dbsnp_file:
                        params += ["--knownSites", dbsnp_file]
                    if intervals:
                        params += [
                            "-L", intervals, "--interval_set_rule",
                            "INTERSECTION"
                        ]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
Beispiel #50
0
def main(config_file):
    task_module = "bcbio.distributed.tasks"
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    with utils.curdir_tmpdir() as work_dir:
        dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
        with create_celeryconfig(task_module, dirs, config):
            run_celeryd(work_dir)
Beispiel #51
0
def picard_sort(picard, align_bam):
    base, ext = os.path.splitext(align_bam)
    out_file = "%s-sort%s" % (base, ext)
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            opts = [("INPUT", align_bam), ("OUTPUT", out_file),
                    ("TMP_DIR", tmp_dir), ("SORT_ORDER", "coordinate")]
            picard.run("SortSam", opts)
    return out_file
Beispiel #52
0
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True

    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []
    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(out_dir, "{base}_{pair}_{num}.fastq".format(
                base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num))
            out += [fname, open(fname, "w")]
        return out
    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file,
                                compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)

        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
Beispiel #53
0
 def run_gatk(self, params, tmp_dir=None):
     with curdir_tmpdir() as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         cl = self.cl_gatk(params, tmp_dir)
         atype_index = cl.index("-T") if cl.count("-T") > 0 \
                       else cl.index("--analysis_type")
         prog = cl[atype_index + 1]
         do.run(cl, "GATK: {0}".format(prog), None)
Beispiel #54
0
def bedtools_tmpdir(data):
    with utils.curdir_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
        pybedtools.set_tempdir(tmpdir)
        yield
        if orig_tmpdir and os.path.exists(orig_tmpdir):
            pybedtools.set_tempdir(orig_tmpdir)
        else:
            tempfile.tempdir = None
Beispiel #55
0
def generate_align_summary(bam_file, sam_ref, sample_name,
                           config, dirs):
    """Run alignment summarizing script to produce a pdf with align details.
    """
    with utils.chdir(dirs["work"]):
        with utils.curdir_tmpdir() as tmp_dir:
            graphs, summary, overrep = \
                    _graphs_and_summary(bam_file, sam_ref, tmp_dir, config)
        return {"pdf": _generate_pdf(graphs, summary, overrep, bam_file, sample_name,
                                     dirs, config)}
Beispiel #56
0
def picard_formatconverter(picard, align_sam):
    """Convert aligned SAM file to BAM format.
    """
    out_bam = "%s.bam" % os.path.splitext(align_sam)[0]
    if not file_exists(out_bam):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_bam) as tx_out_bam:
                opts = [("INPUT", align_sam), ("OUTPUT", tx_out_bam)]
                picard.run("SamFormatConverter", opts)
    return out_bam
Beispiel #57
0
def picard_downsample(picard, in_bam, ds_pct, random_seed=None):
    out_file = "%s-downsample%s" % os.path.splitext(in_bam)
    if not file_exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                opts = [("INPUT", in_bam), ("OUTPUT", tx_out_file),
                        ("PROBABILITY", "%.3f" % ds_pct), ("TMP_DIR", tmp_dir)]
                if random_seed:
                    opts += [("RANDOM_SEED", str(random_seed))]
                picard.run("DownsampleSam", opts)
    return out_file
Beispiel #58
0
def mark_duplicates(picard, align_bam):
    base, ext = os.path.splitext(align_bam)
    base = base.replace(".", "-")
    dup_bam = "%s-dup%s" % (base, ext)
    dup_metrics = "%s-dup.dup_metrics" % base
    if not os.path.exists(dup_bam):
        with curdir_tmpdir() as tmp_dir:
            opts = [("INPUT", align_bam), ("OUTPUT", dup_bam),
                    ("TMP_DIR", tmp_dir), ("METRICS_FILE", dup_metrics)]
        picard.run("MarkDuplicates", opts)
    return dup_bam