예제 #1
0
def assemble_transcripts(align_file, ref_file, config):
    """Assemble transcripts from an alignment file with Cufflinks.

    The Cufflinks command line gets ``-p`` when more than one core is
    configured, and ``-g``/``-M`` when the ``transcripts`` /
    ``transcripts_mask`` reference files are configured. Cufflinks is only
    invoked when ``transcripts.gtf`` is not already present in the output
    directory, which sits next to ``align_file``.

    Returns the path to the ``transcripts.gtf`` output file.
    """
    align_dir, align_name = os.path.split(align_file)
    num_cores = config["algorithm"].get("num_cores", 1)
    core_flags = []
    if num_cores > 1:
        core_flags = ["-p", str(num_cores)]
    base_name = os.path.splitext(align_name)[0]
    out_dir = os.path.join(align_dir, "{base}-cufflinks".format(base=base_name))
    cmd = [config_utils.get_program("cufflinks", config), align_file]
    cmd.extend(["-o", out_dir])
    cmd.extend(["-b", ref_file])
    cmd.append("-u")
    cmd.extend(core_flags)
    # Optional annotation inputs: each flag is added only when the matching
    # reference file is configured.
    for flag, ref_key in (("-g", "transcripts"), ("-M", "transcripts_mask")):
        annotation_file = configured_ref_file(ref_key, config, ref_file)
        if annotation_file:
            cmd.extend([flag, annotation_file])
    gtf_path = os.path.join(out_dir, "transcripts.gtf")
    already_assembled = os.path.exists(gtf_path)
    if not already_assembled:
        subprocess.check_call(cmd)
    assert os.path.exists(gtf_path)
    return gtf_path
예제 #2
0
def variantcall_sample(data, region=None, out_file=None):
    """Call variants for one region of a sample (parallel entry point).

    The caller is selected by the ``variantcaller`` algorithm setting
    (default ``gatk``) and writes to a ``-raw`` sibling of ``out_file``.
    When ``phasing`` is set to ``gatk`` the raw calls are passed through
    read-backed phasing. If ``out_file`` does not exist afterwards, it (and
    any ``.idx`` companion) is symlinked to the final call file.

    Returns a single-item list with ``data``, whose ``vrn_file`` is set to
    ``out_file``.
    """
    from bcbio.variation import freebayes, cortex, samtools, varscan
    safe_makedir(os.path.dirname(out_file))
    callers_by_name = {
        "gatk": unified_genotyper, "gatk-haplotype": haplotype_caller,
        "freebayes": freebayes.run_freebayes, "cortex": cortex.run_cortex,
        "samtools": samtools.run_samtools, "varscan": varscan.run_varscan,
    }
    sam_ref = data["sam_ref"]
    config = data["config"]
    caller_name = config["algorithm"].get("variantcaller", "gatk")
    run_caller = callers_by_name[caller_name]
    # A single BAM path is wrapped so callers always receive a list.
    work_bam = data["work_bam"]
    align_bams = [work_bam] if isinstance(work_bam, basestring) else work_bam
    out_base, out_ext = os.path.splitext(out_file)
    call_file = "%s-raw%s" % (out_base, out_ext)
    dbsnp_file = configured_ref_file("dbsnp", config, sam_ref)
    run_caller(align_bams, sam_ref, config, dbsnp_file, region, call_file)
    phasing_choice = data["config"]["algorithm"].get("phasing", False)
    if phasing_choice == "gatk":
        call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref,
                                                region, config)
    if not os.path.exists(out_file):
        for suffix in ["", ".idx"]:
            source = call_file + suffix
            if os.path.exists(source):
                os.symlink(source, out_file + suffix)
    data["vrn_file"] = out_file
    return [data]
예제 #3
0
def prep_recal(data):
    """Prepare GATK base recalibration for a sample's sorted, aligned BAM.

    Skipped entirely when the ``recalibrate`` algorithm setting is falsy.
    Otherwise the reference is indexed, duplicates are marked and removed
    (unless ``mark_duplicates`` is disabled), the BAM is indexed and the
    base recalibrator is run. ``data["work_bam"]`` and
    ``data["prep_recal"]`` are updated in place.

    Returns ``data`` wrapped in a nested list.
    """
    if not data["config"]["algorithm"].get("recalibrate", True):
        return [[data]]
    logger.info("Recalibrating %s with GATK" % str(data["name"]))
    ref_file = data["sam_ref"]
    config = data["config"]
    dbsnp_file = configured_ref_file("dbsnp", config, ref_file)
    runner = broad.runner_from_config(config)
    algorithm = config["algorithm"]
    platform = algorithm["platform"]
    runner.run_fn("picard_index_ref", ref_file)
    bam_for_recal = data["work_bam"]
    if algorithm.get("mark_duplicates", True):
        bam_for_recal, _ = runner.run_fn("picard_mark_duplicates",
                                         data["work_bam"],
                                         remove_dups=True)
    runner.run_fn("picard_index", bam_for_recal)
    intervals = algorithm.get("variant_regions", None)
    data["work_bam"] = bam_for_recal
    data["prep_recal"] = _gatk_base_recalibrator(
        runner, bam_for_recal, ref_file, platform, dbsnp_file, intervals)
    return [[data]]
예제 #4
0
def variantcall_sample(data, region=None, out_file=None):
    """Genotype a single region of a sample; used as a parallel entry point.

    Runs the configured ``variantcaller`` (``gatk`` when unset) into a
    ``-raw`` file derived from ``out_file``, optionally applies GATK
    read-backed phasing, and links the result to ``out_file`` when that
    path does not exist yet.

    Returns ``[data]`` with ``data["vrn_file"]`` pointing at ``out_file``.
    """
    from bcbio.variation import freebayes, cortex, samtools, varscan
    safe_makedir(os.path.dirname(out_file))
    available = {}
    available["gatk"] = unified_genotyper
    available["gatk-haplotype"] = haplotype_caller
    available["freebayes"] = freebayes.run_freebayes
    available["cortex"] = cortex.run_cortex
    available["samtools"] = samtools.run_samtools
    available["varscan"] = varscan.run_varscan
    sam_ref = data["sam_ref"]
    config = data["config"]
    chosen_fn = available[config["algorithm"].get("variantcaller", "gatk")]
    # Normalize the input so a lone BAM path becomes a one-element list.
    align_bams = data["work_bam"]
    if isinstance(data["work_bam"], basestring):
        align_bams = [data["work_bam"]]
    raw_calls = "%s-raw%s" % os.path.splitext(out_file)
    known_sites = configured_ref_file("dbsnp", config, sam_ref)
    chosen_fn(align_bams, sam_ref, config, known_sites, region, raw_calls)
    final_calls = raw_calls
    if data["config"]["algorithm"].get("phasing", False) == "gatk":
        final_calls = phasing.read_backed_phasing(raw_calls, align_bams,
                                                  sam_ref, region, config)
    if not os.path.exists(out_file):
        # Link the variant file and, when present, its index.
        for ext in ("", ".idx"):
            if not os.path.exists(final_calls + ext):
                continue
            os.symlink(final_calls + ext, out_file + ext)
    data["vrn_file"] = out_file
    return [data]
예제 #5
0
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir):
    """Set up GATK indel realignment on top of an input command line.

    GATK requires writing to disk and indexing before realignment, so the
    output of ``cl`` is first redirected into a ``-prealign`` BAM (skipped
    when that file already exists), which is then indexed.

    Returns a tuple of the pre-alignment BAM path and the realignment
    command line joined into a single string.
    """
    runner = broad.runner_from_config(data["config"])
    base, ext = os.path.splitext(out_base_file)
    pa_bam = "%s-prealign%s" % (base, ext)
    if not utils.file_exists(pa_bam):
        with file_transaction(pa_bam) as tx_out_file:
            shell_cmd = "{cl} > {tx_out_file}".format(cl=cl,
                                                      tx_out_file=tx_out_file)
            subprocess.check_call(shell_cmd, shell=True)
    runner.run_fn("picard_index", pa_bam)
    ref_file = data["sam_ref"]
    dbsnp_vcf = shared.configured_ref_file("dbsnp", data["config"], ref_file)
    targets = realign.gatk_realigner_targets(
        runner, pa_bam, data["sam_ref"],
        dbsnp=dbsnp_vcf, region=region_to_gatk(region))
    realign_cl = realign.gatk_indel_realignment_cl(
        runner, pa_bam, data["sam_ref"], targets, tmp_dir,
        region=region_to_gatk(region))
    return pa_bam, " ".join(realign_cl)
예제 #6
0
def realign_sample(data, region=None, out_file=None):
    """Realign a sample's BAM file around indels.

    The realigner comes from the ``realign`` algorithm setting: ``True``
    (the default) selects ``gatk``, and a falsy value disables realignment.
    Nothing happens unless ``snpcall`` is enabled and a realigner is
    selected. The special region ``"nochr"`` is handled by
    ``write_nochr_reads`` instead of the realigner.

    Returns ``[data]``; ``data["work_bam"]`` is replaced when realignment
    ran.
    """
    algorithm = data["config"]["algorithm"]
    realigner = algorithm.get("realign", True)
    if realigner is True:
        realigner = "gatk"
    realign_fn = None
    if realigner:
        realign_fn = _realign_approaches[realigner]

    if not (algorithm["snpcall"] and realign_fn):
        return [data]

    bam_name = os.path.basename(data["work_bam"])
    logger.info("Realigning %s with %s: %s %s" %
                (data["name"], realigner, bam_name, region))
    sam_ref = data["sam_ref"]
    config = data["config"]
    if region == "nochr":
        realign_bam = write_nochr_reads(data["work_bam"], out_file)
    else:
        dbsnp_file = configured_ref_file("dbsnp", config, sam_ref)
        realign_bam = realign_fn(data["work_bam"], sam_ref, config,
                                 dbsnp_file, region, out_file)
    if region is None:
        save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                       config)
    data["work_bam"] = realign_bam
    return [data]
예제 #7
0
def recalibrate_quality(sort_bam_file, fastq1, fastq2, sam_ref, dirs, config):
    """Recalibrate alignments with GATK, optionally analyzing the result.

    The recalibration analysis only runs when the ``recalibration_plots``
    algorithm setting is enabled (it is off by default).

    Returns the output of ``gatk_recalibrate``.
    """
    recalibrated = gatk_recalibrate(
        sort_bam_file, sam_ref, config,
        configured_ref_file("dbsnp", config, sam_ref))
    want_plots = config["algorithm"].get("recalibration_plots", False)
    if want_plots:
        _analyze_recalibration(recalibrated, fastq1, fastq2, dirs, config)
    return recalibrated
예제 #8
0
파일: variation.py 프로젝트: mcicdata/bcbb
def recalibrate_quality(sort_bam_file, fastq1, fastq2, sam_ref,
                        dirs, config):
    """Run GATK recalibration on sorted alignments.

    Looks up the configured ``dbsnp`` reference, recalibrates, and, when
    ``recalibration_plots`` is switched on in the algorithm settings, also
    runs the recalibration analysis step.

    Returns whatever ``gatk_recalibrate`` produced.
    """
    known_sites = configured_ref_file("dbsnp", config, sam_ref)
    result = gatk_recalibrate(sort_bam_file, sam_ref, config, known_sites)
    algorithm = config["algorithm"]
    if not algorithm.get("recalibration_plots", False):
        return result
    _analyze_recalibration(result, fastq1, fastq2, dirs, config)
    return result
예제 #9
0
파일: genotype.py 프로젝트: bh0085/compbio
def unified_genotyper_sample(data, region=None, out_file=None):
    """Genotype one region of a sample; used as a parallel entry point.

    Does nothing unless the ``snpcall`` algorithm setting is enabled.

    Returns ``[data]``; when genotyping ran, ``data["vrn_file"]`` holds the
    result of ``unified_genotyper``.
    """
    if not data["config"]["algorithm"]["snpcall"]:
        return [data]
    config = data["config"]
    sam_ref = data["sam_ref"]
    work_bam = data["work_bam"]
    dbsnp_file = configured_ref_file("dbsnp", config, sam_ref)
    data["vrn_file"] = unified_genotyper(work_bam, sam_ref, config,
                                         dbsnp_file, region, out_file)
    return [data]
예제 #10
0
파일: realign.py 프로젝트: bh0085/compbio
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    Realignment, and the log message announcing it, only happen when the
    ``snpcall`` algorithm setting is enabled; otherwise ``data`` is passed
    through untouched.

    Returns a single-item list containing ``data``. When realignment ran,
    ``data["work_bam"]`` is replaced by the output of ``gatk_realigner``.
    """
    config = data["config"]
    if config["algorithm"]["snpcall"]:
        # Log inside the guard so the message is only emitted when a
        # realignment is actually performed.
        log.info("Realigning %s with GATK" % str(data["name"]))
        sam_ref = data["sam_ref"]
        dbsnp_file = configured_ref_file("dbsnp", config, sam_ref)
        data["work_bam"] = gatk_realigner(data["work_bam"], sam_ref, config,
                                          dbsnp_file, region, out_file)
    return [data]
예제 #11
0
def bamutil_dedup_recal_cl(in_file, out_file, data, do_recal):
    """Build the bamutil command line for deduplication and recalibration.

    http://genome.sph.umich.edu/wiki/BamUtil:_dedup

    Currently disabled: the function always raises ``NotImplementedError``
    before any command line is built.
    """
    raise NotImplementedError("Not functional for piped BAM analysis")
    # NOTE: everything below is unreachable while the raise above is in place.
    config = data["config"]
    bam_cmd = config_utils.get_program("bam", config)
    ref_file = data["sam_ref"]
    dbsnp_file = configured_ref_file("dbsnp", config, ref_file)

    template = "{bam_cmd} dedup --in {in_file} --out {out_file} --oneChrom"
    if do_recal:
        template = template + " --recab --refFile {ref_file} --dbsnp {dbsnp_file}"
    return template.format(bam_cmd=bam_cmd, in_file=in_file, out_file=out_file,
                           ref_file=ref_file, dbsnp_file=dbsnp_file)
예제 #12
0
def bamutil_dedup_recal_cl(in_file, out_file, data, do_recal):
    """Prepare a bamutil dedup (and optional recalibration) command line.

    http://genome.sph.umich.edu/wiki/BamUtil:_dedup

    The implementation is switched off: calling this always raises
    ``NotImplementedError``.
    """
    raise NotImplementedError("Not functional for piped BAM analysis")
    # Unreachable until the raise above is removed; kept for reference.
    config = data["config"]
    ref_file = data["sam_ref"]
    values = {"bam_cmd": config_utils.get_program("bam", config),
              "in_file": in_file,
              "out_file": out_file,
              "ref_file": ref_file,
              "dbsnp_file": configured_ref_file("dbsnp", config, ref_file)}
    pieces = ["{bam_cmd} dedup --in {in_file} --out {out_file} --oneChrom"]
    if do_recal:
        pieces.append(" --recab --refFile {ref_file} --dbsnp {dbsnp_file}")
    return "".join(pieces).format(**values)
예제 #13
0
파일: genotype.py 프로젝트: carlcrott/bcbb
def variantcall_sample(data, region=None, out_file=None):
    """Genotype one region of a sample; used as a parallel entry point.

    The caller is picked by the ``variantcaller`` algorithm setting
    (``gatk`` when unset). Nothing is called unless ``snpcall`` is enabled.

    Returns ``[data]``; ``data["vrn_file"]`` is set when a caller ran.
    """
    from bcbio.variation import freebayes
    callers_by_name = {
        "gatk": unified_genotyper,
        "freebayes": freebayes.run_freebayes,
    }
    if not data["config"]["algorithm"]["snpcall"]:
        return [data]
    sam_ref = data["sam_ref"]
    config = data["config"]
    caller_name = config["algorithm"].get("variantcaller", "gatk")
    run_caller = callers_by_name[caller_name]
    work_bam = data["work_bam"]
    dbsnp_file = configured_ref_file("dbsnp", config, sam_ref)
    data["vrn_file"] = run_caller(work_bam, sam_ref, config, dbsnp_file,
                                  region, out_file)
    return [data]
예제 #14
0
파일: genotype.py 프로젝트: teslaa22/bcbb
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point that genotypes a region of a sample.

    Dispatches to GATK or FreeBayes based on the ``variantcaller``
    algorithm setting, defaulting to ``gatk``. Calling is skipped when
    ``snpcall`` is disabled.

    Returns a one-element list holding ``data``, with ``vrn_file`` filled
    in when calling ran.
    """
    from bcbio.variation import freebayes
    dispatch = dict(gatk=unified_genotyper, freebayes=freebayes.run_freebayes)
    algorithm = data["config"]["algorithm"]
    if algorithm["snpcall"]:
        config = data["config"]
        sam_ref = data["sam_ref"]
        selected = dispatch[config["algorithm"].get("variantcaller", "gatk")]
        call_args = [data["work_bam"], sam_ref, config]
        call_args.append(configured_ref_file("dbsnp", config, sam_ref))
        call_args.extend([region, out_file])
        data["vrn_file"] = selected(*call_args)
    return [data]
예제 #15
0
def assemble_transcripts(align_file, ref_file, config):
    """Build transcript assemblies for an alignment using Cufflinks.

    Results are written to ``<alignment basename>-cufflinks`` beside the
    alignment file. An existing ``transcripts.gtf`` in that directory is
    reused instead of re-running Cufflinks.

    Returns the path of the ``transcripts.gtf`` file.
    """
    parent_dir, file_name = os.path.split(align_file)
    num_cores = config["algorithm"].get("num_cores", 1)
    multicore = num_cores > 1
    stem, _ = os.path.splitext(file_name)
    out_dir = os.path.join(parent_dir, "{base}-cufflinks".format(base=stem))
    program = config_utils.get_program("cufflinks", config)
    command = [program, align_file, "-o", out_dir, "-b", ref_file, "-u"]
    if multicore:
        command += ["-p", str(num_cores)]
    guide_gtf = configured_ref_file("transcripts", config, ref_file)
    mask_gtf = configured_ref_file("transcripts_mask", config, ref_file)
    # Both annotation inputs are optional and only passed when configured.
    if guide_gtf:
        command += ["-g", guide_gtf]
    if mask_gtf:
        command += ["-M", mask_gtf]
    result_file = os.path.join(out_dir, "transcripts.gtf")
    if not os.path.exists(result_file):
        subprocess.check_call(command)
    assert os.path.exists(result_file)
    return result_file
예제 #16
0
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir):
    """Prepare GATK realignment for the output of an input command line.

    GATK requires writing to disk and indexing before realignment: the
    output of ``cl`` is redirected to a ``-prealign`` BAM when that file
    does not exist yet, and the BAM is indexed with Picard.

    Returns ``(pre-alignment BAM path, realignment command string)``.
    """
    runner = broad.runner_from_config(data["config"])
    stem, ext = os.path.splitext(out_base_file)
    pa_bam = "%s-prealign%s" % (stem, ext)
    if not utils.file_exists(pa_bam):
        with file_transaction(pa_bam) as tx_out_file:
            redirect_cmd = "{cl} > {tx_out_file}".format(
                cl=cl, tx_out_file=tx_out_file)
            subprocess.check_call(redirect_cmd, shell=True)
    runner.run_fn("picard_index", pa_bam)
    target_kwargs = {
        "dbsnp": shared.configured_ref_file("dbsnp", data["config"],
                                            data["sam_ref"]),
        "region": _region_to_gatk(region),
    }
    targets = realign.gatk_realigner_targets(runner, pa_bam, data["sam_ref"],
                                             **target_kwargs)
    realign_parts = realign.gatk_indel_realignment_cl(
        runner, pa_bam, data["sam_ref"], targets, tmp_dir,
        region=_region_to_gatk(region))
    return pa_bam, " ".join(realign_parts)
예제 #17
0
파일: realign.py 프로젝트: carlcrott/bcbb
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    Realignment only runs when the ``snpcall`` algorithm setting is
    enabled; otherwise ``data`` is returned unchanged and nothing is
    logged. The special region ``"nochr"`` is handled by
    ``write_nochr_reads`` rather than the GATK realigner. When no region
    is given, ``save_diskspace`` is called on the original BAM.

    Returns a single-item list containing ``data``, whose ``work_bam`` is
    replaced by the realigned BAM when realignment ran.
    """
    if data["config"]["algorithm"]["snpcall"]:
        # Log inside the guard: the message, and the basename lookup on
        # work_bam, are only wanted when a realignment is performed.
        logger.info("Realigning %s with GATK: %s %s" %
                    (data["name"], os.path.basename(data["work_bam"]), region))
        sam_ref = data["sam_ref"]
        config = data["config"]
        if region == "nochr":
            realign_bam = write_nochr_reads(data["work_bam"], out_file)
        else:
            dbsnp_file = configured_ref_file("dbsnp", config, sam_ref)
            realign_bam = gatk_realigner(data["work_bam"], sam_ref, config,
                                         dbsnp_file, region, out_file)
        if region is None:
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                           config)
        data["work_bam"] = realign_bam
    return [data]
예제 #18
0
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Set up GATK realignment from an input command line.

    GATK requires writing to disk and indexing before realignment. The
    ``-prealign`` BAM is produced by running ``cl`` with its output sent
    via shell redirection when ``prep_params["dup"]`` is truthy, or via
    an ``-o`` argument otherwise; this step is skipped if the file exists.

    Returns a tuple of the pre-alignment BAM path and the realignment
    command line as one string.
    """
    runner = broad.runner_from_config(data["config"])
    base, ext = os.path.splitext(out_base_file)
    pa_bam = "%s-prealign%s" % (base, ext)
    if not utils.file_exists(pa_bam):
        with file_transaction(pa_bam) as tx_out_file:
            if prep_params["dup"]:
                pipe = ">"
            else:
                pipe = "-o"
            cmd = "{cl} {pipe} {tx_out_file}".format(
                cl=cl, pipe=pipe, tx_out_file=tx_out_file)
            description = "GATK pre-alignment {0}".format(region)
            do.run(cmd, description, data)
    runner.run_fn("picard_index", pa_bam)
    dbsnp_vcf = shared.configured_ref_file("dbsnp", data["config"],
                                           data["sam_ref"])
    targets = realign.gatk_realigner_targets(
        runner, pa_bam, data["sam_ref"],
        dbsnp=dbsnp_vcf, region=region_to_gatk(region))
    realign_cl = realign.gatk_indel_realignment_cl(
        runner, pa_bam, data["sam_ref"], targets, tmp_dir,
        region=region_to_gatk(region))
    return pa_bam, " ".join(realign_cl)
예제 #19
0
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Prepare a GATK indel realignment step fed by an input command line.

    GATK requires writing to disk and indexing before realignment, so
    ``cl`` is first run into a ``-prealign`` BAM (unless it already
    exists). The output is attached with ``>`` when ``prep_params["dup"]``
    is truthy and with ``-o`` otherwise.

    Returns ``(pa_bam, realignment command string)``.
    """
    broad_runner = broad.runner_from_config(data["config"])
    pa_bam = "%s-prealign%s" % tuple(os.path.splitext(out_base_file))
    needs_prealign = not utils.file_exists(pa_bam)
    if needs_prealign:
        with file_transaction(pa_bam) as tx_out_file:
            fields = {"cl": cl,
                      "pipe": ">" if prep_params["dup"] else "-o",
                      "tx_out_file": tx_out_file}
            do.run("{cl} {pipe} {tx_out_file}".format(**fields),
                   "GATK pre-alignment {0}".format(region), data)
    broad_runner.run_fn("picard_index", pa_bam)
    known_sites = shared.configured_ref_file("dbsnp", data["config"],
                                             data["sam_ref"])
    target_region = region_to_gatk(region)
    intervals_file = realign.gatk_realigner_targets(broad_runner, pa_bam,
                                                    data["sam_ref"],
                                                    dbsnp=known_sites,
                                                    region=target_region)
    realign_region = region_to_gatk(region)
    cl_parts = realign.gatk_indel_realignment_cl(broad_runner, pa_bam,
                                                 data["sam_ref"],
                                                 intervals_file, tmp_dir,
                                                 region=realign_region)
    return pa_bam, " ".join(cl_parts)
예제 #20
0
파일: realign.py 프로젝트: carlcrott/bcbb
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    Does nothing, and logs nothing, unless the ``snpcall`` algorithm
    setting is enabled. With region ``"nochr"`` the reads are written by
    ``write_nochr_reads``; any other region goes through ``gatk_realigner``.
    When ``region`` is ``None``, ``save_diskspace`` is called on the input
    BAM.

    Returns ``[data]``; ``data["work_bam"]`` points at the realigned BAM
    when realignment ran.
    """
    config = data["config"]
    if not config["algorithm"]["snpcall"]:
        return [data]
    # Announce the realignment only once it is certain to run, so disabled
    # samples do not produce a misleading log entry.
    logger.info("Realigning %s with GATK: %s %s" %
                (data["name"], os.path.basename(data["work_bam"]), region))
    sam_ref = data["sam_ref"]
    if region == "nochr":
        realign_bam = write_nochr_reads(data["work_bam"], out_file)
    else:
        realign_bam = gatk_realigner(
            data["work_bam"], sam_ref, config,
            configured_ref_file("dbsnp", config, sam_ref), region,
            out_file)
    if region is None:
        save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                       config)
    data["work_bam"] = realign_bam
    return [data]
예제 #21
0
def prep_recal(data):
    """Prepare GATK base recalibration for a sorted, aligned BAM.

    Runs only when the ``recalibrate`` algorithm setting is ``True`` (the
    default) or ``"gatk"``. Duplicates are marked first unless
    ``mark_duplicates`` is disabled. Updates ``data["work_bam"]`` and
    stores the recalibrator output in ``data["prep_recal"]``.

    Returns ``data`` wrapped in a nested list.
    """
    recal_setting = data["config"]["algorithm"].get("recalibrate", True)
    if recal_setting not in (True, "gatk"):
        return [[data]]
    logger.info("Recalibrating %s with GATK" % str(data["name"]))
    ref_file = data["sam_ref"]
    config = data["config"]
    dbsnp_file = configured_ref_file("dbsnp", config, ref_file)
    runner = broad.runner_from_config(config)
    platform = config["algorithm"]["platform"]
    runner.run_fn("picard_index_ref", ref_file)
    if not config["algorithm"].get("mark_duplicates", True):
        recal_input = data["work_bam"]
    else:
        recal_input, _ = runner.run_fn("picard_mark_duplicates",
                                       data["work_bam"])
    runner.run_fn("picard_index", recal_input)
    intervals = config["algorithm"].get("variant_regions", None)
    data["work_bam"] = recal_input
    data["prep_recal"] = _gatk_base_recalibrator(
        runner, recal_input, ref_file, platform, dbsnp_file, intervals)
    return [[data]]
예제 #22
0
def variantcall_sample(data, region=None, out_file=None):
    """Genotype one region of a sample; used as a parallel entry point.

    Skipped unless ``snpcall`` is enabled. The caller is chosen by the
    ``variantcaller`` algorithm setting (default ``gatk``) and always
    receives the input BAMs as a list.

    Returns ``[data]``, with ``data["vrn_file"]`` set to the caller's
    return value when calling ran.
    """
    from bcbio.variation import freebayes, cortex, samtools, varscan
    callers_by_name = {
        "gatk": unified_genotyper,
        "gatk-haplotype": haplotype_caller,
        "freebayes": freebayes.run_freebayes,
        "cortex": cortex.run_cortex,
        "samtools": samtools.run_samtools,
        "varscan": varscan.run_varscan,
    }
    if not data["config"]["algorithm"]["snpcall"]:
        return [data]
    sam_ref = data["sam_ref"]
    config = data["config"]
    run_caller = callers_by_name[config["algorithm"].get("variantcaller",
                                                         "gatk")]
    # Wrap a single BAM path so the caller always gets a list of inputs.
    work_bam = data["work_bam"]
    align_bams = [work_bam] if isinstance(work_bam, basestring) else work_bam
    dbsnp_file = configured_ref_file("dbsnp", config, sam_ref)
    data["vrn_file"] = run_caller(align_bams, sam_ref, config, dbsnp_file,
                                  region, out_file)
    return [data]
예제 #23
0
def realign_sample(data, region=None, out_file=None):
    """Realign a sample BAM at indels with the configured realigner.

    ``realign`` in the algorithm settings names the approach; ``True``
    (the default) means ``gatk`` and a falsy value turns realignment off.
    Work is only done when ``snpcall`` is also enabled. Region ``"nochr"``
    is written with ``write_nochr_reads`` instead of being realigned.

    Returns ``[data]`` with ``work_bam`` updated when realignment ran.
    """
    approach = data["config"]["algorithm"].get("realign", True)
    if approach is True:
        approach = "gatk"
    run_realign = _realign_approaches[approach] if approach else None

    enabled = data["config"]["algorithm"]["snpcall"] and run_realign
    if enabled:
        log_fields = (data["name"], approach,
                      os.path.basename(data["work_bam"]), region)
        logger.info("Realigning %s with %s: %s %s" % log_fields)
        sam_ref = data["sam_ref"]
        config = data["config"]
        if region != "nochr":
            known_sites = configured_ref_file("dbsnp", config, sam_ref)
            realigned = run_realign(data["work_bam"], sam_ref, config,
                                    known_sites, region, out_file)
        else:
            realigned = write_nochr_reads(data["work_bam"], out_file)
        if region is None:
            message = "Realigned to %s" % realigned
            save_diskspace(data["work_bam"], message, config)
        data["work_bam"] = realigned
    return [data]
예제 #24
0
파일: count.py 프로젝트: joshuashen/steady
def _get_gtf_file(data):
    """Return the configured ``transcripts`` reference file for a sample.

    The lookup uses the sample's configuration and its ``sam_ref``
    reference path.
    """
    return configured_ref_file("transcripts", data["config"], data["sam_ref"])
예제 #25
0
def _get_gtf_file(data):
    """Look up the ``transcripts`` reference file configured for ``data``.

    Returns whatever ``configured_ref_file`` yields for the sample's
    ``sam_ref`` and configuration.
    """
    sam_ref = data["sam_ref"]
    sample_config = data["config"]
    gtf_file = configured_ref_file("transcripts", sample_config, sam_ref)
    return gtf_file