Ejemplo n.º 1
0
def run_cortex(align_bam,
               ref_file,
               config,
               dbsnp=None,
               region=None,
               out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    The output path defaults to the BAM path with its extension replaced by
    ``-cortex.vcf``. When ``file_exists(out_file)`` is already true, that path
    is returned without further work. Otherwise the ``picard_index`` step is
    run on the BAM, the configured ``variant_regions`` are narrowed with
    ``subset_variant_regions``, and every line of the resulting region file is
    passed (first three tab-separated fields) to ``_run_cortex_on_region``.
    The per-region results are merged into ``out_file``. If the narrowed
    region path is not a file, an empty VCF is written instead.

    ``dbsnp`` is accepted but not used by this function.

    Raises ValueError when ``config["algorithm"]`` has no (or an empty)
    ``variant_regions`` entry.
    Returns the output VCF path.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    # Nothing to do when the final output is already in place.
    if file_exists(out_file):
        return out_file

    broad_runner.run_fn("picard_index", align_bam)
    variant_regions = config["algorithm"].get("variant_regions", None)
    if not variant_regions:
        raise ValueError(
            "Only regional variant calling with cortex_var is supported. Set variant_regions"
        )

    target_regions = subset_variant_regions(variant_regions, region, out_file)
    if not os.path.isfile(target_regions):
        write_empty_vcf(out_file)
        return out_file

    regional_vcfs = []
    with open(target_regions) as in_handle:
        for line in in_handle:
            # Only the first three tab-separated columns are forwarded.
            coords = line.strip().split("\t")[:3]
            regional_vcfs.append(
                _run_cortex_on_region(coords, align_bam, ref_file, out_file,
                                      config))
    combine_variant_files(regional_vcfs, out_file, ref_file, config)
    return out_file
Ejemplo n.º 2
0
def _combine_variants(in_vcfs, out_file, ref_file, config):
    """Combine variant files, batching to avoid problematic large commandlines.

    When there are more than ``max_batch`` inputs, they are split with
    ``partition_all(max_batch, in_vcfs)``. Each input in a batch must start
    with a ``##fileformat=VCFv4`` line, and each batch is combined into an
    intermediate ``<base>-batch<i><ext>`` file inside a ``batch`` directory
    next to ``out_file``. The intermediates (or the untouched inputs when no
    batching was needed) are then combined into ``out_file``.

    Raises ValueError if a batched input does not start with the expected
    VCF header line, or if the number of files left to combine after one
    round of batching still exceeds ``max_batch``.
    """
    max_batch = 500
    if len(in_vcfs) > max_batch:
        # The batch directory and output name pieces are the same for every
        # batch, so compute them (and create the directory) once.
        out_dir, out_name = os.path.split(out_file)
        batch_dir = safe_makedir(os.path.join(out_dir, "batch"))
        base, ext = os.path.splitext(out_name)
        batch_outs = []
        for i, batch_vcfs in enumerate(partition_all(max_batch, in_vcfs)):
            cur_out = os.path.join(batch_dir, "{0}-batch{1}{2}".format(base, i, ext))
            # Check the first line of each input before combining the batch.
            for vcf in batch_vcfs:
                with open(vcf) as in_handle:
                    if not in_handle.readline().startswith("##fileformat=VCFv4"):
                        raise ValueError("Unexpected VCF file: %s" % vcf)
            combine_variant_files(batch_vcfs, cur_out, ref_file, config)
            batch_outs.append(cur_out)
        in_vcfs = batch_outs
    # Explicit check rather than ``assert``: assertions are removed under
    # ``python -O``, which would let an oversized command line through.
    if len(in_vcfs) > max_batch:
        raise ValueError("Too many variant files to combine after batching: %s (limit %s)"
                         % (len(in_vcfs), max_batch))
    combine_variant_files(in_vcfs, out_file, ref_file, config)
Ejemplo n.º 3
0
def _combine_variants(in_vcfs, out_file, ref_file, config):
    """Combine variant files, batching to avoid problematic large commandlines.

    Inputs beyond ``max_batch`` are grouped via
    ``partition_all(max_batch, in_vcfs)``. Every file in a group is checked
    for a leading ``##fileformat=VCFv4`` line, then the group is combined
    into ``batch/<base>-batch<i><ext>`` alongside ``out_file``. The final
    combine step runs on those intermediate files, or directly on the
    inputs when batching is not required.

    Raises ValueError when a batched input lacks the expected VCF header
    line, or when more than ``max_batch`` files remain after one round of
    batching.
    """
    max_batch = 500
    if len(in_vcfs) > max_batch:
        # None of these values depend on the batch index; hoisting them also
        # avoids calling safe_makedir once per batch.
        path, fname = os.path.split(out_file)
        batch_path = safe_makedir(os.path.join(path, "batch"))
        base, ext = os.path.splitext(fname)
        new_vcfs = []
        for i, batch_vcfs in enumerate(partition_all(max_batch, in_vcfs)):
            cur_out = os.path.join(batch_path,
                                   "{0}-batch{1}{2}".format(base, i, ext))
            # Validate the header line of each input before combining.
            for x in batch_vcfs:
                with open(x) as in_handle:
                    if not in_handle.readline().startswith(
                            "##fileformat=VCFv4"):
                        raise ValueError("Unexpected VCF file: %s" % x)
            combine_variant_files(batch_vcfs, cur_out, ref_file, config)
            new_vcfs.append(cur_out)
        in_vcfs = new_vcfs
    # Raise explicitly instead of asserting: ``assert`` is stripped under
    # ``python -O`` and would then silently allow an oversized command line.
    if len(in_vcfs) > max_batch:
        raise ValueError(
            "Too many variant files to combine after batching: %s (limit %s)"
            % (len(in_vcfs), max_batch))
    combine_variant_files(in_vcfs, out_file, ref_file, config)
Ejemplo n.º 4
0
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None,
               out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    Steps performed when ``file_exists(out_file)`` is false:

    1. run the ``picard_index`` step on ``align_bam``;
    2. read ``variant_regions`` from ``config["algorithm"]`` (required);
    3. narrow it with ``subset_variant_regions`` for ``region``;
    4. call ``_run_cortex_on_region`` once per line of the narrowed file,
       passing the first three tab-separated fields, and combine the results
       into ``out_file``; or write an empty VCF if the narrowed path is not
       a file.

    ``out_file`` defaults to ``<align_bam without extension>-cortex.vcf``.
    ``dbsnp`` is part of the signature but is not referenced here.

    Raises ValueError if ``variant_regions`` is unset or empty.
    Returns ``out_file``.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        bam_base = os.path.splitext(align_bam)[0]
        out_file = "%s-cortex.vcf" % bam_base
    if file_exists(out_file):
        # Existing output is reused as-is.
        return out_file

    broad_runner.run_fn("picard_index", align_bam)
    variant_regions = config["algorithm"].get("variant_regions", None)
    if not variant_regions:
        raise ValueError("Only regional variant calling with cortex_var is supported. Set variant_regions")
    target_regions = subset_variant_regions(variant_regions, region, out_file)

    if os.path.isfile(target_regions):
        regional_vcfs = []
        with open(target_regions) as in_handle:
            for region_line in in_handle:
                fields = region_line.strip().split("\t")
                regional_vcfs.append(_run_cortex_on_region(fields[:3], align_bam,
                                                           ref_file, out_file, config))
        combine_variant_files(regional_vcfs, out_file, ref_file, config)
    else:
        write_empty_vcf(out_file)
    return out_file
Ejemplo n.º 5
0
def prep_gemini_db(fnames, call_id, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The database path is ``<work dir>/gemini/<call_id joined by "-">.db``.
    If ``utils.file_exists`` reports it as present, nothing is rebuilt.
    Otherwise multiple inputs are first merged with
    ``genotype.combine_variant_files`` into a ``.vcf`` next to the database
    (a single input is used directly), and ``gemini load`` is run against a
    transactional output path.

    fnames -- list of input VCF paths; must contain at least one entry.
    call_id -- sequence of strings identifying the call set.
    data -- dict providing ``dirs.work``, ``sam_ref`` and ``config``.

    Returns ``[[call_id, gemini_db]]``.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    if not utils.file_exists(gemini_db):
        if len(fnames) > 1:
            gemini_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
            gemini_vcf = genotype.combine_variant_files(fnames, gemini_vcf, data["sam_ref"],
                                                        data["config"])
        else:
            gemini_vcf = fnames[0]
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            # Name the substituted values explicitly instead of relying on
            # ``**locals()``, so the command's inputs are visible here.
            cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, gemini_vcf=gemini_vcf,
                             num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            # Run the load exactly once. Previously the same command was also
            # passed to subprocess.check_call right after do.run, repeating
            # the load into the same transactional database.
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    return [[call_id, gemini_db]]
Ejemplo n.º 6
0
def prep_gemini_db(fnames, call_id, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Builds ``<work dir>/gemini/<"-".join(call_id)>.db`` unless
    ``utils.file_exists`` already reports it. With more than one input the
    VCFs are combined through ``genotype.combine_variant_files`` into a
    ``.vcf`` sharing the database's base name; a single input is loaded
    directly. The ``gemini load`` command writes to the path supplied by
    ``file_transaction``.

    fnames -- list of input VCF paths; must contain at least one entry.
    call_id -- sequence of strings identifying the call set.
    data -- dict providing ``dirs.work``, ``sam_ref`` and ``config``.

    Returns ``[[call_id, gemini_db]]``.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    if not utils.file_exists(gemini_db):
        if len(fnames) > 1:
            gemini_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
            gemini_vcf = genotype.combine_variant_files(
                fnames, gemini_vcf, data["sam_ref"], data["config"])
        else:
            gemini_vcf = fnames[0]
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            # Explicit keywords replace ``**locals()`` so that every value
            # interpolated into the shell command is listed here.
            cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(
                gemini=gemini,
                gemini_vcf=gemini_vcf,
                num_cores=num_cores,
                tx_gemini_db=tx_gemini_db)
            # The command is handed to do.run only. The earlier additional
            # subprocess.check_call(cmd, shell=True) repeated the same load
            # against the same transactional database and has been dropped.
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    return [[call_id, gemini_db]]
Ejemplo n.º 7
0
def combine_variant_files(*args):
    """Forward all positional arguments to ``genotype.combine_variant_files``.

    Returns whatever that call returns. Keyword arguments are not accepted.
    """
    return genotype.combine_variant_files(*args)
Ejemplo n.º 8
0
def combine_variant_files(*args):
    """Thin pass-through to ``genotype.combine_variant_files``.

    Positional arguments are forwarded unchanged and the delegate's return
    value is returned as-is. Keyword arguments are not supported.
    """
    return genotype.combine_variant_files(*args)
Ejemplo n.º 9
0
def main(config_file):
    """Run the VCF preparation and gemini loading pipeline from a YAML config.

    Stages, in order:

    1. load the config, set up logging and start the cluster;
    2. locate entries named ``Variations`` under ``config["dir"]["data"]``
       and take their parent directories as inputs;
    3. run the ``illumina_fixer`` stage on the cluster view;
    4. combine the results into ``geminiloader/all_combined.vcf`` under the
       results directory (reused if it already exists);
    5. run the ``breakvcf`` and ``vep`` stages on the cluster view, keeping
       only outputs for which ``file_exists`` is true;
    6. sort each remaining file with ``vcf_sort``, then stop the cluster;
    7. combine into ``geminiloader/all_combined.vep.vcf`` (reused if
       present) and run the ``geminiloader`` stage serially.

    config_file -- path to the YAML configuration file.
    """
    # load yaml config file
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects and is rejected by recent PyYAML releases.
    # Consider yaml.safe_load if the config is plain YAML; left unchanged.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    # Imported only after setup_logging(config) has run, as in the original.
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    # Imported only after start_cluster(config) has run, as in the original.
    from bipy.cluster import view

    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    # Build a real list: a lazy ``map`` object on Python 3 would log as an
    # opaque iterator and could not be sliced for the test subset below.
    in_dirs = [os.path.dirname(x) for x in var_dirs]
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing only load 3
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs

    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files
    def sort_vcf(in_file):
        """Sort ``in_file`` with vcf_sort into a ``sorted``-stemmed file.

        Returns the sorted file path; an existing output is reused.
        """
        # Imports are function-local -- presumably so the function is
        # self-contained when dispatched through view.map; TODO confirm.
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [
            genotype.combine_variant_files(curr_files, out_file,
                                           config["ref"]["fasta"], config)
        ]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))

    # Keep only outputs that exist; a list (not a lazy ``filter``) so the
    # log message below shows the actual paths on Python 3 as well.
    curr_files = [x for x in curr_files if file_exists(x)]

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)
    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [
            genotype.combine_variant_files(curr_files, out_file,
                                           config["ref"]["fasta"], config)
        ]

    # load the files into gemini, serially (the cluster is already down)
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    # Evaluate eagerly: on Python 3 a bare ``map`` is lazy, so the loader
    # would never actually be called.
    curr_files = [geminiloader(x) for x in curr_files]
    logger.info("Run complete.")
Ejemplo n.º 10
0
def main(config_file):
    """Drive the full pipeline: fix, combine, split, annotate, sort and load.

    Reads the YAML file at ``config_file`` and then:

    * sets up logging and starts the cluster;
    * finds entries named ``Variations`` below ``config["dir"]["data"]`` and
      uses their parent directories as the initial inputs;
    * runs the ``illumina_fixer`` stage through the cluster view;
    * combines the outputs into ``geminiloader/all_combined.vcf`` in the
      results directory, reusing that file when it already exists;
    * runs the ``breakvcf`` and ``vep`` stages through the cluster view and
      drops outputs for which ``file_exists`` is false;
    * sorts the remaining files with ``vcf_sort`` and stops the cluster;
    * combines into ``geminiloader/all_combined.vep.vcf`` (reused when
      present) and applies the ``geminiloader`` stage without the cluster.

    config_file -- path to the YAML configuration file.
    """
    # load yaml config file
    # NOTE(review): yaml.load with no Loader argument may instantiate
    # arbitrary Python objects and fails on recent PyYAML versions.
    # yaml.safe_load is the usual replacement for plain-YAML configs; the
    # call is left unchanged here.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    # Import order kept from the original: logger after setup_logging.
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    # Import order kept from the original: view after start_cluster.
    from bipy.cluster import view

    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    # A list instead of ``map``: on Python 3 ``map`` is a one-shot iterator
    # that logs as "<map object ...>" and does not support slicing.
    in_dirs = [os.path.dirname(x) for x in var_dirs]
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing only load 3
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs

    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files
    def sort_vcf(in_file):
        """Write a vcf_sort-ed copy of ``in_file`` and return its path.

        The output name is ``in_file`` with ``sorted`` appended to its stem;
        if that file already exists it is returned without re-sorting.
        """
        # Local imports kept from the original -- presumably needed so the
        # function can run where view.map executes it; TODO confirm.
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader",
                            "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))

    # Materialise the surviving paths so the "Sorting" log line shows them
    # instead of a lazy ``filter`` object on Python 3.
    curr_files = [x for x in curr_files if file_exists(x)]

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)
    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader",
                            "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # load the files into gemini without the cluster
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    # Must be evaluated eagerly: a lazy Python 3 ``map`` that nobody
    # iterates would skip the gemini load entirely.
    curr_files = [geminiloader(x) for x in curr_files]
    logger.info("Run complete.")