Example 1
def _make_dir(dir, label):
    if not os.path.exists(dir):
        if not options.dry_run:
            os.makedirs(dir)
        logger.info("Creating %s directory %s" % (label, dir))
    else:
        logger.warn("%s already exists: not creating new directory" % (dir))
Example 2
def _get_machine_info(parallel, sys_config, dirs, config):
    """Get machine resource information from the job scheduler via either the command line or the queue.
    """
    if parallel.get("queue") and parallel.get("scheduler"):
        # dictionary as switch statement; can add new scheduler implementation functions as (lowercase) keys
        sched_info_dict = {
            "slurm": _slurm_info,
            "torque": _torque_info,
            "sge": _sge_info
        }
        try:
            return sched_info_dict[parallel["scheduler"].lower()](
                parallel["queue"])
        except KeyError:
            logger.info(
                "Resource query function not implemented for scheduler \"{0}\"; "
                "submitting job to queue".format(parallel["scheduler"]))
        except Exception:
            # If something goes wrong, just hit the queue
            logger.warn(
                "Couldn't get machine information from resource query function for queue "
                "'{0}' on scheduler \"{1}\"; "
                "submitting job to queue".format(parallel["queue"],
                                                 parallel["scheduler"]))
    from bcbio.distributed import prun
    with prun.start(parallel, [[sys_config]], config, dirs) as run_parallel:
        return run_parallel("machine_info", [[sys_config]])
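Note: the dictionary-as-switch pattern above assumes a small contract: each scheduler function takes a queue name and returns machine resource information. A minimal sketch of that contract (the function name and return keys here are illustrative, not bcbio's actual schema):

def _example_scheduler_info(queue):
    # Hypothetical: query the scheduler for nodes serving `queue` and
    # report their resources; the keys below are illustrative only.
    return [{"name": "node01", "cores": 16, "memory": 64.0}]

# Registering a new scheduler then only requires a new lowercase key:
# sched_info_dict["example"] = _example_scheduler_info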
Example 3
def run_vcfanno(vcf, anno_type, data):
    """
    annotated a VCF file using vcfanno, looks up the proper config/lua scripts
    under the `vcfanno` key under the algorithm section of the datadict,
    skipping if the files cannot be found
    """
    UNSUPPORTED_TYPE_MESSAGE = (
        "{anno_type} is not a supported vcf annotation type with vcfanno. "
        "Supported types are {SUPPORTED_ANNOTATION_TYPES}")
    if anno_type not in SUPPORTED_ANNOTATION_TYPES:
        # pass names explicitly: SUPPORTED_ANNOTATION_TYPES is module-level, so
        # format(**locals()) would not find it
        logger.warn(UNSUPPORTED_TYPE_MESSAGE.format(
            anno_type=anno_type, SUPPORTED_ANNOTATION_TYPES=SUPPORTED_ANNOTATION_TYPES))
        return vcf
    build = dd.get_genome_build(data)
    annodir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.abspath(os.path.join(annodir, os.pardir, "vcfanno"))
    annostem = os.path.join(annodir, build + "-")
    conffn = annostem + anno_type + ".conf"
    luafn = annostem + anno_type + ".lua"
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf

    base = os.path.splitext(vcf)[0]
    out_file = base + anno_type + "-annotated.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    basepath = annodir

    out_file = vcfanno(vcf, out_file, conffn, data, basepath, luafn)
    return out_file
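Note: the path construction above assumes a fixed on-disk layout in which the vcfanno directory sits alongside the reference sequence directory and files are named "<build>-<anno_type>.conf". A short sketch with hypothetical paths:

import os

ref_file = "/genomes/Hsapiens/hg38/seq/hg38.fa"  # hypothetical reference location
annodir = os.path.abspath(os.path.join(os.path.dirname(ref_file), os.pardir, "vcfanno"))
print(annodir)                                   # /genomes/Hsapiens/hg38/vcfanno
print(os.path.join(annodir, "hg38-gemini.conf")) # where a "gemini" config would be looked up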
Example 4
def run_vcfanno(vcf, anno_type, data):
    """
    annotated a VCF file using vcfanno, looks up the proper config/lua scripts
    under the `vcfanno` key under the algorithm section of the datadict,
    skipping if the files cannot be found
    """
    UNSUPPORTED_TYPE_MESSAGE = (
        "{anno_type} is not a supported vcf annotation type with vcfanno. "
        "Supported types are {SUPPORTED_ANNOTATION_TYPES}")
    if anno_type not in SUPPORTED_ANNOTATION_TYPES:
        # pass names explicitly: SUPPORTED_ANNOTATION_TYPES is module-level, so
        # format(**locals()) would not find it
        logger.warn(UNSUPPORTED_TYPE_MESSAGE.format(
            anno_type=anno_type, SUPPORTED_ANNOTATION_TYPES=SUPPORTED_ANNOTATION_TYPES))
        return vcf
    build = dd.get_genome_build(data)
    annodir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.abspath(os.path.join(annodir, os.pardir, "vcfanno"))
    annostem = os.path.join(annodir, build + "-")
    conffn = annostem + anno_type + ".conf"
    luafn = annostem + anno_type + ".lua"
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping."
    )
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf

    base = os.path.splitext(vcf)[0]
    out_file = base + anno_type + "-annotated.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    basepath = annodir

    out_file = vcfanno(vcf, out_file, conffn, data, basepath, luafn)
    return out_file
Example 5
def _make_dir(dir, label):
    if not os.path.exists(dir):
        if not options.dry_run:
            os.makedirs(dir)
        logger.info("Creating %s directory %s" % (label, dir))
    else:
        logger.warn("%s already exists: not creating new directory" % (dir))
Example 6
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                              dd.get_chip_method(data), data["resources"], data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        fractions = list(ATACRanges.keys()) + ["full"]
        for fraction in fractions:
            MIN_READS_TO_CALL = 1000
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            if not bam.has_nalignments(chip_bam, MIN_READS_TO_CALL, data):
                logger.warn(f"{chip_bam} has less than {MIN_READS_TO_CALL}, peak calling will fail so skip this fraction.")
                continue
            logger.info(f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}.")
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None, dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"], data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
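Note: the fraction bookkeeping above leans on two toolz helpers: get_in reads a nested path (returning None when a key is missing) and assoc_in returns a copy of the mapping with a value set at a nested path. A self-contained demo:

import toolz as tz

data = {"atac": {"align": {"full": "sample-full.bam"}}}
print(tz.get_in(("atac", "align", "full"), data))  # sample-full.bam
print(tz.get_in(("atac", "align", "NF"), data))    # None

updated = tz.assoc_in(data, ("peaks_files", "full"), ["peaks.narrowPeak"])
print(updated["peaks_files"]["full"])  # ['peaks.narrowPeak']
print("peaks_files" in data)           # False -- the original mapping is not mutated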
Example 7
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations if not specified:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not conf_files:
        conf_files = _default_conf_files(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    out = []
    annodir = os.path.normpath(os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                                            os.pardir, "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
Example 8
def run(items, background=None):
    """Detect copy number variations from tumor/normal samples using Battenberg.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired or not paired.normal_bam:
        logger.warn(
            "Battenberg only works on paired tumor/normal inputs, skipping %s"
            % dd.get_sample_name(items[0]))
        batout = None
    elif not tz.get_in(["genome_resources", "aliases", "human"],
                       paired.tumor_data):
        logger.warn("Battenberg only works on human data, skipping %s" %
                    dd.get_sample_name(items[0]))
        batout = None
    else:
        batout = _do_run(paired)
        batout["variantcaller"] = "battenberg"
    out = []
    for data in items:
        if batout:
            if "sv" not in data:
                data["sv"] = []
            data["sv"].append(batout)
        out.append(data)
    return out
Example 9
def _make_dir(dir):
    if options.dry_run:
        return
    if not os.path.exists(dir):
        os.makedirs(dir)
        logger.info("Creating delivery directory %s" % (dir))
    else:
        logger.warn("%s already exists: not creating new directory" % (dir))
Example 10
def _make_dir(dir):
    if options.dry_run:
        return
    if not os.path.exists(dir):
        os.makedirs(dir)
        logger.info("Creating delivery directory %s" % (dir))
    else:
        logger.warn("%s already exists: not creating new directory" % (dir))
Example 11
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir,
                     "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file)
                                                and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            conffn, luafn = [
                (x if objectstore.is_remote(x) else None)
                for x in retriever.add_remotes([conffn, luafn], data["config"])
            ]
        if not conffn:
            pass
        elif conf_file in conf_checkers and not conf_checkers[conf_file](
                data, retriever):
            logger.warn(
                "Skipping vcfanno configuration: %s. Not all input files found."
                % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
Example 12
def run(vrn_info, cnvs_by_name, somatic_info):
    """Run PhyloWGS given variant calls, CNVs and tumor/normal information.
    """
    config = {"sample_size": 5000}
    work_dir = _cur_workdir(somatic_info.tumor_data)
    if "battenberg" not in cnvs_by_name:
        logger.warn("PhyloWGS requires Battenberg CNV calls, skipping %s"
                    % dd.get_sample_name(somatic_info.tumor_data))
    else:
        ssm_file, cnv_file = _prep_inputs(vrn_info, cnvs_by_name["battenberg"], somatic_info, work_dir, config)
        evolve_file = _run_evolve(ssm_file, cnv_file, work_dir, somatic_info.tumor_data)
        print(evolve_file, ssm_file, cnv_file)
Example 13
def run(vrn_info, cnvs_by_name, somatic_info):
    """Run PhyloWGS given variant calls, CNVs and tumor/normal information.
    """
    config = {"sample_size": 5000}
    work_dir = _cur_workdir(somatic_info.tumor_data)
    if "battenberg" not in cnvs_by_name:
        logger.warn("PhyloWGS requires Battenberg CNV calls, skipping %s"
                    % dd.get_sample_name(somatic_info.tumor_data))
    else:
        ssm_file, cnv_file = _prep_inputs(vrn_info, cnvs_by_name["battenberg"], somatic_info, work_dir, config)
        evolve_file = _run_evolve(ssm_file, cnv_file, work_dir, somatic_info.tumor_data)
        summary_file = _prepare_summary(evolve_file, ssm_file, cnv_file, work_dir, somatic_info)
        print(summary_file)
Example 14
def run(items, background=None):
    """Detect copy number variations from batched set of samples using GATK4 CNV calling.

    TODO: implement germline calling with DetermineGermlineContigPloidy and GermlineCNVCaller
    """
    if not background: background = []
    paired = vcfutils.get_paired(items + background)
    if paired:
        out = _run_paired(paired)
    else:
        out = items
        logger.warn("GATK4 CNV calling currently only available for somatic samples: %s" %
                    ", ".join([dd.get_sample_name(d) for d in items + background]))
    return out
Example 15
def run(items, background=None):
    """Detect copy number variations from batched set of samples using GATK4 CNV calling.

    TODO: implement germline calling with DetermineGermlineContigPloidy and GermlineCNVCaller
    """
    if not background: background = []
    paired = vcfutils.get_paired(items + background)
    if paired:
        out = _run_paired(paired)
    else:
        out = items
        logger.warn("GATK4 CNV calling currently only available for somatic samples: %s" %
                    ", ".join([dd.get_sample_name(d) for d in items + background]))
    return out
Example 16
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["aligner", "picard", "samtools"],
                            ensure_mem={"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8}),
                    samples, config, dirs, "alignment",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)

    with prun.start(_wres(parallel, ["dexseq", "express"]), samples, config,
                    dirs, "rnaseqcount-singlethread", max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)

    samples = rnaseq.combine_files(samples)
    with prun.start(_wres(parallel, ["gatk", "vardict"]), samples, config,
                    dirs, "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)

    with prun.start(_wres(parallel, ["samtools", "fastqc", "qualimap",
                                     "kraken", "gatk", "preseq"], ensure_mem={"qualimap": 4}),
                    samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
        with profile.report("bcbioRNAseq loading", dirs):
            tools_on = dd.get_in_samples(samples, dd.get_tools_on)
            bcbiornaseq_on = tools_on and "bcbiornaseq" in tools_on
            if bcbiornaseq_on:
                if len(samples) < 3:
                    logger.warn("bcbioRNASeq needs at least three samples total, skipping.")
                elif len(samples) > 100:
                    logger.warn("Over 100 samples, skipping bcbioRNASeq.")
                else:
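                    # note: reuses `sample` from the upload loop above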
                    run_parallel("run_bcbiornaseqload", [sample])
    logger.info("Timing: finished")
    return samples
Example 17
def get_version(config):
    try:
        pdir = config_utils.get_program(program_name, config, "dir")
    # not configured
    except ValueError:
        return ""
    jar = os.path.basename(config_utils.get_jar(jar_name, pdir))
    for to_remove in [jar_name, ".jar", "-standalone"]:
        jar = jar.replace(to_remove, "")
    if jar.startswith(("-", ".")):
        jar = jar[1:]
    if not jar:
        logger.warn("Unable to determine version for program '{}' from jar file {}".format(
            program_name, config_utils.get_jar(jar_name, pdir)))
    return jar
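Note: the version is recovered purely by stripping known substrings from the jar file name. A standalone trace of the same loop with a hypothetical jar name:

jar_name = "GenomeAnalysisTK"                  # hypothetical
jar = "GenomeAnalysisTK-3.8-standalone.jar"    # hypothetical jar file name
for to_remove in [jar_name, ".jar", "-standalone"]:
    jar = jar.replace(to_remove, "")
if jar.startswith(("-", ".")):
    jar = jar[1:]
print(jar)  # 3.8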
Example 18
def get_version(config):
    try:
        pdir = config_utils.get_program(program_name, config, "dir")
    # not configured
    except ValueError:
        return ""
    jar = os.path.basename(config_utils.get_jar(jar_name, pdir))
    for to_remove in [jar_name, ".jar", "-standalone"]:
        jar = jar.replace(to_remove, "")
    if jar.startswith(("-", ".")):
        jar = jar[1:]
    if not jar:
        logger.warn("Unable to determine version for program '{}' from jar file {}".format(
            program_name, config_utils.get_jar(jar_name, pdir)))
    return jar
Example 19
File: lane.py Project: vals/bcbb
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """

    lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lane_items[0], fc_name, config=config)

    # Filter phiX
    custom_config = _update_config_w_custom(config, lane_items[0])
    if custom_config["algorithm"].get("filter_phix", False):
        # If we are starting from demultiplexed material, we will skip a lane-wise screening
        # Screening will be performed on a sample basis
        if custom_config["algorithm"].get("demultiplexed",False):
            logger.warn("Will not filter phix lane-wise on already demultiplexed files. You will have to specify genomes_filter_out option for each sample")
        else:
            logger.info("Filtering phiX from %s" % lane_name)
            info = {"genomes_filter_out": "spiked_phix", "description": lane_name}
            processed = remove_contaminants(full_fastq1, full_fastq2, info, lane_name, info["description"], dirs, custom_config)
            (full_fastq1, full_fastq2, _, lane_name) = processed[0][0:4]

    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)

    out = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        if item["barcode_id"] in bc_files:
            fastq1, fastq2 = bc_files[item["barcode_id"]]
            cur_lane_name = lane_name
            cur_lane_desc = item["description"]
            if item.get("name", "") and config["algorithm"].get("include_short_name", True):
                cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
            if item["barcode_id"] is not None:
                cur_lane_name += "_%s" % (item["barcode_id"])
            if config["algorithm"].get("trim_reads", False):
                trim_info = brun_trim_fastq([x for x in [fastq1, fastq2] if x is not None],
                                            dirs, config)
                fastq1 = trim_info[0]
                if fastq2 is not None:
                    fastq2 = trim_info[1]
            out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                        dirs, config))

    return out
Example 20
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            conffn, luafn = [(x if objectstore.is_remote(x) else None)
                             for x in retriever.add_remotes([conffn, luafn], data["config"])]
        if not conffn:
            pass
        elif conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever):
            logger.warn("Skipping vcfanno configuration: %s. Not all input files found." % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
Example 21
def remove_multimappers(bam_file, data):
    aligner = dd.get_aligner(data)
    if aligner:
        if aligner == "bowtie2":
            filterer = bowtie2.filter_multimappers
        elif aligner == "bwa":
            filterer = bwa.filter_multimappers
        else:
            logger.error("ChIP-seq only supported for bowtie2 and bwa.")
            sys.exit(-1)
        unique_bam = filterer(bam_file, data)
    else:
        unique_bam = bam_file
        logger.warn(
            "When a BAM file is given as input, bcbio skips removal of "
            "multimappers.")
    return unique_bam
Example 22
def _deliver_file(src, tgt):
    if options.move:
        deliver_fn = shutil.move
    else:
        deliver_fn = shutil.copyfile
    if src is None:
        return
    if not os.path.exists(src):
        return
    if os.path.exists(tgt):
        logger.warn("%s already exists: not doing anything!" % (tgt))
        return
    if options.dry_run:
        print("DRY_RUN: %s file %s to %s" % (deliver_fn.__name__, src, tgt))
    else:
        logger.info("%s file %s to %s" % (deliver_fn.__name__, src, tgt))
        deliver_fn(src, tgt)
Example 23
def _deliver_file(src, tgt):
    if options.move:
        deliver_fn = shutil.move
    else:
        deliver_fn = shutil.copyfile
    if src is None:
        return
    if not os.path.exists(src):
        return
    if os.path.exists(tgt):
        logger.warn("%s already exists: not doing anything!" % (tgt))
        return
    if options.dry_run:
        print("DRY_RUN: %s file %s to %s" % (deliver_fn.__name__, src, tgt))
    else:
        logger.info("%s file %s to %s" % (deliver_fn.__name__, src, tgt))
        deliver_fn(src, tgt)
Example 24
def _handle_data(src, tgt, f=shutil.copyfile, f2=None):
    if options.only_run_info:
        return
    if src is None:
        return
    if os.path.exists(tgt):
        logger.warn("%s already exists: not doing anything!" % (tgt))
        return
    if options.dry_run:
        print("DRY_RUN: %s file %s to %s" % (f.__name__, src, tgt))
        if f2 is not None:
            print("DRY_RUN: %s file %s to %s" % (f2.__name__, tgt, src))
    else:
        logger.info("%s file %s to %s" % (f.__name__, src, tgt))
        f(src, tgt)
        if f2 is not None:
            logger.info("%s file %s to %s" % (f2.__name__, tgt, src))
            f2(tgt, src)
Example 25
def _handle_data(src, tgt, f=shutil.copyfile, f2=None):
    if options.only_run_info:
        return
    if src is None:
        return
    if os.path.exists(tgt):
        logger.warn("%s already exists: not doing anything!" %(tgt))
        return
    if options.dry_run:
        print "DRY_RUN: %s file %s to %s" % (f.__name__, src, tgt)
        if not f2 is None:
            print "DRY_RUN: %s file %s to %s" % (f2.__name__, tgt, src)
    else:
        logger.info("%s file %s to %s" % (f.__name__, src, tgt))
        f(src, tgt)
        if not f2 is None:
            logger.info("%s file %s to %s" % (f2.__name__, tgt, src))
            f2(tgt, src)
Example 26
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """
    annotated a VCF file using vcfanno, looks up the proper config/lua scripts
    under the `vcfanno` key under the algorithm section of the datadict,
    skipping if the files cannot be found
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if not conf_fns:
        return vcf
    if not anno_type:
        anno_type = "gemini"
    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file

    out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath, lua_fns)
    return out_file
Example 27
def _get_machine_info(parallel, run_parallel, sys_config):
    """Get machine resource information from the job scheduler via either the command-line or the queue.
    """
    if parallel.get("queue") and parallel.get("scheduler"):
        # dictionary as switch statement; can add new scheduler implementation functions as (lowercase) keys
        sched_info_dict = {
            "slurm": _slurm_info,
        }
        try:
            return sched_info_dict[parallel["scheduler"].lower()](parallel["queue"])
        except KeyError:
            logger.info("Resource query function not implemented for scheduler \"{0}\"; "
                         "submitting job to queue".format(parallel["scheduler"]))
        except:
            # If something goes wrong, just hit the queue
            logger.warn("Couldn't get machine information from resource query function for queue "
                        "'{0}' on scheduler \"{1}\"; "
                         "submitting job to queue".format(parallel["queue"], parallel["scheduler"]))
    return run_parallel("machine_info", [[sys_config]])
Example 28
def run(items, background=None):
    """Detect copy number variations from tumor/normal samples using Battenberg.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired or not paired.normal_bam:
        logger.warn("Battenberg only works on paired tumor/normal inputs, skipping %s"
                    % dd.get_sample_name(items[0]))
        batout = None
    elif not tz.get_in(["genome_resources", "aliases", "human"], paired.tumor_data):
        logger.warn("Battenberg only works on human data, skipping %s"
                    % dd.get_sample_name(items[0]))
        batout = None
    else:
        batout = _do_run(paired)
        batout["variantcaller"] = "battenberg"
    out = []
    for data in items:
        if batout and dd.get_sample_name(data) == paired.tumor_name:
            if "sv" not in data:
                data["sv"] = []
            data["sv"].append(batout)
        out.append(data)
    return out
Example 29
def make_quality_report(data):
    """ create and render the bcbioRNASeq quality report """
    MAX_SAMPLES = 100
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    nsamples = len(list(Path(upload_dir).rglob('quant.sf')))
    groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
    safe_makedir(report_dir)
    quality_rmd = os.path.join(report_dir, "quality_control.Rmd")
    quality_html = os.path.join(report_dir, "quality_control.html")
    quality_rmd = rmarkdown_draft(quality_rmd, "01-quality-control",
                                  "bcbioRNASeq")
    if nsamples > MAX_SAMPLES and not groups:
        logger.warn(
            f"{nsamples} samples detected, disabling a few bcbioRNASeq plots which break "
            f"with many samples. Setting `interesting_groups` would allow these plots "
            f"to be created.")
        quality_rmd = many_samples_workaround(quality_rmd)
    if not file_exists(quality_html):
        render_rmarkdown_file(quality_rmd)
    return data
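Note: the sample count above comes from recursively globbing the upload directory for quant.sf files (one per quantified sample); pathlib's rglob walks the whole tree. A small demo with a hypothetical directory:

from pathlib import Path

upload_dir = Path("final")  # hypothetical upload directory
nsamples = len(list(upload_dir.rglob("quant.sf")))
print(f"{nsamples} quantified samples found under {upload_dir}")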
Example 30
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """
    annotated a VCF file using vcfanno, looks up the proper config/lua scripts
    under the `vcfanno` key under the algorithm section of the datadict,
    skipping if the files cannot be found
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                            os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if conf_fns:
        if not anno_type:
            anno_type = "gemini"
        out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
        if not utils.file_exists(out_file):
            out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath, lua_fns)
        return vcfutils.bgzip_and_index(out_file, data["config"])
Example 31
def run_vcfanno(vcf, anno_type, data, data_basepath=None):
    """
    annotated a VCF file using vcfanno, looks up the proper config/lua scripts
    under the `vcfanno` key under the algorithm section of the datadict,
    skipping if the files cannot be found
    """
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                            os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conffn = os.path.join(annodir, anno_type + ".conf")
    luafn = os.path.join(annodir, anno_type + ".lua")
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf

    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file

    out_file = vcfanno(vcf, out_file, conffn, data, data_basepath or basepath, luafn)
    return out_file
Example 32
def _check_mutect_version(broad_runner):
    mutect_version = broad_runner.get_mutect_version()
    try:
        assert mutect_version is not None
    except AssertionError:
        logger.warn("WARNING")
        logger.warn("MuTect version could not be determined from jar file. "
                    "Please ensure you are using at least version 1.1.5, "
                    "as versions 1.1.4 and lower have known issues.")
        logger.warn("Proceeding but assuming correct version 1.1.5.")
    else:
        try:
            assert LooseVersion(mutect_version) >= LooseVersion("1.1.5")
        except AssertionError:
            message = ("MuTect 1.1.4 and lower is known to have incompatibilities "
                       "with Java < 7, and this may lead to problems in analyses. "
                       "Please use MuTect 1.1.5 or higher (note that it requires "
                       "Java 7).")
            raise ValueError(message)
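Note: LooseVersion compares dotted version strings component-wise rather than lexically, which is what makes the check above safe for multi-digit components:

from distutils.version import LooseVersion

print(LooseVersion("1.1.5") >= LooseVersion("1.1.5"))   # True
print(LooseVersion("1.1.10") >= LooseVersion("1.1.5"))  # True; a plain string comparison would say False
print(LooseVersion("1.1.4") >= LooseVersion("1.1.5"))   # False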
Example 33
def main(run, project_id, sample_names, config_file, Map_Stat, Read_Dist, FPKM, rRNA_table):

    TEMPLATE = """\
RNA-seq analysis report for ${project_id}
=============================================
   
${latex_opt}

Summary
-------------------------
**Project name:**
${project_id} (UPPMAX project ${uppnex})

**Samples:**
${samplenames}
    
**Run name:**
${runname}

**Mapping:** 
${mapping}
    
**Duplicate removal:**
${dup_rem}
    
**Read count:**
${read_count}
    
**RPKM/FPKM values:**
${quantifyer}
    
**Result directories on UPPMAX:** /proj/${uppnex}/INBOX/${project_id}/analysis/alignments (BAM files), /proj/${uppnex}/INBOX/${project_id}/analysis/quantification (FPKM files)

.. raw:: latex
       
   \clearpage    
    
Results
-------------------------"""
    
    if Map_Stat:
        TEMPLATE = TEMPLATE + """
Mapping statistics
^^^^^^^^^^^^^^^^^^
${Mapping_statistics}
    
Comments
~~~~~~~~
    
**tot # read pairs:** 

The total number of read pairs indicates the total number of sequenced paired-end reads. Since a paired-end read is made up of two sequenced fragments (mates), the total number of sequenced 100-bp regions is twice the number shown in this column.
    
**% mapped reads:**
    
The number of fragments that are mapped relative to the total number of sequenced fragments. 
    
**% reads left after dup rem:**
    
We remove duplicate reads (paired-end reads where both mates map to the same loci as both mates in a different paired-end read) because these are likely to be artifacts caused by PCR amplification or over-sequencing. Aligned files in BAM format with duplicates removed can be found in /proj/${uppnex}/INBOX/${project_id}/analysis/alignments.


.. raw:: latex
       
   \clearpage

"""
    
    TEMPLATE = TEMPLATE + """
Expression values
^^^^^^^^^^^^^^^^^
    
The /proj/${uppnex}/INBOX/${project_id}/analysis/quantification folder contains FPKM values calculated with the Cufflinks program using ENSEMBL annotation of genes and transcripts for each sample. These files also contain the upper and lower limits of the confidence interval for the FPKM estimate. FPKM values are the paired-end equivalent of RPKM (Reads Per Kilobase per Million mapped reads), the standard measure for gene expression in RNA-seq.
    
There is also a single fpkm_table.txt file, which contains all of the FPKM values. This can be opened in Excel or a regular text processing application.
    
For analyzing differential expression of genes or transcripts, it may be useful to have the raw read counts (the number of sequences that map to each gene/transcript) as well. These are calculated using the HTSeq software and are collected into a table called count_table.txt.


.. raw:: latex
       
   \clearpage

"""
    
    
    if FPKM:
        TEMPLATE = TEMPLATE + """
FPKM heatmap
^^^^^^^^^^^^^^^^^
This heatmap shows the (Pearson) correlation between FPKM values of samples. 
    
${FPKM_heatmap}
    
    
.. raw:: latex
       
   \clearpage
    
FPKM PCA
^^^^^^^^^^^^^^^^^
This PCA (principal component analysis) score plot has the samples plotted according to their scores for the two principal components that explain the largest amount of variance in the FPKM data table. The number after 'expl var' in the axis labels tells you how much of the variance is explained by each component. Similar samples should, in theory, cluster together in the PCA plot. PCA is a way to compress the information in your high-dimensional data matrix so that it can be plotted in two dimensions.
    
${FPKM_PCAplot}


.. raw:: latex
       
   \clearpage

"""
    
    if Read_Dist:
        TEMPLATE = TEMPLATE + """
Read distribution
^^^^^^^^^^^^^^^^^
This table contains information about the extent to which sequences from each sample mapped to different structural parts of genes, like coding exons, untranslated regions, and transcription start sites. The actual number itself is less important than the relative values for the different kinds of regions. For a normal RNA-seq experiment you should have a higher value in the CDS Exon column than in the others, for example. "CDS Exon" means "coding sequence exons", "UTR" stands for "untranslated region", "TES" stands for "transcription end site", "TSS" stands for "transcription start site". "Intronic regions" should be interpreted as "intronic or intergenic regions".
Perhaps the most easily interpretable column is the final column, mRNA fraction, which gives the fraction [0-1] of sequences that mapped to ENSEMBL-annotated mRNA (including coding regions and UTRs). While this fraction is not completely accurate (because ENSEMBL does not completely describe the transcriptome), it is a useful summary statistic which should be relatively high for an mRNA-seq experiment, typically above 0.8.

${Read_Distribution}


.. raw:: latex
       
   \clearpage

"""
    
    if rRNA_table:
        TEMPLATE = TEMPLATE + """
Quantification of rRNA present in the samples
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    
${rRNA_table}


.. raw:: latex
       
   \clearpage

"""

    sphinx_defs = []

    if config_file:
        config = load_config(config_file)
    else:
        config = {}

    sphinx_defs.append("('%s', '%s_analysis.tex', 'RNA-seq Analysis Report', u'SciLifeLab Stockholm', 'howto'),\n"  % (project_id, project_id))
    projectfile = "%s.mako" % (project_id) 
    fp = open(projectfile, "w")
    fp.write(TEMPLATE)
    fp.close()
    mylookup = TemplateLookup(directories=['./'])
    tmpl = Template(filename=projectfile, lookup=mylookup)

    proj_conf = {
        'id': project_id,
        'run': run,
        'config': config,
        'samples': sample_names.split(',')
    }

    d = generate_report(proj_conf)
    rstfile = "%s.rst" % (project_id)
    fp = open(rstfile, "w")
    fp.write(tmpl.render(**d))
    fp.close()

    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        logger.warn("no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
    else:
        fp = open(sphinxconf)
        lines = fp.readlines()
        fp.close()
        sdout = []
        modify_conf = False
        for sd in sphinx_defs:
            if sd not in lines:
                sdout.append(sd)
                modify_conf = True
        if modify_conf:
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i+3] + sdout + lines[i+3:]
            fp = open("conf.py", "w")
            fp.write("".join(newconf))
            fp.close()
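Note: the report flow above writes the assembled template to a .mako file and renders it with Mako. The rendering step in isolation, with a hypothetical project id:

from mako.template import Template

tmpl = Template("RNA-seq analysis report for ${project_id}")
print(tmpl.render(project_id="J.Doe_11_01"))  # RNA-seq analysis report for J.Doe_11_01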
Example 34
def main(run, project_id, sample_names, single_end, config_file, Map_Stat,
         Read_Dist, FPKM, rRNA_table):
    TEMPLATE = """\
RNA-seq analysis report for ${project_id}
=============================================
   
${latex_opt}

Summary
-------------------------
**Project name:**
${project_id} (UPPMAX project ${uppnex})

**Samples:**
${samplenames}
    
**Run name:**
${runname}

**Mapping:** 
${mapping}
    
**Duplicate removal:**
${dup_rem}
    
**Read count:**
${read_count}
    
**RPKM/FPKM values:**
${quantifyer}
    
**Result directories on UPPMAX:** /proj/${uppnex}/INBOX/${project_id}/analysis/alignments (BAM files), /proj/${uppnex}/INBOX/${project_id}/analysis/quantification (FPKM files)

.. raw:: latex
       
   \clearpage    
    
Results
-------------------------"""

    if Map_Stat:
        TEMPLATE = TEMPLATE + """
Mapping statistics
^^^^^^^^^^^^^^^^^^
${Mapping_statistics}
    
Comments
~~~~~~~~
    
**Tot # reads:** 

If paired-end reads, the total number of reads indicates the total number of sequenced paired-end reads. Since a paired-end read is made up of two sequenced fragments (mates), the total number of sequenced 100-bp regions is twice the number shown in this column.

If single-end reads, this column reflects the total number of sequences.

**% mapped reads:**
    
The number of fragments that are mapped relative to the total number of sequenced fragments. 
    
**% reads left after dup rem:**
    
We remove duplicate reads (if paired-end, duplicates are defined as paired-end reads where both mates map to the same loci as both mates in a different paired-end read) because these are likely to be artifacts caused by PCR amplification or over-sequencing. Aligned files in BAM format with duplicates removed can be found in /proj/${uppnex}/INBOX/${project_id}/analysis/alignments.


.. raw:: latex
       
   \clearpage

"""

    TEMPLATE = TEMPLATE + """
Expression values
^^^^^^^^^^^^^^^^^
    
The /proj/${uppnex}/INBOX/${project_id}/analysis/quantification folder contains FPKM values calculated with the Cufflinks program using ENSEMBL annotation of genes and transcripts for each sample. These files also contain the upper and lower limits of the confidence interval for the FPKM estimate. FPKM values are the paired-end equivalent of RPKM (Reads Per Kilobase per Million mapped reads), the standard measure for gene expression in RNA-seq.
    
There is also a single fpkm_table.txt file, which contains all of the FPKM values. This can be opened in Excel or a regular text processing application.
    
For analyzing differential expression of genes or transcripts, it may be useful to have the raw read counts (the number of sequences that map to each gene/transcript) as well. These are calculated using the HTSeq software and are collected into a table called count_table.txt.


.. raw:: latex
       
   \clearpage

"""

    if FPKM:
        TEMPLATE = TEMPLATE + """
FPKM heatmap
^^^^^^^^^^^^^^^^^
This heatmap shows the (Pearson) correlation between FPKM values of samples. 
    
${FPKM_heatmap}
    
    
.. raw:: latex
       
   \clearpage
    
FPKM PCA
^^^^^^^^^^^^^^^^^
This PCA (principal component analysis) score plot has the samples plotted according to their scores for the two principal components that explain the largest amount of variance in the FPKM data table. The number after 'expl var' in the axis labels tells you how much of the variance is explained by each component. Similar samples should, in theory, cluster together in the PCA plot. PCA is a way to compress the information in your high-dimensional data matrix so that it can be plotted in two dimensions.
    
${FPKM_PCAplot}


.. raw:: latex
       
   \clearpage

"""
    if Read_Dist:
        TEMPLATE = TEMPLATE + """
Read distribution
^^^^^^^^^^^^^^^^^
This table contains information about the extent to which sequences from each sample mapped to different structural parts of genes, like coding exons, untranslated regions, and transcription start sites. The actual number itself is less important than the relative values for the different kinds of regions. For a normal RNA-seq experiment you should have a higher value in the CDS Exon column than in the others, for example. "CDS Exon" means "coding sequence exons", "UTR" stands for "untranslated region", "TES" stands for "transcription end site", "TSS" stands for "transcription start site". "Intronic regions" should be interpreted as "intronic or intergenic regions".
Perhaps the most easily interpretable column is the final column, mRNA fraction, which gives the fraction [0-1] of sequences that mapped to ENSEMBL-annotated mRNA (including coding regions and UTRs). While this fraction is not completely accurate (because ENSEMBL does not completely describe the transcriptome), it is a useful summary statistic which should be relatively high for an mRNA-seq experiment, typically above 0.8.

${Read_Distribution}


.. raw:: latex
       
   \clearpage

"""

    if rRNA_table:
        TEMPLATE = TEMPLATE + """
Quantification of rRNA present in the samples
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    
${rRNA_table}


.. raw:: latex
       
   \clearpage

"""

    sphinx_defs = []

    if config_file:
        config = load_config(config_file)
    else:
        config = {}

    sphinx_defs.append(
        "('%s', '%s_analysis.tex', 'RNA-seq Analysis Report', u'SciLifeLab Stockholm', 'howto'),\n"
        % (project_id, project_id))
    projectfile = "%s.mako" % (project_id)
    fp = open(projectfile, "w")
    fp.write(TEMPLATE)
    fp.close()
    mylookup = TemplateLookup(directories=['./'])
    tmpl = Template(filename=projectfile, lookup=mylookup)

    proj_conf = {
        'id': project_id,
        'run': run,
        'config': config,
        'samples': sample_names.split(',')
    }

    d = generate_report(proj_conf, single_end)
    rstfile = "%s.rst" % (project_id)
    fp = open(rstfile, "w")
    fp.write(tmpl.render(**d))
    fp.close()

    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        logger.warn(
            "no sphinx configuration file conf.py found: you have to edit conf.py yourself!"
        )
    else:
        fp = open(sphinxconf)
        lines = fp.readlines()
        fp.close()
        sdout = []
        modify_conf = False
        for sd in sphinx_defs:
            if sd not in lines:
                sdout.append(sd)
                modify_conf = True
        if modify_conf:
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i + 3] + sdout + lines[i + 3:]
            fp = open("conf.py", "w")
            fp.write("".join(newconf))
            fp.close()
Example 35
def _mutect_call_prep(align_bams,
                      items,
                      ref_file,
                      assoc_files,
                      region=None,
                      out_file=None):
    """
    Preparation work for MuTect.
    """

    #FIXME: We assume all other bits in the config are shared

    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")

    broad_runner = broad.runner_from_config(base_config, "mutect")

    mutect_version = broad_runner.get_mutect_version()

    try:
        assert mutect_version is not None
    except AssertionError:
        logger.warn("WARNING")
        logger.warn("MuTect version could not be determined from jar file. "
                    "Please ensure you are using at least version 1.1.5, "
                    "as versions 1.1.4 and lower have known issues.")
        logger.warn("Proceeding but assuming correct version 1.1.5.")
    else:
        try:
            assert LooseVersion(mutect_version) >= LooseVersion("1.1.5")
        except AssertionError:
            message = (
                "MuTect 1.1.4 and lower is known to have incompatibilities "
                "with Java < 7, and this may lead to problems in analyses. "
                "Please use MuTect 1.1.5 or higher (note that it requires "
                "Java 7).")
            raise ValueError(message)

    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)

    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)

    #FIXME: Add more parameters like fraction contamination etc

    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]

    tumor_bam = None
    normal_bam = None

    for bamfile, item in zip(align_bams, items):

        metadata = item["metadata"]

        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]

    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) "
                         "in samples")

    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]

    if cosmic is not None:
        params += ["--cosmic", cosmic]

    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]

    return broad_runner, params
Example 36
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """
    Preparation work for MuTect.
    """

    #FIXME: We assume all other bits in the config are shared

    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")

    broad_runner = broad.runner_from_config(base_config, "mutect")

    mutect_version = broad_runner.get_mutect_version()

    try:
        assert mutect_version is not None
    except AssertionError:
        logger.warn("WARNING")
        logger.warn("MuTect version could not be determined from jar file. "
                    "Please ensure you are using at least version 1.1.5, "
                    "as versions 1.1.4 and lower have known issues.")
        logger.warn("Proceeding but assuming correct version 1.1.5.")
    else:
        try:
            assert LooseVersion(mutect_version) >= LooseVersion("1.1.5")
        except AssertionError:
            message =  ("MuTect 1.1.4 and lower is known to have incompatibilities "
                        "with Java < 7, and this may lead to problems in analyses. "
                        "Please use MuTect 1.1.5 or higher (note that it requires "
                        "Java 7).")
            raise ValueError(message)

    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)

    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)

    #FIXME: Add more parameters like fraction contamination etc

    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]

    tumor_bam = None
    normal_bam = None

    for bamfile, item in zip(align_bams, items):

        metadata = item["metadata"]

        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]

    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) "
                         "in samples")

    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]

    if cosmic is not None:
        params += ["--cosmic", cosmic]

    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]

    return broad_runner, params