Example #1
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))

    return _parse_qualimap_metrics(report_file)
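The `cmd.format(**locals())` idiom above (and in many of the examples below) expands a command template from local variable names just before execution. A minimal, self-contained sketch of the same pattern with plain `subprocess`; the `wc -l` command and file names are placeholders, not bcbio helpers:

import subprocess

def run_line_count(in_file, out_file, options=""):
    """Toy example: fill a shell template from local variables and run it."""
    cmd = "wc -l {in_file} {options} > {out_file}"
    # Same idiom as above: the template is expanded from locals() right before running.
    subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file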
Example #2
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = flowcell.parse_dirname(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_details = []
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        for item in galaxy_info["details"]:
            item["upload"] = {"method": "galaxy", "run_id": galaxy_info["run_id"],
                              "fc_name": fc_name, "fc_date": fc_date}
            run_details.append(item)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        item = add_reference_resources(item)
        out.append(item)
    return out
Example #3
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        if utils.get_in(item, ("config", "resources", "tmp", "dir")):
            utils.safe_makedir(utils.get_in(item, ("config", "resources", "tmp", "dir")))
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        out.append(item)
    return out
Example #4
def run_prepare(*data):
    """
    Run seqcluster prepare to merge all samples into one file
    """
    out_dir = os.path.join(dd.get_work_dir(data[0][0]), "seqcluster", "prepare")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = os.path.join(out_dir, "prepare")
    tools = dd.get_expression_caller(data[0][0])
    if len(tools) == 0:
        logger.info("No additional expression caller was specified. "
                    "You can add one to the YAML file, for example: "
                    "expression_caller: [trna, seqcluster, mirdeep2]")
    fn = []
    for sample in data:
        name = sample[0]["rgnames"]['sample']
        fn.append("%s\t%s" % (sample[0]['collapse'], name))
    args = namedtuple('args', 'debug print_debug minc minl maxl out')
    args = args(False, False, 2, 17, 40, out_dir)
    ma_out = op.join(out_dir, "seqs.ma")
    seq_out = op.join(out_dir, "seqs.fastq")
    min_shared = max(int(len(fn) / 10.0), 1)
    if not file_exists(ma_out):
        seq_l, sample_l = prepare._read_fastq_files(fn, args)
        with file_transaction(ma_out) as ma_tx:
            with open(ma_tx, 'w') as ma_handle:
                with open(seq_out, 'w') as seq_handle:
                    prepare._create_matrix_uniq_seq(sample_l, seq_l, ma_handle, seq_handle, min_shared)

    for sample in data:
        sample[0]["seqcluster_prepare_ma"] = ma_out
        sample[0]["seqcluster_prepare_fastq"] = seq_out
    return data
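The `namedtuple` trick in run_prepare builds a lightweight stand-in for an argparse-style options object when calling a library function directly. A minimal sketch of that pattern, mirroring the field names used above; the values here are only illustrative:

from collections import namedtuple

# An immutable object with attribute access, mimicking what argparse would return.
Options = namedtuple("Options", "debug print_debug minc minl maxl out")
opts = Options(debug=False, print_debug=False, minc=2, minl=17, maxl=40, out="prepare")
print(opts.minl, opts.out)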
Example #5
File: do.py Project: zeneofa/bcbio
def run_memory_retry(cmd, descr, data=None, check=None, region=None):
    """Run command, retrying when detecting fail due to memory errors.

    This is useful for high throughput Java jobs which fail
    intermittently due to an inability to get system resources.
    """
    max_runs = 5
    num_runs = 0
    while 1:
        try:
            run(cmd, descr, data, check, region=region, log_error=False)
            break
        except subprocess.CalledProcessError as msg:
            if num_runs < max_runs and ("insufficient memory" in str(msg) or
                                        "did not provide enough memory" in str(msg) or
                                        "A fatal error has been detected" in str(msg) or
                                        "java.lang.OutOfMemoryError" in str(msg) or
                                        "Resource temporarily unavailable" in str(msg)):
                logger.info("Retrying job. Memory or resource issue with run: %s"
                            % _descr_str(descr, data, region))
                time.sleep(30)
                num_runs += 1
            else:
                logger.exception()
                raise
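run_memory_retry retries a command when its failure output looks like a transient resource problem. A generic, self-contained sketch of the same retry pattern using only the standard library; the matched error strings and retry count are illustrative, and `cmd` is any shell command:

import logging
import subprocess
import time

RETRYABLE = ("insufficient memory", "OutOfMemoryError",
             "Resource temporarily unavailable")

def run_with_retry(cmd, max_runs=5, wait=30):
    """Retry a shell command when its output suggests a transient resource failure."""
    for attempt in range(1, max_runs + 1):
        try:
            return subprocess.run(cmd, shell=True, check=True,
                                  capture_output=True, text=True)
        except subprocess.CalledProcessError as exc:
            output = (exc.stdout or "") + (exc.stderr or "")
            if attempt < max_runs and any(msg in output for msg in RETRYABLE):
                logging.info("Retrying after resource error (attempt %d): %s", attempt, cmd)
                time.sleep(wait)
            else:
                raise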
Example #6
File: rseqc.py Project: roryk/bipy
def genebody_coverage2(in_file, config, out_prefix=None):
    """
    Check the 5'/3' bias across transcripts: takes a BAM file,
    converts it to bigWig and runs geneBody_coverage2.py on the result.
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file
Example #7
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    Remove reads shorter than a minimum length from a pair of fastq files.
    Drops both ends of a pair if either end falls below the threshold,
    while maintaining the order of the reads.
    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            else:
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_format))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_format))

    return [fq1_out, fq2_out]
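filter_reads_by_length keeps a pair only when both mates pass the length cutoff, relying on the two FASTQ files being in the same order. A minimal stand-alone sketch of that pairing logic using plain 4-line FASTQ parsing instead of Biopython; file paths here are placeholders:

def fastq_records(path):
    """Yield (header, seq, plus, qual) tuples from a 4-line-per-record FASTQ file."""
    with open(path) as handle:
        args = [iter(handle)] * 4
        for rec in zip(*args):
            yield tuple(line.rstrip("\n") for line in rec)

def filter_pairs_by_length(fq1, fq2, out1, out2, min_length=20):
    """Keep a read pair only when both mates meet the minimum length."""
    with open(out1, "w") as o1, open(out2, "w") as o2:
        for r1, r2 in zip(fastq_records(fq1), fastq_records(fq2)):
            if len(r1[1]) >= min_length and len(r2[1]) >= min_length:
                o1.write("\n".join(r1) + "\n")
                o2.write("\n".join(r2) + "\n")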
Example #8
File: fastqc.py Project: roryk/bipy
def detect_fastq_format(in_file, MAX_RECORDS=1000000):
    """
    Detect the quality-score format of a fastq file.
    Returns multiple formats if more than one encoding is possible.
    """
    logger.info("Detecting FASTQ format on %s." % (in_file))
    kept = list(_FASTQ_RANGES.keys())
    with open(in_file) as in_handle:
        records_read = 0
        for i, line in enumerate(in_handle):
            # get the quality line
            if records_read >= MAX_RECORDS:
                break
            if i % 4 == 3:
                records_read += 1
                # strip the newline so it is not counted as a quality character
                for c in line.rstrip():
                    if len(kept) == 1:
                        return kept
                    # iterate over a copy so removal from kept is safe
                    for form in list(kept):
                        if (_FASTQ_RANGES[form][0] > ord(c) or
                            _FASTQ_RANGES[form][1] < ord(c)):
                            kept.remove(form)

    return kept
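detect_fastq_format narrows a set of candidate encodings by checking every observed quality character against per-format ASCII ranges (the `_FASTQ_RANGES` table, which is not shown here). A self-contained sketch of the same idea; the ranges below are approximate illustrations, not the module's actual table:

# Approximate ASCII ranges for common FASTQ quality encodings (illustrative values).
QUAL_RANGES = {"sanger": (33, 74), "illumina-1.3+": (64, 104)}

def candidate_formats(in_file, max_records=10000):
    """Return the encodings whose ASCII range covers every observed quality character."""
    kept = set(QUAL_RANGES)
    with open(in_file) as handle:
        for i, line in enumerate(handle):
            if i // 4 >= max_records:
                break
            if i % 4 == 3:  # quality line of a 4-line FASTQ record
                for c in line.rstrip("\n"):
                    for form in list(kept):  # copy so we can discard while iterating
                        lo, hi = QUAL_RANGES[form]
                        if not lo <= ord(c) <= hi:
                            kept.discard(form)
    return sorted(kept)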
Example #9
def _merge_metrics(samples, out_dir):
    """Merge metrics from multiple QC steps
    """
    logger.info("summarize metrics")
    out_dir = utils.safe_makedir(os.path.join(out_dir, "report", "metrics"))
    sample_metrics = collections.defaultdict(dict)
    for s in samples:
        s = _add_disambiguate(s)
        m = tz.get_in(['summary', 'metrics'], s)
        if isinstance(m, six.string_types):
            m = json.loads(m)
        if m:
            for me in m.keys():
                if isinstance(m[me], list) or isinstance(m[me], dict) or isinstance(m[me], tuple):
                    m.pop(me, None)
            sample_metrics[dd.get_sample_name(s)].update(m)
    out = []
    for sample_name, m in sample_metrics.items():
        sample_file = os.path.join(out_dir, "%s_bcbio.txt" % sample_name)
        with file_transaction(samples[0], sample_file) as tx_out_file:
            dt = pd.DataFrame(m, index=['1'])
            dt.columns = [k.replace(" ", "_").replace("(", "").replace(")", "") for k in dt.columns]
            dt['sample'] = sample_name
            dt['rRNA_rate'] = m.get('rRNA_rate', "NA")
            dt['RiP_pct'] = "%.3f" % (int(m.get("RiP", 0)) / float(m.get("Total_reads", 1)) * 100)
            dt = _fix_duplicated_rate(dt)
            dt.transpose().to_csv(tx_out_file, sep="\t", header=False)
        out.append(sample_file)
    return out
Example #10
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """
    lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
    logger.info("Demultiplexing %s" % lane_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lane_items[0], fc_name, config=config)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)
    out = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        if item["barcode_id"] in bc_files:
            for fastq1, fastq2, lane_ext in _prep_fastq_files(item, bc_files, dirs, config):
                cur_lane_name = lane_name
                cur_lane_desc = item["description"]
                if item.get("name", "") and config["algorithm"].get("include_short_name", True):
                    cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
                if item["barcode_id"] is not None:
                    cur_lane_name += "_%s" % (item["barcode_id"])
                if lane_ext is not None:
                    cur_lane_name += "_s{0}".format(lane_ext)
                if config["algorithm"].get("trim_reads", False):
                    trim_info = brun_trim_fastq([x for x in [fastq1, fastq2] if x is not None],
                                                dirs, config)
                    fastq1 = trim_info[0]
                    if fastq2 is not None:
                        fastq2 = trim_info[1]
                out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                            dirs, config))
    return out
Example #11
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": out_vcf_file,
                    "bed_file": None}
    return [[batch_id, callinfo]]
Example #12
def run(self, config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner"]), samples, config, dirs, "multicore") as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel(
                "organize_samples", [[dirs, config, run_info_yaml, [x[0]["description"] for x in samples]]]
            )
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
    ## Quality control
    with prun.start(
        _wres(parallel, ["fastqc", "bamtools", "samtools", "qsignature", "kraken"]),
        samples,
        config,
        dirs,
        "multicore2",
    ) as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            for sample in samples:
                run_parallel("upload_samples", [sample])
    logger.info("Timing: finished")
    return samples
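The `profile.report(...)` blocks above wrap each pipeline stage so its wall-clock time is logged. A minimal sketch of such a timing context manager; this is a generic illustration, not bcbio's profile implementation:

import contextlib
import logging
import time

@contextlib.contextmanager
def report(stage):
    """Log the wall-clock time spent inside a named pipeline stage."""
    start = time.time()
    logging.info("Timing: %s starting", stage)
    try:
        yield
    finally:
        logging.info("Timing: %s finished in %.1fs", stage, time.time() - start)

# Usage sketch:
# with report("alignment"):
#     samples = run_parallel("process_alignment", samples)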
Example #13
def _variant_filtration_indel(snp_file, ref_file, vrn_files, config):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.jexl_hard(broad_runner, snp_file, ref_file, filter_type,
                                 ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.get_gatk_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, config)
        return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                    tranches_file, filter_type)
Example #14
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )

    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Example #15
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)

    if dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data):
        file1, file2 = None, None

        if dd.get_disambiguate(data):
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data["config"], is_retry=False, output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
        else:
            file1, file2 = dd.get_input_sequence_files(data)

        ref_file = dd.get_ref_file(data)
        logger.info("Transcriptome alignment was flagged to run, but the "
                    "transcriptome BAM file was not found. Aligning to the "
                    "transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Example #16
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
Example #17
def copy_flowcell(dname, fastq_dir, sample_cfile, config):
    """Copy required files for processing using rsync, potentially to a remote server.
    """
    with utils.chdir(dname):
        reports = reduce(operator.add,
                         [glob.glob("*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xml"),
                          glob.glob("Data/Intensities/BaseCalls/*.xsl"),
                          glob.glob("Data/Intensities/BaseCalls/*.htm"),
                          ["Data/Intensities/BaseCalls/Plots", "Data/reports",
                           "Data/Status.htm", "Data/Status_Files", "InterOp"]])
        run_info = reduce(operator.add,
                          [glob.glob("run_info.yaml"),
                           glob.glob("*.csv")])
        fastq = glob.glob(os.path.join(fastq_dir.replace(dname + "/", "", 1),
                                       "*.gz"))
        configs = [sample_cfile.replace(dname + "/", "", 1)]
    include_file = os.path.join(dname, "transfer_files.txt")
    with open(include_file, "w") as out_handle:
        out_handle.write("+ */\n")
        for fname in configs + fastq + run_info + reports:
            out_handle.write("+ %s\n" % fname)
        out_handle.write("- *\n")
    # remote transfer
    if utils.get_in(config, ("process", "host")):
        dest = "%s@%s:%s" % (utils.get_in(config, ("process", "username")),
                             utils.get_in(config, ("process", "host")),
                             utils.get_in(config, ("process", "dir")))
    # local transfer
    else:
        dest = utils.get_in(config, ("process", "dir"))
    cmd = ["rsync", "-akmrtv", "--include-from=%s" % include_file, dname, dest]
    logger.info("Copying files to analysis machine")
    logger.info(" ".join(cmd))
    subprocess.check_call(cmd)
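copy_flowcell selects files for transfer by writing an rsync filter file: `+ */` keeps the directory structure, explicit `+ path` lines include the wanted files, and a final `- *` excludes everything else. A generic sketch of that include-from pattern; the destination and file list are placeholders:

import os
import subprocess

def rsync_selected(src_dir, rel_paths, dest, include_file="transfer_files.txt"):
    """Transfer only selected files below src_dir using an rsync include-from list."""
    include_path = os.path.join(src_dir, include_file)
    with open(include_path, "w") as out_handle:
        out_handle.write("+ */\n")            # keep the directory structure
        for rel in rel_paths:
            out_handle.write("+ %s\n" % rel)  # include each requested file
        out_handle.write("- *\n")             # exclude everything else
    cmd = ["rsync", "-a", "-v", "--include-from=%s" % include_path, src_dir, dest]
    subprocess.check_call(cmd)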
Example #18
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data)))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file,
                                                     platform, dbsnp_file, intervals, data)
    return [[data]]
Example #19
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra
    Version information from VCF header lines.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # use a distinct name to avoid shadowing the vcfutils module used below
    vcfutils_cl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils_cl} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        logger.info(cmd.format(**locals()))
        do.run(cmd.format(**locals()), "Variant calling with samtools", {})
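_call_variants_samtools picks bcftools options by comparing version strings with LooseVersion. A tiny sketch of that version gate; note that `distutils` (and LooseVersion with it) is deprecated and removed in recent Pythons, where `packaging.version` is the usual replacement:

from distutils.version import LooseVersion  # packaging.version on modern Pythons

def bcftools_call_opts(bcftools_version):
    """Pick the calling subcommand based on the installed bcftools version."""
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        return "call -v -c"   # newer htslib-based interface
    return "view -v -c -g"    # legacy 0.1.x interface

print(bcftools_call_opts("1.9"))     # -> call -v -c
print(bcftools_call_opts("0.1.18"))  # -> view -v -c -g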
Example #20
def gatk_realigner_targets(runner, align_bam, ref_file, dbsnp=None,
                           region=None, out_file=None, deep_coverage=False,
                           variant_regions=None):
    """Generate a list of interval regions for realignment around indels.
    """
    if out_file:
        out_file = "%s.intervals" % os.path.splitext(out_file)[0]
    else:
        out_file = "%s-realign.intervals" % os.path.splitext(align_bam)[0]
    # check only for file existence; interval files can be empty after running
    # on small chromosomes, so don't rerun in those cases
    if not os.path.exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            logger.info("GATK RealignerTargetCreator: %s %s" %
                        (os.path.basename(align_bam), region))
            params = ["-T", "RealignerTargetCreator",
                      "-I", align_bam,
                      "-R", ref_file,
                      "-o", tx_out_file,
                      "-l", "INFO",
                      ]
            region = subset_variant_regions(variant_regions, region, tx_out_file)
            if region:
                params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
            if dbsnp:
                params += ["--known", dbsnp]
            if deep_coverage:
                params += ["--mismatchFraction", "0.30",
                           "--maxIntervalSize", "650"]
            runner.run_gatk(params)
    return out_file
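Many of these examples wrap output creation in `file_transaction`, which hands the code a temporary path and only moves it to the final location when the block finishes without error, so interrupted runs never leave partial files behind. A minimal generic sketch of that idea (not bcbio's actual implementation):

import contextlib
import os
import tempfile

@contextlib.contextmanager
def atomic_output(final_path):
    """Yield a temporary path; move it to final_path only if the block succeeds."""
    out_dir = os.path.dirname(final_path) or "."
    fd, tmp_path = tempfile.mkstemp(dir=out_dir, suffix=".part")
    os.close(fd)
    try:
        yield tmp_path
        os.replace(tmp_path, final_path)  # atomic rename on the same filesystem
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Usage sketch:
# with atomic_output("realign.intervals") as tx_out_file:
#     write_intervals(tx_out_file)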
Example #21
def gff3_to_gtf(gff3_file):

    dialect = {'field separator': '; ',
               'fmt': 'gtf',
               'keyval separator': ' ',
               'leading semicolon': False,
               'multival separator': ',',
               'quoted GFF2 values': True,
               'order': ['gene_id', 'transcript_id'],
               'repeated keys': False,
               'trailing semicolon': True}

    out_file = os.path.splitext(gff3_file)[0] + ".gtf"
    if file_exists(out_file):
        return out_file

    logger.info("Converting %s to %s." % (gff3_file, out_file))

    if _is_from_ncbi(gff3_file):
        logger.info("NCBI format detected by the presence of the %s key."
                    % _is_from_ncbi(gff3_file))
        _output_ncbi_gff3(gff3_file, out_file, dialect)
    else:
        _output_gff3(gff3_file, out_file, dialect)
    return out_file
Example #22
File: bwa.py Project: vals/bcbb
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          rg_name=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not file_exists(sam_file):
        if not file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config["program"]["bwa"], align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            with open(tx_sam_file, "w") as out_handle:
                logger.info(" ".join(sam_cl))
                subprocess.check_call(sam_cl, stdout=out_handle)
    return sam_file
Example #23
def _analysis_block_stats(regions):
    """Provide statistics on sizes and number of analysis blocks.
    """
    prev = None
    between_sizes = []
    region_sizes = []
    for region in regions:
        if prev and prev.chrom == region.chrom:
            between_sizes.append(region.start - prev.end)
        region_sizes.append(region.end - region.start)
        prev = region
    def descriptive_stats(xs):
        if len(xs) < 2:
            return xs
        parts = ["min: %s" % min(xs),
                 "5%%: %s" % numpy.percentile(xs, 5),
                 "25%%: %s" % numpy.percentile(xs, 25),
                 "median: %s" % numpy.percentile(xs, 50),
                 "75%%: %s" % numpy.percentile(xs, 75),
                 "95%%: %s" % numpy.percentile(xs, 95),
                 "99%%: %s" % numpy.percentile(xs, 99),
                 "max: %s" % max(xs)]
        return "\n".join(["  " + x for x in parts])
    logger.info("Identified %s parallel analysis blocks\n" % len(region_sizes) +
                "Block sizes:\n%s\n" % descriptive_stats(region_sizes) +
                "Between block sizes:\n%s\n" % descriptive_stats(between_sizes))
    if len(region_sizes) == 0:
        raise ValueError("No callable analysis regions found in all samples")
Example #24
def illumina_qual_bin(in_file, ref_file, out_dir, config):
    """Uses CRAM to perform Illumina 8-bin approaches to existing BAM files.

    Bins quality scores according to Illumina scheme:

    http://www.illumina.com/Documents/products/whitepapers/whitepaper_datacompression.pdf

    Also fixes output header to remove extra run groups added by CRAM during conversion.
    """
    index_file = ref_file + ".fai"
    assert os.path.exists(index_file), "Could not find FASTA reference index: %s" % index_file
    out_file = os.path.join(out_dir, "%s-qualbin%s" % os.path.splitext(os.path.basename(in_file)))
    cram_jar = config_utils.get_jar("cramtools",
                                    config_utils.get_program("cram", config, "dir"))
    samtools = config_utils.get_program("samtools", config)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            orig_header = "%s-header.sam" % os.path.splitext(out_file)[0]
            header_cmd = "{samtools} view -H -o {orig_header} {in_file}"
            cmd = ("java -jar {cram_jar} cram --input-bam-file {in_file} "
                   " --reference-fasta-file {ref_file} --preserve-read-names "
                   " --capture-all-tags --lossy-quality-score-spec '*8' "
                   "| java -jar {cram_jar} bam --output-bam-format "
                   "  --reference-fasta-file {ref_file} "
                   "| {samtools} reheader {orig_header} - "
                   "> {tx_out_file}")
            logger.info("Quality binning with CRAM")
            subprocess.check_call(header_cmd.format(**locals()), shell=True)
            subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
Example #25
def run_freebayes(align_bam, ref_file, config, dbsnp=None, region=None,
                  out_file=None):
    """Detect small polymorphisms with FreeBayes.
    """
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        logger.info("Genotyping with FreeBayes: {region} {fname}".format(
            region=region, fname=os.path.basename(align_bam)))
        with file_transaction(out_file) as tx_out_file:
            cl = [config["program"].get("freebayes", "freebayes"),
                  "-b", align_bam, "-v", tx_out_file, "-f", ref_file]
            if region:
                cl.extend(["-r", region])
            try:
                subprocess.check_call(cl)
            # XXX Temporary, work around freebayes issue; need to recall these regions
            # later so this is an ugly silent fix. Will need to grep for 'freebayes failed'
            # https://github.com/ekg/freebayes/issues/22
            except subprocess.CalledProcessError:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("##fileformat=VCFv4.1\n"
                                     "## No variants; freebayes failed\n"
                                     "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
    return out_file
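run_freebayes writes a header-only VCF when the caller crashes so that downstream steps still find a file for the region. A generic sketch of that fallback pattern; the command and output path are placeholders:

import logging
import subprocess

EMPTY_VCF = ("##fileformat=VCFv4.1\n"
             "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

def call_or_placeholder(cmd, out_file):
    """Run a variant-calling command; on failure, leave a valid but empty VCF behind."""
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError:
        logging.warning("Command failed, writing placeholder output: %s", cmd)
        with open(out_file, "w") as out_handle:
            out_handle.write(EMPTY_VCF)
    return out_file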
Example #26
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired:
        logger.info("Skipping PureCN; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    purecn_out = _run_purecn(paired, work_dir)
    # XXX Currently finding edge case failures with Dx calling, needs additional testing
    # purecn_out = _run_purecn_dx(purecn_out, paired)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if purecn_out:
        purecn_out["variantcaller"] = "purecn"
        if "loh" in purecn_out:
            from bcbio.structural import titancna
            purecn_out["vrn_file"] = titancna.to_vcf(purecn_out["loh"], "PureCN", _get_header, _loh_to_vcf,
                                                     paired.tumor_data, sep=",")
            purecn_out["lohsummary"] = loh.summary_status(purecn_out, paired.tumor_data)
        if "sv" not in paired.tumor_data:
            paired.tumor_data["sv"] = []
        paired.tumor_data["sv"].append(purecn_out)
    out.append(paired.tumor_data)
    return out
Example #27
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=imodule)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], sysinfo, config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        jobr = ipython.find_job_resources([fn], parallel, items, sysinfo, config)
        items = [ipython.add_cores_to_config(x, jobr.cores_per_job) for x in items]
        if joblib is None:
            raise ImportError("Need joblib for multiprocessing parallelization")
        out = []
        for data in joblib.Parallel(jobr.num_jobs)(joblib.delayed(fn)(x) for x in items):
            if data:
                out.extend(data)
        return out
Example #28
def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    fastq1, fastq2 = data["files"]
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, aligner, data)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"], config)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        _check_prealigned_bam(fastq1, data["sam_ref"], config)
    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    data["work_bam"] = out_bam
    return [[data]]
Example #29
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False):
    """Perform realignment of BAM file in specified regions
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" %
                            (os.path.basename(align_bam), region))
                params = ["-T", "IndelRealigner",
                          "-I", align_bam,
                          "-R", ref_file,
                          "-targetIntervals", intervals,
                          "-o", tx_out_file,
                          "-l", "INFO",
                          ]
                if region:
                    params += ["-L", region]
                if deep_coverage:
                    params += ["--maxReadsInMemory", "300000",
                               "--maxReadsForRealignment", str(int(5e5)),
                               "--maxReadsForConsensuses", "500",
                               "--maxConsensuses", "100"]
                try:
                    runner.run_gatk(params, tmp_dir)
                except Exception:
                    logger.exception("Running GATK IndelRealigner failed: {} {}".format(
                        os.path.basename(align_bam), region))
                    raise
    return out_file
Example #30
    def run(self, config, config_file, parallel, dirs, samples):
        with prun.start(_wres(parallel, ["picard", "AlienTrimmer"]),
                        samples, config, dirs, "trimming") as run_parallel:
            with profile.report("adapter trimming", dirs):
                samples = run_parallel("prepare_sample", samples)
                samples = run_parallel("trim_sample", samples)
        with prun.start(_wres(parallel, ["aligner", "picard"],
                              ensure_mem={"tophat": 8, "tophat2": 8, "star": 40}),
                        samples, config, dirs, "multicore",
                        multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
            with profile.report("alignment", dirs):
                samples = disambiguate.split(samples)
                samples = run_parallel("process_alignment", samples)
        with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                        samples, config, dirs, "rnaseqcount") as run_parallel:
            with profile.report("disambiguation", dirs):
                samples = disambiguate.resolve(samples, run_parallel)
            with profile.report("transcript assembly", dirs):
                samples = rnaseq.assemble_transcripts(run_parallel, samples)
            with profile.report("estimate expression", dirs):
                samples = rnaseq.estimate_expression(samples, run_parallel)

        with prun.start(_wres(parallel, ["picard", "fastqc", "rnaseqc","kraken"]),
                        samples, config, dirs, "persample") as run_parallel:
            with profile.report("quality control", dirs):
                samples = qcsummary.generate_parallel(samples, run_parallel)
        
        logger.info("Timing: finished")
        return samples
Example #31
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2 = postalign.umi_consensus(data)
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"],
                                           dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = link_bam_file(
                fastq1,
                os.path.join(data["dirs"]["work"], "prealign",
                             data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct, or is the file empty?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Example #32
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(
            _wres(
                parallel, ["aligner", "samtools", "sambamba"],
                (["reference", "fasta"], ["reference", "aligner"], ["files"])),
            samples,
            config,
            dirs,
            "multicore",
            multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[
                dirs, config, run_info_yaml,
                [x[0]["description"] for x in samples]
            ]])
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = run_parallel("calculate_sv_bins", [samples])
            samples = run_parallel("calculate_sv_coverage", samples)
            samples = region.clean_sample_data(samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples,
                    config,
                    dirs,
                    "full",
                    multiplier=region.get_max_counts(samples),
                    max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(
                samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, [
            "gatk", "gatk-vqsr", "snpeff", "bcbio_variation", "gemini",
            "samtools", "fastqc", "sambamba", "bcbio-variation-recall",
            "qsignature", "svcaller", "kraken", "preseq"
    ]),
                    samples,
                    config,
                    dirs,
                    "multicore2",
                    multiplier=structural.parallel_multiplier(
                        samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation precall", dirs):
            samples = structural.run(samples, run_parallel, "precall")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "initial")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
Example #33
def _report_summary(samples, out_dir):
    """
    Run coverage report with the bcbreport package
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature",
                                     "qsignature.ma")
        if qsignature_fn:  # this needs to be inside the summary/qc dict
            if utils.file_exists(
                    qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

        logger.info("summarize target information")
        if samples[0].get("analysis", "").lower() in ["variant", "variant2"]:
            samples = _merge_target_information(samples)

        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(
                        fn, os.path.join(out_dir, os.path.basename(fn)))

        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(
                        fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" %
                                 rmd_file)
            # cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError as msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples
Example #34
def consensus(peakfiles, consensusfile, data, pad=250):
    """call consensus peaks from a set of narrow/broad peakfiles
    we use this method:
    https://bedops.readthedocs.io/en/latest/content/usage-examples/master-list.html
    """
    if utils.file_exists(consensusfile):
        return consensusfile

    try:
        bedops = config_utils.get_program("bedops", data)
    except config_utils.CmdNotFound:
        logger.info("bedops not found, skipping consensus peak calling. do a "
                    "--tools update to install bedops.")
        return None
    try:
        sortbed = config_utils.get_program("sort-bed", data)
    except config_utils.CmdNotFound:
        logger.info("sort-bed not found, skipping consensus peak calling. do "
                    "--tools update to install sort-bed.")
        return None
    try:
        bedmap = config_utils.get_program("bedmap", data)
    except config_utils.CmdNotFound:
        logger.info("bedmap not found, skipping consensus peak calling. do a "
                    "--tools update to install bedmap.")
        return None

    logger.info(f"Calling consensus peaks on {','.join(peakfiles)}")
    logger.info(f"Removing low quality peaks from {','.join(peakfiles)}")
    filteredsummits = []
    for fn in peakfiles:
        filteredpeak = NamedTemporaryFile(suffix=".bed", delete=False).name
        df = remove_low_quality_peaks(fn, qval=0.05)
        df.to_csv(filteredpeak, index=False, header=False, sep="\t")
        filteredsummit = peakfile_to_summitfile(filteredpeak)
        filteredsummits.append(filteredsummit)
    peakfiles = filteredsummits

    with file_transaction(consensusfile) as tx_consensus_file:
        message = (f"Combining summits of {' '.join(peakfiles)} and "
                   f"expanding {pad} bases.")
        with utils.tmpfile(suffix=".bed") as tmpbed:
            slopcommand = f"{bedops} --range {pad} -u {' '.join(peakfiles)} > {tmpbed}"
            do.run(slopcommand, message)
            iteration = 0
            solutions = []
            while os.path.getsize(tmpbed):
                iteration = iteration + 1
                iterationbed = NamedTemporaryFile(suffix=".bed",
                                                  delete=False).name
                with utils.tmpfile(suffix="bed") as mergedbed, \
                     utils.tmpfile(suffix="bed") as intermediatebed, \
                     utils.tmpfile(suffix="bed") as leftoverbed:
                    mergecmd = (f"{bedops} -m --range 0:-1 {tmpbed} | "
                                f"{bedops} -u --range 0:1 - > "
                                f"{mergedbed}")
                    message = f"Merging non-overlapping peaks, iteration {iteration}."
                    do.run(mergecmd, message)
                    nitems = len(open(mergedbed).readlines())
                    message = f"Considering {nitems} peaks, choosing the highest score for overlapping peaks."
                    highscorecmd = (
                        f"{bedmap} --max-element {mergedbed} {tmpbed} |"
                        f"{sortbed} - > "
                        f"{iterationbed}")
                    do.run(highscorecmd, message)
                    message = f"Checking if there are peaks left to merge."
                    anyleftcmd = (
                        f"{bedops} -n 1 {tmpbed} {iterationbed} > {intermediatebed}"
                    )
                    do.run(anyleftcmd, message)
                    shutil.move(intermediatebed, tmpbed)
                    solutions.append(iterationbed)
        message = f"Creating final consensus peak file: {consensusfile}."
        consensuscmd = (
            f"{bedops} -u {' '.join(solutions)} > {tx_consensus_file}")
        do.run(consensuscmd, message)
    return consensusfile
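The pipeline above leans on bedops/bedmap for the heavy lifting. As a rough pure-Python illustration only (not bcbio code; the helper name master_list and the (start, end, score) tuple representation are made up for this sketch), the same iterative master-list idea looks like this for intervals on a single chromosome:

def master_list(intervals):
    """Greedy consensus of (start, end, score) intervals on one chromosome.

    Mirrors the bedops/bedmap pipeline above: in each round, cluster
    transitively overlapping intervals, keep the highest-scoring interval
    per cluster, drop every interval those picks overlap, and repeat on
    whatever is left.
    """
    def overlaps(a, b):
        return a[0] < b[1] and b[0] < a[1]

    remaining = sorted(intervals)
    solutions = []
    while remaining:
        # build clusters of transitively overlapping intervals
        clusters, current = [], [remaining[0]]
        for iv in remaining[1:]:
            if iv[0] < max(end for _, end, _ in current):
                current.append(iv)
            else:
                clusters.append(current)
                current = [iv]
        clusters.append(current)
        # keep the highest scoring interval in each cluster
        picks = [max(c, key=lambda iv: iv[2]) for c in clusters]
        solutions.extend(picks)
        # drop everything the picks overlap and recurse on the leftovers
        remaining = [iv for iv in remaining
                     if not any(overlaps(iv, p) for p in picks)]
    return sorted(solutions)

# e.g. master_list([(0, 100, 5), (50, 150, 9), (140, 200, 2)]) keeps
# (50, 150, 9); the other two intervals overlap it and are dropped,
# so the consensus is [(50, 150, 9)].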
Example #35
0
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
Example #36
0
def htseq_count(data):
    """ adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
    """

    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info(
        "Counting reads mapping to exons in %s using %s as the "
        "annotation and strandedness as %s." %
        (os.path.basename(sam_filename), os.path.basename(gff_filename),
         _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    try:
        if pe_mode:
            read_seq_pe_file = align_reader
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict"
                      or overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0
                                    or overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        if not pe_mode:
            sys.stderr.write("Error occurred in %s.\n" %
                             read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occurred in %s.\n" %
                             read_seq_pe_file.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
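The overlap-mode handling buried in the loop above is the part that is easiest to misread. A standalone sketch of just that logic (plain Python sets instead of HTSeq step objects; the helper name resolve_features is made up) shows how "union" and the intersection modes resolve the feature sets collected along a read:

def resolve_features(interval_feature_sets, overlap_mode="union"):
    """Resolve the sets of feature ids seen along a read's aligned intervals.

    interval_feature_sets: one set of feature ids per aligned interval
    (an empty set means nothing overlaps that interval). Returns the single
    assigned feature id, or "no_feature" / "ambiguous", mirroring how the
    counters above are incremented.
    """
    if overlap_mode == "union":
        fs = set()
        for fs2 in interval_feature_sets:
            fs = fs.union(fs2)
    elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
        fs = None
        for fs2 in interval_feature_sets:
            if fs2 or overlap_mode == "intersection-strict":
                fs = fs2.copy() if fs is None else fs.intersection(fs2)
    else:
        raise ValueError("Illegal overlap mode.")
    if not fs:
        return "no_feature"
    if len(fs) > 1:
        return "ambiguous"
    return next(iter(fs))

# resolve_features([{"geneA"}, {"geneA", "geneB"}]) -> "ambiguous"
# resolve_features([{"geneA"}, {"geneA", "geneB"}], "intersection-strict") -> "geneA"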
Example #37
0
def _run_purecn_normaldb(paired, out):
    """Run PureCN with normaldb and native segmentation
       paired is one t/n pair or only """
    sample = utils.to_single_data(paired.tumor_data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd()
    purecn_r = utils.R_package_script("PureCN", "extdata/PureCN.R", env="base")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # germline and somatic - just annotated and filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case there is no germline file - use the vrn_file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in([
        "config", "algorithm", "background", "cnv_reference", "purecn_normaldb"
    ], sample)
    mappingbiasfile = tz.get_in([
        "config", "algorithm", "background", "cnv_reference",
        "purecn_mapping_bias"
    ], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [
        rscript, purecn_r, "--out", work_dir, "--tumor", sample_coverage,
        "--sampleid", sample_name, "--vcf", variants_vcf, "--normaldb",
        normaldb, "--mapping-bias-file", mappingbiasfile, "--intervals",
        intervals, "--snp-blacklist", simple_repeat_bed, "--genome", genome,
        "--force", "--post-optimize", "--seed", "123", "--bootstrapn", "500",
        "--cores",
        dd.get_num_cores(sample)
    ]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use a matched normal sample in PureCN analysis,
    # because it then skips the PON coverage normalization and denoising steps!
    # but still, if one is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"],
                                        normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(
                env="base"), utils.get_R_exports(env="base"), " ".join(
                    [str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed")
    out_base, out, all_files = _get_purecn_files(paired,
                                                 work_dir,
                                                 require_exist=True)
    return out
Example #38
0
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.
    """
    segfns = {
        "cnvkit": _segment_normalized_cnvkit,
        "gatk-cnv": _segment_normalized_gatk
    }
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(
            out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](
            cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(
            paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file,
                                                      paired,
                                                      out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in [
                "GRCh37", "hg19"
            ] else dd.get_genome_build(paired.tumor_data))
            rscript = utils.Rscript_cmd()
            purecn_r = utils.R_package_script("PureCN",
                                              "extdata/PureCN.R",
                                              env="base")
            cmd = [
                rscript, purecn_r, "--seed", "42", "--out", tx_out_base,
                "--rds",
                "%s.rds" % tx_out_base, "--sampleid",
                dd.get_sample_name(paired.tumor_data), "--genome", genome,
                "--vcf", vcf_file, "--tumor", cnr_file, "--segfile", seg_file,
                "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"
            ]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(
                    env="base"), utils.get_R_exports(env="base"), " ".join(
                        [str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info(
                        "PureCN failed to find solution for %s: skipping" %
                        dd.get_sample_name(paired.tumor_data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.exception()
                    raise
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base),
                                               f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
Example #39
0
File: trim.py Project: roryk/bipy
    def _cut_file(self, in_file):
        """
        run cutadapt on a single file

        """
        adapters = self._get_adapters(self.chemistry)
        out_file = self.in2trimmed(in_file)
        if file_exists(out_file):
            return out_file
        cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))

        quality_format = self.quality_format
        if not quality_format:
            quality_format = self._detect_fastq_format(in_file)
        if quality_format == "sanger":
            logger.info("Quality format detected as sanger.")
            quality_base = 33
        elif quality_format == "illumina":
            logger.info("Quality format set to illumina 1.5/1.3")
            quality_base = 64
        else:
            logger.error("Quality format could not be detected. Quality "
                         "Detected or set as %s. It should be illumina "
                         "or sanger.")
            exit(1)

        # if we want to trim the polya tails we have to first remove
        # the adapters and then trim the tail
        if self.stage_config.get("trim_polya", True):
            temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq",
                                                   dir=self.out_dir)
            # trim off adapters
            cmd = str(
                cutadapt.bake(in_file,
                              self.options,
                              adapters,
                              quality_base=quality_base,
                              out=temp_cut.name))
            do.run(cmd, "Cutadapt trim of adapters of %s." % (in_file), None)
            with file_transaction(out_file) as temp_out:
                polya = ADAPTERS.get("polya")
                # trim off polya
                cmd = str(
                    cutadapt.bake(temp_cut.name,
                                  self.options,
                                  "-a",
                                  polya,
                                  "-a",
                                  self._rc_adapters(polya),
                                  quality_base=quality_base,
                                  out=temp_out))
                do.run(cmd,
                       "Cutadapt trim of polyA tail of %s." % (temp_cut.name),
                       None)
            return out_file
        else:
            with file_transaction(out_file) as temp_out:
                cmd = str(
                    cutadapt.bake(in_file,
                                  self.options,
                                  adapters,
                                  out=temp_out))
                do.run(cmd, "Cutadapt trim of %s." % (in_file))
            return out_file
Example #40
0
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(
                vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(
                    d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info(
            "peddy is not installed, the sample is not human, or sample VCFs "
            "don't match; skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirect stderr because cyvcf2 is incredibly noisy with "no intervals found" messages
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        sites_str = "--sites hg38" if dd.get_genome_build(
            data) == "hg38" else ""
        cmd = (
            "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
            "{vcf_file} {ped_file} 2> {stderr_log}")
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)

            def allowed_errors(l):
                return ((l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0))

            if any([allowed_errors(l) for l in to_show]):
                logger.info(
                    "Skipping peddy because no variants overlap with checks: %s"
                    % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write(
                        "peddy did not find overlaps with 1kg sites in VCF, skipping"
                    )
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Example #41
0
 def check():
     ok = os.path.exists(target_file)
     if not ok:
         logger.info("Did not find output file {0}".format(target_file))
     return ok
Example #42
0
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s%s-1.fq.gz" %
        (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join(
                [str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [
            x for x in [out_file_1, out_file_2]
            if x is not None and utils.file_exists(x)
        ]
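One detail in _bgzip_from_bam worth calling out: the F=>(...) and S=>(...) pieces of out_str are bash process substitutions, so the assembled command must be executed by bash rather than a plain POSIX sh. A minimal standalone sketch (not bcbio code; assumes bash is available at /bin/bash) of running such a command from Python:

import subprocess

# tee writes one copy of its stdin into the process substitution, where gzip
# compresses it to a file, while the copy written to stdout is discarded --
# the same shape as bamtofastq writing each read stream into a bgzip pipeline.
cmd = "echo read-data | tee >(gzip -c > /tmp/example-reads.gz) > /dev/null"
subprocess.check_call(cmd, shell=True, executable="/bin/bash")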
Example #43
0
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(
                        vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(
                            d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[
                                key] and vcfutils.vcf_has_nonfiltered_variants(
                                    vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not vcfanno.is_human(data):
        if not peddy:
            reason = "peddy executable not found"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirect stderr because cyvcf2 is incredibly noisy with "no intervals found" messages
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            cmd = (
                "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((
                        l.find("IndexError") >= 0
                        and l.find("is out of bounds for axis") >= 0
                    ) or (
                        l.find("n_components=") >= 0
                        and l.find("must be between 1 and n_features=") >= 0
                    ) or (l.find(
                        "Input contains NaN, infinity or a value too large for dtype"
                    ) >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Example #44
0
 def check():
     ok = utils.file_exists(target_file)
     if not ok:
         logger.info(
             "Did not find non-empty output file {0}".format(target_file))
     return ok
Example #45
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged." %
                fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error(
                "No UMI transform was specified, but %s does not look "
                "pre-transformed." % fq1)
            sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    if dd.get_demultiplexed(data):
        demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data)
        split_option = ""
    else:
        demuxed_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]
    locale_export = utils.locale_export()
    cmd = (
        "{locale_export}{umis} fastqtransform {split_option} {transform_file} "
        "--cores {cores} {demuxed_option} "
        "{fq1} {fq2} {fq3} {fq4}"
        "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
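The pre-transform check above only peeks at the first read name. A self-contained sketch of that heuristic (hypothetical helper name looks_transformed; assumes plain or gzip-compressed FASTQ and that the transform leaves a UMI_ tag in the read name, as the check above expects):

import gzip

def looks_transformed(fastq_path):
    """Return True if the first read header already carries a UMI_ tag."""
    opener = gzip.open if fastq_path.endswith(".gz") else open
    with opener(fastq_path, "rt") as in_handle:
        first_header = in_handle.readline()
    return "UMI_" in first_header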
Example #46
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    adapter = dd.get_adapters(data)
    if adapter and not trim_reads:
        trim_reads = True
        logger.info(
            "An adapter is set in the config file, but trim_reads is not true. "
            "If you want to skip trimming, remove the adapter option from the config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(
        adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        atropos = _get_atropos()
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" %
                 dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt',
                                              {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options were found in the YAML "
                "file. See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(
                        out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #47
0
def _get_samples_to_process(fn, out_dir, config, force_single, separators):
    """Parse a CSV file with one line per input file, merging all files
    that share the same description name."""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            if l.find("description") > 0:
                logger.info("Skipping header.")
                continue
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]) or is_srr(
                        cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Space finds in %s. Linked to %s." %
                                       (cols[0], new_name))
                        logger.warning(
                            "Please, avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        elif is_srr(items[0][0]):
            fn = "query_srr"
            ext = ".fastq.gz"
        files = [
            os.path.abspath(fn_file[0])
            if utils.file_exists(fn_file[0]) else fn_file[0]
            for fn_file in items
        ]
        samples[sample] = [{
            'files': _check_paired(files, force_single, separators),
            'out_file': os.path.join(out_dir, sample + ext),
            'fn': fn,
            'anno': items[0][2:],
            'config': config,
            'name': sample,
            'out_dir': out_dir
        }]
    return [samples[sample] for sample in samples]
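For reference, a hypothetical CSV accepted by the parser above: the first column is a file path (or a GSM/SRR accession), the second is the description name used for merging, and any remaining columns are carried along as annotations. The paths and sample names here are made up.

# Hypothetical input for the parser above; paths and sample names are invented.
example_csv = """\
samplename,description,condition
/data/run1/lane1_S1_R1.fastq.gz,sampleA,control
/data/run2/lane2_S1_R1.fastq.gz,sampleA,control
SRR1234567,sampleB,treated
"""
# Both fastq files share the description "sampleA", so they would be merged
# into a single sampleA output; SRR1234567 would be fetched and named sampleB.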
Example #48
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged." %
                fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error(
                "No UMI transform was specified, but %s does not look "
                "pre-transformed. Assuming non-umi data." % fq1)
            return data

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return data

    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
Example #49
0
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs,
                                  samples)
    with prun.start(
            _wres(parallel, ["aligner", "picard", "samtools"],
                  ensure_mem={
                      "tophat": 10,
                      "tophat2": 10,
                      "star": 2,
                      "hisat2": 8
                  }),
            samples,
            config,
            dirs,
            "alignment",
            multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]), samples,
                    config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(
                samples, run_parallel)

    with prun.start(_wres(parallel, ["dexseq", "express"]),
                    samples,
                    config,
                    dirs,
                    "rnaseqcount-singlethread",
                    max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(
                samples, run_parallel)

    samples = rnaseq.combine_files(samples)
    with prun.start(_wres(parallel, ["gatk", "vardict"]), samples, config,
                    dirs, "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)

    with prun.start(
            _wres(
                parallel,
                ["samtools", "fastqc", "qualimap", "kraken", "gatk", "preseq"],
                ensure_mem={"qualimap": 4}), samples, config, dirs,
            "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
        with profile.report("bcbioRNAseq loading", dirs):
            tools_on = dd.get_in_samples(samples, dd.get_tools_on)
            bcbiornaseq_on = tools_on and "bcbiornaseq" in tools_on
            if bcbiornaseq_on and len(samples) == 1:
                logger.warn(
                    "bcbioRNASeq does not work with just one sample, skipping."
                )
            else:
                run_parallel("run_bcbiornaseqload", [sample])
    logger.info("Timing: finished")
    return samples
 parser.add_argument("-q", "--queue", help="Queue to submit jobs to.")
 parser.add_argument("-t",
                     "--paralleltype",
                     choices=["local", "ipython"],
                     default="local",
                     help="Run with iptyhon")
 args = parser.parse_args()
 system_config = os.path.join(_get_data_dir(), "galaxy",
                              "bcbio_system.yaml")
 with open(system_config) as in_handle:
     config = yaml.safe_load(in_handle)
     config["algorithm"] = {"num_cores": 1}
 samples = _get_samples_to_process(args.csv)
 prepped = []
 if args.paralleltype == "ipython":
     logger.info("Starting IPython cluster. This may take a while.")
     with get_cluster_view(args) as view:
         logger.info("IPython cluster is up.")
         for sample, info in samples.items():
             prepped.append(
                 view.apply_async(info['fn'], info["files"],
                                  os.path.join(args.out, info["out_file"]),
                                  config))
         prepped = wait_until_complete(prepped)
 else:
     for sample, info in samples.items():
         logger.info("Merging sample: %s" % sample)
         prepped.append(info['fn'](info["files"],
                                   os.path.join(args.out, info["out_file"]),
                                   config))
 create_new_csv(prepped, samples, args)
Example #51
0
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        if dd.get_umi_type(data) == "dragen":
            assert bam.is_bam(
                fastq1), "umi_type: dragen needs a BAM file as input."
            data = dragen.fix_umi_dragen_bam(data, bam=fastq1)
            # fastq1 = bam.sort(fastq1, dd.get_config(data))
            # bam.index(fastq1, dd.get_config(data))
            # data["work_bam"] = fastq1
        else:
            logger.info("Aligning lane %s with %s aligner" %
                        (data["rgnames"]["lane"], aligner))
            data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2 or dd.get_umi_type(data) == "dragen":
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError(
                    "Single fastq input for UMI processing; fgbio needs paired reads: %s"
                    % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "bamclean",
                                 dd.get_sample_name(data)))
                out_file = os.path.join(
                    work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = _link_bam_file(
                fastq1,
                os.path.join(dd.get_work_dir(data), "prealign",
                             dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct, or is the file empty?\n" +
            "If it is a fastq file (not a pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Example #52
0
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve disambiguation between two organisms"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items

    # Construct name of sorted input files
    work_bam_a_nsorted = os.path.splitext(
        data_a["work_bam"])[0] + '.nsorted.bam'
    work_bam_b_nsorted = os.path.splitext(
        data_b["work_bam"])[0] + '.nsorted.bam'

    # logger.info('Disambiguate prep of input BAM {} and {}'.format(work_bam_a_nsorted, work_bam_b_nsorted))
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(
            os.path.normpath(
                os.path.join(os.path.dirname(work_bam_a_nsorted), os.pardir,
                             os.pardir, "disambiguate_%s" % aligner)))
        logger.info(
            'Disambiguate prep of prepped work BAM {} with base dir {}'.format(
                work_bam_a_nsorted, base_dir))
        split_name = "_".join(
            [str(x) for x in data_a["align_split"].split("-")])
        out_dir = os.path.join(base_dir, split_name)
        logger.info(
            'Disambiguate prep of prepped work BAM {} with out dir {}'.format(
                work_bam_a_nsorted, out_dir))
    else:
        out_dir = os.path.normpath(
            os.path.join(os.path.dirname(work_bam_a_nsorted), os.pardir,
                         "disambiguate_%s" % aligner))

    base_name = os.path.join(
        out_dir,
        os.path.splitext(os.path.basename(work_bam_a_nsorted))[0])
    logger.info(
        'Disambiguate prep of prepped work BAM {} with base name {}'.format(
            work_bam_a_nsorted, base_name))

    summary_file = "%s_summary.txt" % base_name
    explant_bam = "%s.explant.sorted.bam" % base_name
    ambiguous_bam = "%s.ambiguous.sorted.bam" % base_name
    work_bam = "%s.human.sorted.bam" % base_name

    logger.info('Disambiguate prep with work bam {}'.format(work_bam))

    logger.info(
        'Deciding if disambiguation is required. Checking for existence of {}, {}, {} and {}'
        .format(summary_file, explant_bam, ambiguous_bam, work_bam))

    if not utils.file_exists(summary_file) or not utils.file_exists(
            explant_bam) or not utils.file_exists(
                ambiguous_bam) or not utils.file_exists(work_bam):
        logger.info(
            'Disambiguating work bam a {} since outputs do not already exist'
            .format(work_bam_a_nsorted))
        work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
        work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
        logger.info('Disambiguate run with work bam a {}'.format(work_bam_a))
        logger.info('Disambiguate run with work bam b {}'.format(work_bam_b))
        with file_transaction(items[0], out_dir) as tx_out_dir:
            logger.info(
                'Disambiguate run with sorted prep work bam a {} and tx out dir {}'
                .format(work_bam_a_nsorted, tx_out_dir))
            tmp_base_name = os.path.join(tx_out_dir,
                                         os.path.basename(base_name))
            logger.info(
                'Disambiguate run with sorted prep work bam a {} and tmp_base_name {}'
                .format(work_bam_a_nsorted, tmp_base_name))
            # Output files must be BAM, else they will not be merged
            pdx_filter = PDXFilter(
                work_bam_a,
                work_bam_b,
                "%s.human.bam" % tmp_base_name,
                "%s.explant.bam" % tmp_base_name,
                "%s.ambiguous.bam" % tmp_base_name,
                "%s_summary.txt" % tmp_base_name,
                hard_filter=True,
                debug=True)
            pdx_filter.run()

        # Perhaps this can be removed since it has been fixed in bcbio
        if data_a.get("align_split"):
            split_dir = os.path.join(out_dir, split_name)
            logger.info(
                'Disambiguate post-run with sorted prep work bam a {} and split dir {}'
                .format(work_bam_a_nsorted, split_dir))
            if os.path.isdir(split_dir):
                for tmp_file in os.listdir(split_dir):
                    logger.info(
                        'Disambiguate post-run with sorted prep work bam a {} aiming to move file {}'
                        .format(work_bam_a_nsorted, tmp_file))
                    src = os.path.join(split_dir, tmp_file)
                    if os.path.isfile(src):
                        dest = os.path.join(out_dir, tmp_file)
                        logger.info(
                            'Disambiguate post-run with sorted prep work bam a {} moving file {} from {} to {}'
                            .format(work_bam_a_nsorted, tmp_file, src, dest))
                        shutil.move(src, dest)
                shutil.rmtree(split_dir)

        # Clean up temporary name-sorted BAMs, ignoring files that are already gone
        try:
            if work_bam_a != data_a["work_bam"]:
                os.remove(work_bam_a)
        except OSError:
            pass
        try:
            if work_bam_b != data_b["work_bam"]:
                os.remove(work_bam_b)
        except OSError:
            pass

    else:
        logger.info(
            'Skipping disambiguation for work bam a {} since outputs already exist'
            .format(work_bam_a_nsorted))

    explant_bam = explant_bam if os.path.isfile(explant_bam) else bam.sort(
        "%s.explant.bam" % base_name, config)
    ambiguous_bam = ambiguous_bam if os.path.isfile(ambiguous_bam) else bam.sort(
        "%s.ambiguous.bam" % base_name, config)
    work_bam = work_bam if os.path.isfile(work_bam) else bam.sort(
        "%s.human.bam" % base_name, config)
    # logger.info('Disambiguate run with post work_bam {}'.format(work_bam))

    data_a["disambiguate"] = {
        data_b["genome_build"]: explant_bam,
        "%s-ambiguous" % data_a["genome_build"]: ambiguous_bam,
        "summary": summary_file
    }
    data_a["work_bam"] = work_bam
    # Remove unsorted intermediates now that sorted versions exist
    for tmp_bam in ("%s.explant.bam" % base_name, "%s.human.bam" % base_name,
                    "%s.ambiguous.bam" % base_name):
        try:
            os.remove(tmp_bam)
        except OSError:
            pass

    return [[data_a]]
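For orientation, a sketch of what the returned data_a carries after disambiguation, following the assignments at the end of run(); the genome builds and paths below are made up:

# Illustrative only: builds and paths are placeholders.
data_a_after = {
    "work_bam": "/work/disambiguate_bwa/sample1.nsorted.human.sorted.bam",
    "disambiguate": {
        # keyed by data_b's genome_build (the explant/host genome)
        "mm10": "/work/disambiguate_bwa/sample1.nsorted.explant.sorted.bam",
        # keyed by "<data_a genome_build>-ambiguous"
        "hg38-ambiguous": "/work/disambiguate_bwa/sample1.nsorted.ambiguous.sorted.bam",
        "summary": "/work/disambiguate_bwa/sample1.nsorted_summary.txt",
    },
}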
Example #53
0
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(
        os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(
            ["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [
            normalize.normalize(f,
                                data,
                                passonly=passonly,
                                rerun_effects=False,
                                remove_oldeffects=True,
                                nonrefonly=True,
                                work_dir=utils.safe_makedir(
                                    os.path.join(base_dir, c)))
            for c, f in zip(caller_names, vrn_files)
        ]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files,
                                                  caller_names, base_dir,
                                                  edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir,
                                             edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file,
                                     base_dir, dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(
                callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get(
            "validate")
    else:
        out_vcf_file = os.path.join(base_dir,
                                    "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(
            out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {
            "variantcaller": "ensemble",
            "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
            "bed_file": None
        }
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
Example #54
0
def _get_preseq_params(data, preseq_cmd, read_count):
    """ Get parameters through resources.
        If "step" or "extrap" limit are not provided, then calculate optimal values based on read count.
    """
    defaults = {
        'seg_len': 100000,      # maximum segment length when merging paired end bam reads
        'steps': 300,           # number of points on the plot
        'extrap_fraction': 3,   # extrapolate up to X times read_count
        'extrap': None,         # extrapolate up to X reads
        'step': None,           # step size (number of reads between points on the plot)
        'options': '',
    }
    params = {}

    main_opts = [("-e", "-extrap"), ("-l", "-seg_len"), ("-s", "-step")]
    other_opts = config_utils.get_resources("preseq",
                                            data["config"]).get("options", [])
    if isinstance(other_opts, str):
        other_opts = [other_opts]
    for sht, lng in main_opts:
        if sht in other_opts:
            i = other_opts.index(sht)
        elif lng in other_opts:
            i = other_opts.index(lng)
        else:
            i = None
        if i is not None:
            params[lng[1:]] = other_opts[i + 1]
            other_opts = other_opts[:i] + other_opts[i + 2:]
    params['options'] = ' '.join(other_opts)
    for k, v in config_utils.get_resources("preseq", data["config"]).items():
        if k != 'options':
            params[k] = v

    params['steps'] = params.get('steps', defaults['steps'])

    if preseq_cmd == 'c_curve':
        params['extrap_fraction'] = 1
    else:
        if params.get('step') is None:
            if params.get('extrap') is None:
                unrounded_extrap = read_count * params.get(
                    'extrap_fraction', defaults['extrap_fraction'])
                unrounded_step = unrounded_extrap // params['steps']
                if params.get('extrap_fraction') is not None:
                    # extrap_fraction explicitly provided
                    params['extrap'] = unrounded_extrap
                    params['step'] = unrounded_step
                else:
                    power_of_10 = 10 ** math.floor(math.log(unrounded_step, 10))
                    rounded_step = int(unrounded_step // power_of_10 * power_of_10)
                    rounded_extrap = rounded_step * params['steps']
                    params['step'] = rounded_step
                    params['extrap'] = rounded_extrap
            else:
                params['step'] = params['extrap'] // params['steps']
        elif params.get('extrap') is None:
            params['extrap'] = params['step'] * params['steps']

    params['step'] = params.get('step', defaults['step'])
    params['extrap'] = params.get('extrap', defaults['extrap'])
    params['seg_len'] = params.get('seg_len', defaults['seg_len'])

    logger.info(
        "Preseq: running {steps} steps of size {step}, extrap limit {extrap}".
        format(**params))
    return params
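As a worked sketch of the step/extrap rounding above, with illustrative numbers rather than values from a real run:

import math

# 25M reads with the default extrap_fraction=3 and steps=300 (illustrative only).
read_count = 25_000_000
steps = 300
unrounded_extrap = read_count * 3                               # 75,000,000
unrounded_step = unrounded_extrap // steps                      # 250,000
power_of_10 = 10 ** math.floor(math.log(unrounded_step, 10))    # 100,000
step = int(unrounded_step // power_of_10 * power_of_10)         # 200,000
extrap = step * steps                                           # 60,000,000
print(step, extrap)  # preseq would then run with a 200,000-read step up to 60M reads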
Example #55
0
    def run(self, config, config_file, run_parallel, parallel, dirs,
            lane_items):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with global_parallel(parallel, "multicore", ["align_prep_full"],
                             lane_items, dirs["work"], config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment")
            samples = run_parallel(
                "align_prep_full",
                [list(x) + [config_file] for x in lane_items])
            regions = callable.combine_sample_regions(samples)
            samples = region.add_region_info(samples, regions)
            samples = region.clean_sample_data(samples)
            logger.info("Timing: coverage")
            samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with global_parallel(parallel,
                             "full", ["piped_bamprep", "variantcall_sample"],
                             samples,
                             dirs["work"],
                             config,
                             multiplier=len(regions["analysis"])) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment post-processing")
            samples = region.parallel_prep_region(samples, regions,
                                                  run_parallel)
            logger.info("Timing: variant calling")
            samples = region.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with global_parallel(parallel, "persample", ["postprocess_variants"],
                             samples, dirs["work"], config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: variant post-processing")
            samples = run_parallel("postprocess_variants", samples)
            samples = combine_multiple_callers(samples)
            logger.info("Timing: ensemble calling")
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
            logger.info("Timing: prepped BAM merging")
            samples = region.delayed_bamprep_merge(samples, run_parallel)
            logger.info("Timing: validation")
            samples = run_parallel("compare_to_rm", samples)
            samples = validate.summarize_grading(samples)
            logger.info("Timing: population database")
            samples = population.prep_db_parallel(samples, run_parallel)
            logger.info("Timing: quality control")
            samples = qcsummary.generate_parallel(samples, run_parallel)
            logger.info("Timing: finished")
        return samples
Example #56
0
    def run(self, config, config_file, run_parallel, parallel, dirs, samples):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with global_parallel(parallel, "multicore", ["process_alignment", "postprocess_alignment"],
                             samples, dirs, config,
                             multiplier=alignprep.parallel_multiplier(samples)) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment")
            samples = run_parallel("prep_align_inputs", samples)
            samples = run_parallel("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
            samples = run_parallel("postprocess_alignment", samples)
            regions = callable.combine_sample_regions(samples)
            samples = region.add_region_info(samples, regions)
            samples = region.clean_sample_data(samples)
            logger.info("Timing: coverage")
            samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with global_parallel(parallel, "full", ["piped_bamprep", "variantcall_sample"],
                             samples, dirs, config,
                             multiplier=len(regions["analysis"]), max_multicore=1) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment post-processing")
            samples = region.parallel_prep_region(samples, regions, run_parallel)
            logger.info("Timing: variant calling")
            samples = region.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with global_parallel(parallel, "persample", ["postprocess_variants"],
                             samples, dirs, config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: variant post-processing")
            samples = run_parallel("postprocess_variants", samples)
            logger.info("Timing: validation")
            samples = run_parallel("compare_to_rm", samples)
            samples = combine_multiple_callers(samples)
            logger.info("Timing: ensemble calling")
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
            samples = validate.summarize_grading(samples)
        ## Finalizing BAMs and population databases, handle multicore computation
        with global_parallel(parallel, "multicore2", ["prep_gemini_db", "delayed_bam_merge"],
                             samples, dirs, config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: prepped BAM merging")
            samples = region.delayed_bamprep_merge(samples, run_parallel)
            logger.info("Timing: structural variation")
            samples = structural.run(samples, run_parallel)
            logger.info("Timing: population database")
            samples = population.prep_db_parallel(samples, run_parallel)
            logger.info("Timing: quality control")
            samples = qcsummary.generate_parallel(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Example #57
0
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    data = _get_batch_representative(items, "vrn_file")
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get("vrn_file")
    data = _symlink_to_workdir(data, ["vrn_file"])
    data = _symlink_to_workdir(data,
                               ["config", "algorithm", "variant_regions"])
    if data.get("align_bam") and data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data["vrn_file"] = annotation.finalize_vcf(data["vrn_file"],
                                                   get_variantcaller(data),
                                                   orig_items)
        logger.info("Filtering for %s" % cur_name)
        data["vrn_file"] = variant_filtration(
            data["vrn_file"], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}), data,
            orig_items)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data,
                                                       orig_items)
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)

        data = damage.run_filter(data["vrn_file"], dd.get_align_bam(data),
                                 dd.get_ref_file(data), data, orig_items)
    if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file):
        data["vrn_file"] = orig_vrn_file
    return [[data]]
Example #58
0
def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/smallRNA-seq/QC pipeline.

    Handles FastQC 0.11+, which uses a single HTML file, as well as older versions
    that use a directory of files plus images. The goal is to eventually support
    only 0.11+.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_file = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
                   if data.get("analysis", "").lower()
                   not in ["standard", "smallrna-seq"] else None)
        if ds_file is not None:
            bam_file = ds_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [
                    config_utils.get_program("fastqc", data["config"]), "-d",
                    tx_tmp_dir, "-t",
                    str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt,
                    bam_file
                ]
                cl = "%s %s" % (utils.local_path_export(), " ".join(
                    [str(x) for x in cl]))
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir,
                                             "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(
                        tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \
                            open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name:
                        for line in fastqc_bam_name:
                            fastqc_sample_name.write(
                                line.replace(os.path.basename(bam_file),
                                             fastqc_clean_name))
                    shutil.move(
                        os.path.join(tx_fastqc_out, "_fastqc_data.txt"),
                        os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    if os.path.exists("%s.zip" % tx_fastqc_out):
                        shutil.move(
                            "%s.zip" % tx_fastqc_out,
                            os.path.join(fastqc_out,
                                         "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    raise ValueError(
                        "FastQC failed to produce output HTML file: %s" %
                        os.listdir(tx_tmp_dir))
    logger.info("Produced HTML report %s" % sentry_file)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats
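A minimal standalone sketch of the downsampling decision in the function above; the 1e7 cap and the exempt pipelines mirror the code, while the helper name is hypothetical:

def _downsample_target(analysis_type, read_cap=1e7):
    """Return the read count to downsample to, or None for pipelines run on full data."""
    exempt = {"standard", "smallrna-seq"}
    return None if analysis_type.lower() in exempt else read_cap

# e.g. _downsample_target("variant2") -> 10000000.0; _downsample_target("Standard") -> None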
Example #59
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (for parsing additional data) picks the sample name from the directory name, as in:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"),
                                        data, []):
            logger.info("Full qualimap analysis for %s may be slow." %
                        bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem,
                tx_results_dir)
            cmd = (
                "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                "--skip-duplicated --skip-dup-mode 0 "
                "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data)
                       if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(
                    bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()),
                   "Qualimap: %s" % dd.get_sample_name(data),
                   env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir,
                                           "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd,
                   "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir,
                                                    base_results_file)
    }
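A sketch of the on-disk layout the function above aims to produce, matching the comments about how MultiQC infers the sample name from the directory; sample and directory names are placeholders:

import os

# Placeholders; real values come from out_dir and dd.get_sample_name(data).
out_dir = "qc/sample1/qualimap"
sample_name = "sample1"

results_dir = os.path.join(out_dir, sample_name)                 # qc/sample1/qualimap/sample1
report_file = os.path.join(results_dir, "qualimapReport.html")
results_file = os.path.join(results_dir, "genome_results.txt")
# copy kept at the top level so it retains its name after upload
base_results_file = os.path.join(out_dir, "genome_results.txt")
# file MultiQC reads; the sample name comes from the parent directory
insert_size = os.path.join(results_dir, "raw_data_qualimapReport",
                           "insert_size_histogram.txt")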
Example #60
0
File: fastqc.py Project: roryk/bipy
    def _memoized_message(self, in_file, out_file):
        logger.info("%s already run on %s and stored as %s, skipping." %
                    (self.stage, in_file, out_file))