def variants(data):
    if "vrn_file" not in data:
        return data
    in_vcf = data['vrn_file']
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data['work_bam']
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        jvm_opts = broad.get_gatk_framework_opts(data['config'])
        gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
        bed_file = dd.get_variant_regions(data)
        sample = splitext_plus(os.path.basename(in_vcf))[0]
        in_bam = data["work_bam"]
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_cg-depth-parse.tsv")
        if not file_exists(cg_file):
            with file_transaction(cg_file) as tx_out:
                cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                       "-L {bed_file} -I {in_bam} "
                       "-A GCContent --variant {in_vcf} --out {tx_out}")
                do.run(cmd.format(**locals()), " GC bias for %s" % in_vcf)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >>out_handle, "CG\tdepth\tsample"
                cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
                do.run(cmd.format(**locals()), " query for %s" % in_vcf)
                logger.debug('parsing coverage: %s' % sample)
        # return df
    return data
def coverage(data):
    bed_file = dd.get_coverage_experimental(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = os.path.splitext(os.path.basename(in_bam))[0]
        logger.debug("doing coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        if not file_exists(parse_file):
            total_cov = cov_class(0, None, sample)
            bam_api = pysam.AlignmentFile(in_bam)
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >>out_handle, "#chrom\tstart\tend\tregion\treads\tstrand\tsize\tsample\tmean\tsdt\tq10\tq20\tq4\tq50"
                with tmpfile() as tx_tmp_file:
                    # tx_tmp_file = "tmpintersect"
                    for line in region_bed:
                        chrom = line.chrom
                        start = max(line.start, 0)
                        end = line.end
                        region_file = pybedtools.BedTool(str(line), from_string=True).saveas().fn
                        coords = "%s:%s-%s" % (chrom, start, end)
                        cmd = ("samtools view -b {in_bam} {coords} | "
                               "bedtools coverage -a {region_file} -b - -hist > {tx_tmp_file}")
                        _silence_run(cmd.format(**locals()))
                        total_cov = _get_exome_coverage_stats(os.path.abspath(tx_tmp_file), sample, out_tx, total_cov)
            total_cov.write_coverage(parse_total_file)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.debug("Combining fastq and BAM files %s" % str(data["name"]))
    config = config_utils.update_w_custom(data["config"], data["info"])
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"], data["dirs"]["work"], config)
    else:
        fastq1, fastq2 = None, None
    out_file = os.path.join(data["dirs"]["work"],
                            data["info"]["rgnames"]["sample"] + ".bam")
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"], config, out_file=out_file)
    return [[{"name": data["name"],
              "metadata": data["info"].get("metadata", {}),
              "info": data["info"],
              "genome_build": data["genome_build"],
              "sam_ref": data["sam_ref"],
              "work_bam": sort_bam,
              "fastq1": fastq1,
              "fastq2": fastq2,
              "dirs": data["dirs"],
              "config": config,
              "config_file": data["config_file"]}]]
def _af_filter(data, in_file, out_file):
    """Soft-filter variants with AF below min_allele_fraction (appends "MinAF" to FILTER)
    """
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = cyvcf2.VCF(in_file)
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = cyvcf2.Writer(tx_out_file, vcf)
            # GATK 3.x can produce VCFs without sample names for empty VCFs
            try:
                tumor_index = vcf.samples.index(dd.get_sample_name(data))
            except ValueError:
                tumor_index = None
            for rec in vcf:
                if tumor_index is not None and np.all(rec.format('AF')[tumor_index] < min_freq):
                    vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def gatk_realigner_targets(runner, align_bam, ref_file, dbsnp=None, region=None,
                           out_file=None, deep_coverage=False, variant_regions=None):
    """Generate a list of interval regions for realignment around indels.
    """
    if out_file:
        out_file = "%s.intervals" % os.path.splitext(out_file)[0]
    else:
        out_file = "%s-realign.intervals" % os.path.splitext(align_bam)[0]
    # check only for file existence; interval files can be empty after running
    # on small chromosomes, so don't rerun in those cases
    if not os.path.exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            logger.debug("GATK RealignerTargetCreator: %s %s" %
                         (os.path.basename(align_bam), region))
            params = ["-T", "RealignerTargetCreator",
                      "-I", align_bam,
                      "-R", ref_file,
                      "-o", tx_out_file,
                      "-l", "INFO",
                      ]
            region = subset_variant_regions(variant_regions, region, tx_out_file)
            if region:
                params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
            if dbsnp:
                params += ["--known", dbsnp]
            if deep_coverage:
                params += ["--mismatchFraction", "0.30",
                           "--maxIntervalSize", "650"]
            runner.run_gatk(params)
    return out_file
def run(cmd, descr=None, data=None, checks=None, region=None, log_error=True,
        log_stdout=False, env=None):
    """Run the provided command, logging details and checking for errors.
    """
    if descr:
        descr = _descr_str(descr, data, region)
        logger.debug(descr)
    cmd_id = diagnostics.start_cmd(cmd, descr or "", data)
    try:
        logger_cl.debug(" ".join(str(x) for x in cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks, log_stdout, env=env)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    finally:
        diagnostics.end_cmd(cmd_id)
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
def _move_file_with_sizecheck(tx_file, final_file):
    """Move transaction file to final location, with size checks avoiding failed transfers.

    Creates an empty file with a '.bcbiotmp' extension in the destination
    location, which serves as a flag. If a file like that is present, it means
    that the transaction didn't finish successfully.
    """
    logger.debug("Moving %s to %s" % (tx_file, final_file))
    tmp_file = final_file + ".bcbiotmp"
    open(tmp_file, 'wb').close()
    want_size = utils.get_size(tx_file)
    shutil.move(tx_file, final_file)
    transfer_size = utils.get_size(final_file)
    assert want_size == transfer_size, (
        'distributed.transaction.file_transaction: File copy error: '
        'file or directory on temporary storage ({}) size {} bytes '
        'does not equal size of file or directory after transfer to '
        'shared storage ({}) size {} bytes'.format(tx_file, want_size, final_file, transfer_size))
    utils.remove_safe(tmp_file)
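# A minimal standalone sketch of the same sentinel-plus-size-check move pattern,
# assuming plain local files; the helper below is illustrative and not part of bcbio.
import os
import shutil

def move_with_sizecheck(src, dst):
    """Move src to dst; a '.bcbiotmp' sentinel marks an unverified transfer."""
    sentinel = dst + ".bcbiotmp"
    open(sentinel, "wb").close()          # sentinel exists -> transfer not yet verified
    want_size = os.path.getsize(src)      # record the size before the move
    shutil.move(src, dst)
    got_size = os.path.getsize(dst)
    assert want_size == got_size, (
        "size mismatch after move: %s (%s bytes) -> %s (%s bytes)"
        % (src, want_size, dst, got_size))
    os.remove(sentinel)                   # only removed on a verified transfer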
def process_lane(lane, pruned_fc, rawdata_fc, analysis_fc):
    """Models bcbio process lane"""
    multiplex = lane.get_samples()
    logger.info("Processing lane %s; reference genome %s" %
                (lane.get_name(), lane.get_genome_build()))
    if multiplex:
        logger.debug("Project %s is multiplexed as: %s" % (lane.get_name(), multiplex))
    fq = _get_barcoded_fastq_files(lane, multiplex, pruned_fc.get_fc_date(),
                                   pruned_fc.get_fc_name(), pruned_fc.get_fc_dir())
    ## Move data along with fastq files
    fc_data_dir = rawdata_fc.get_fc_dir()
    _make_dir(fc_data_dir, "data delivery directory")
    if options.install_data:
        data, fastqc = _get_analysis_results(pruned_fc, lane)
        _deliver_data(data, fastqc, analysis_fc.get_fc_dir())
    fastq_targets = list()
    for fqpair in fq:
        for fastq_src in fqpair:
            fastq_tgt = fastq_src
            if options.customer_delivery or options.barcode_id_to_name:
                fastq_tgt = _convert_barcode_id_to_name(multiplex, rawdata_fc.get_fc_name(), fastq_src)
                fastq_tgt = fastq_tgt.replace("_nophix_", "_")
            _deliver_fastq_file(fastq_src, os.path.basename(fastq_tgt), fc_data_dir)
            fastq_targets.append(os.path.join(fc_data_dir, os.path.basename(fastq_tgt)))
    lane.set_files(fastq_targets)
    return lane
def _do_run(cmd, checks):
    """Perform running and check results, raising errors for issues.
    """
    cmd, shell_arg, executable_arg = _normalize_cmd_args(cmd)
    s = subprocess.Popen(cmd, shell=shell_arg, executable=executable_arg,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    debug_stdout = collections.deque(maxlen=100)
    with contextlib.closing(s.stdout) as stdout:
        while 1:
            line = stdout.readline()
            exitcode = s.poll()
            if exitcode is not None:
                if exitcode is not None and exitcode != 0:
                    error_msg = " ".join(cmd) if not isinstance(cmd, basestring) else cmd
                    error_msg += "\n"
                    error_msg += "".join(debug_stdout)
                    raise subprocess.CalledProcessError(exitcode, error_msg)
                else:
                    break
            if line:
                debug_stdout.append(line)
                logger.debug(line.rstrip())
    # Check for problems not identified by shell return codes
    if checks:
        for check in checks:
            if not check():
                raise IOError("External command failed")
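# A hedged, self-contained sketch of the streaming pattern above: read the child's
# combined stdout/stderr line by line, keep only the last 100 lines for error
# reporting, and raise CalledProcessError on a non-zero exit. It takes a shell
# command string and uses only the standard library; no bcbio helpers.
import collections
import contextlib
import subprocess

def run_streaming(cmd):
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    tail = collections.deque(maxlen=100)  # bounded buffer of recent output lines
    with contextlib.closing(proc.stdout) as stdout:
        for line in iter(stdout.readline, b""):
            tail.append(line)
        exitcode = proc.wait()
    if exitcode != 0:
        raise subprocess.CalledProcessError(exitcode, cmd, output=b"".join(tail))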
def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files,
                       region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metadata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample, data=data)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data)
    return os.path.abspath(parse_file)
def _make_isomir_counts(data, srna_type="seqbuster", out_dir=None, stem=""):
    """
    Parse miraligner files to create count matrix.
    """
    work_dir = dd.get_work_dir(data[0][0])
    if not out_dir:
        out_dir = op.join(work_dir, "mirbase")
    out_novel_isomir = append_stem(op.join(out_dir, "counts.tsv"), stem)
    out_novel_mirna = append_stem(op.join(out_dir, "counts_mirna.tsv"), stem)
    if file_exists(out_novel_mirna):
        return [out_novel_mirna, out_novel_isomir]
    out_dts = []
    for sample in data:
        if sample[0].get(srna_type):
            miraligner_fn = sample[0][srna_type]
            reads = _read_miraligner(miraligner_fn)
            if reads:
                out_file, dt, dt_pre = _tab_output(reads, miraligner_fn + ".back",
                                                   dd.get_sample_name(sample[0]))
                out_dts.append(dt)
            else:
                logger.debug("WARNING::%s has no miRNA annotated for %s. Check if the fasta file is small or the species value." % (dd.get_sample_name(sample[0]), srna_type))
    if out_dts:
        out_files = _create_counts(out_dts, out_dir)
        out_files = [move_safe(out_files[0], out_novel_isomir),
                     move_safe(out_files[1], out_novel_mirna)]
        return out_files
    else:
        logger.debug("WARNING::no samples have miRNA annotated for %s. Check if the fasta file is small or the species value." % srna_type)
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])
    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            do.run(cmd.format(**locals()), "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def jexl_hard(broad_runner, snp_file, ref_file, filter_type, expressions):
    """Perform hard filtering with GATK using JEXL expressions.

    Variant quality score recalibration will not work on some regions; it
    requires enough positions to train the model. This provides a general
    wrapper around GATK to do cutoff based filtering.
    """
    base, ext = os.path.splitext(snp_file)
    out_file = "{base}-filter{ftype}{ext}".format(base=base, ext=ext, ftype=filter_type)
    if not utils.file_exists(out_file):
        logger.debug("Hard filtering %s with %s" % (snp_file, expressions))
        with file_transaction(out_file) as tx_out_file:
            params = ["-T", "VariantFiltration",
                      "-R", ref_file,
                      "-l", "ERROR",
                      "--out", tx_out_file,
                      "--variant", snp_file]
            for exp in expressions:
                params.extend(["--filterName", "GATKStandard{e}".format(e=exp.split()[0]),
                               "--filterExpression", exp])
            broad_runner.run_gatk(params)
    return out_file
def variants(data):
    if "vrn_file" not in data:
        return data
    in_vcf = data['vrn_file']
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data['work_bam']
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        jvm_opts = broad.get_gatk_framework_opts(data['config'])
        gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
        bed_file = dd.get_variant_regions(data)
        sample = splitext_plus(os.path.basename(in_vcf))[0]
        in_bam = data["work_bam"]
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_cg-depth-parse.tsv")
        if not file_exists(cg_file):
            with file_transaction(cg_file) as tx_out:
                cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                       "-L {bed_file} -I {in_bam} "
                       "-A GCContent --variant {in_vcf} --out {tx_out}")
                do.run(cmd.format(**locals()), " GC bias for %s" % in_vcf)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    print >>out_handle, "CG\tdepth\tsample"
                cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
                do.run(cmd.format(**locals()), " query for %s" % in_vcf)
                logger.debug('parsing coverage: %s' % sample)
        # return df
    return data
def long_term_storage(remote_info, config_file):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be setup so that no password entry
    is necessary. Fabric is used to manage setting up copies on the remote
    storage server.
    """
    import fabric.api as fabric
    import fabric.contrib.files as fabric_files
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    logger.info("Copying run data over to remote storage: %s" % config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    base_dir = config["store_dir"]
    fabric.env.host_string = "%s@%s" % (config["store_user"], config["store_host"])
    fc_dir = os.path.join(base_dir, os.path.basename(remote_info['directory']))
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)
    for fcopy in remote_info['to_copy']:
        target_loc = os.path.join(fc_dir, fcopy)
        if not fabric_files.exists(target_loc):
            target_dir = os.path.dirname(target_loc)
            if not fabric_files.exists(target_dir):
                fabric.run("mkdir -p %s" % target_dir)
            cl = ["scp", "-r",
                  "%s@%s:%s/%s" % (remote_info["user"], remote_info["hostname"],
                                   remote_info["directory"], fcopy),
                  target_loc]
            fabric.run(" ".join(cl))
def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files,
                       region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metadata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file, items=items)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def coverage(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(out_dir)
    if not bed_file:
        return None
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000",
                           "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
def cpg_postprocessing(data):
    mC = data["cpg_file"]
    if "control" not in data:
        return [[data]]
    hmC = data["control"]
    out_file = append_stem(mC, "_hmC")
    pos = 0
    pos_hmC = 0
    data["hmc_file"] = out_file
    if file_exists(out_file):
        return [[data]]
    logger.debug("processing %s versus %s" % (mC, hmC))
    with file_transaction(out_file) as out_tx:
        with open(out_tx, "w") as out_handle:
            with open(mC) as mC_h:
                with open(hmC) as hmC_h:
                    for line in mC_h:
                        cols = line.strip().split("\t")
                        if cols[3] != "CG":
                            continue
                        pos = int(cols[1])
                        counts = [int(float(cols[5])), int(cols[6])]
                        if pos < pos_hmC:
                            continue
                        elif pos > pos_hmC:
                            hmC_h, hmC = _sync_pos(hmC_h, pos)
                            if not hmC_h:
                                break
                            pos_hmC = hmC["pos"]
                        if counts[0] < 9 or hmC["counts"][0] < 9:
                            continue
                        if pos == hmC["pos"]:
                            pvalue = _call_hmc(counts, hmC["counts"])
                            print >>out_handle, "%s\t%s\t%s" % (line.strip(), "\t".join(hmC["info"]), pvalue)
    return [[data]]
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} -C 200 "
                       "-T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 -T 80 -T 100 "
                       "-L {bed_file} {in_bam} | sed 's/# chrom/chrom/' > {parse_file}")
                do.run(cmd.format(**locals()), "Run coverage for {}".format(sample))
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def _prep_bed(data, work_dir):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)
    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file
    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000",
                           "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def get_average_coverage(data, bam_file, bed_file=None, target_name="genome",
                         file_prefix=None):
    logger.debug("Calculating average coverage of " + bam_file + " on " + target_name +
                 ((" " + bed_file) if bed_file else ""))
    file_prefix = file_prefix or os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    cache_file = file_prefix + "-" + target_name + "-stats.yaml"
    if utils.file_uptodate(cache_file, bam_file):
        with open(cache_file) as in_handle:
            stats = yaml.safe_load(in_handle)
        return stats["avg_coverage"]
    if bed_file:
        avg_cov = _average_target_coverage(data, bed_file, bam_file, target_name=target_name)
    else:
        avg_cov = _average_genome_coverage(data, bam_file)
    stats = {"avg_coverage": avg_cov}
    with open(cache_file, "w") as out_handle:
        yaml.safe_dump(stats, out_handle, default_flow_style=False, allow_unicode=False)
    return avg_cov
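# A minimal sketch of the caching idiom used above, assuming only PyYAML:
# recompute when the cache file is missing or older than the input BAM,
# otherwise reuse the stored value. compute_stat is a caller-supplied placeholder.
import os
import yaml

def cached_avg_coverage(cache_file, bam_file, compute_stat):
    if (os.path.exists(cache_file)
            and os.path.getmtime(cache_file) >= os.path.getmtime(bam_file)):
        with open(cache_file) as in_handle:
            return yaml.safe_load(in_handle)["avg_coverage"]
    stats = {"avg_coverage": compute_stat(bam_file)}
    with open(cache_file, "w") as out_handle:
        yaml.safe_dump(stats, out_handle, default_flow_style=False)
    return stats["avg_coverage"]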
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data), mirbase, data['config'])
    else:
        logger.debug("No annotation file from miRBase.")
    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,
                                              op.join(dd.get_work_dir(data), "mirdeep2", "novel"),
                                              data['config'])
    if "trna" in tools:
        data['trna'] = _trna_annotation(data)
    data = spikein.counts_spikein(data)
    return [[data]]
def _do_run(cmd, checks):
    """Perform running and check results, raising errors for issues.
    """
    s = subprocess.Popen(cmd, shell=isinstance(cmd, basestring),
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    debug_stdout = collections.deque(maxlen=100)
    with contextlib.closing(s.stdout) as stdout:
        while 1:
            line = stdout.readline()
            exitcode = s.poll()
            if exitcode is not None:
                if exitcode is not None and exitcode != 0:
                    error_msg = " ".join(cmd) if not isinstance(cmd, basestring) else cmd
                    error_msg += "\n"
                    error_msg += "".join(debug_stdout)
                    raise subprocess.CalledProcessError(exitcode, error_msg)
                else:
                    break
            if line:
                debug_stdout.append(line)
                logger.debug(line.rstrip())
    # Check for problems not identified by shell return codes
    if checks:
        for check in checks:
            if not check():
                raise IOError("External command failed")
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} -C 1000 "
                       "-T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 -T 80 -T 100 "
                       "-L {bed_file} {in_bam} | sed 's/# chrom/chrom/' > {parse_file}")
                do.run(cmd.format(**locals()), "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data
    cleaned_bed = os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed"
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
                cleaned_bed = bed.decomment(bed_file, cleaned_bed)
                with file_transaction(parse_file) as out_tx:
                    cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} "
                           "-C 1000 -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()), "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                depth_thresholds = sorted(list(cutoffs | extra_cutoffs))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
def get_coverage(data):
    """Calculate coverage for a sample BAM, accounting for GC content.

    data is a single sample.
    """
    data = utils.to_single_data(data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    sample_name = dd.get_sample_name(data)
    work_dir = _sv_workdir(data)
    rscript = utils.Rscript_cmd("r36")
    coverage_r = utils.R_package_script("r36", "PureCN", "extdata/Coverage.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], data)
    # PureCN resolves symlinks and the actual output PureCN coverage file name
    # is derived from the end bam not from bam_file
    bam_file = os.path.realpath(dd.get_align_bam(data))
    bam_name = os.path.basename(bam_file)
    (bname, ext) = os.path.splitext(bam_name)
    result_file = os.path.join(work_dir, bname + "_coverage_loess.txt.gz")
    if not os.path.exists(result_file):
        cmd = [rscript, coverage_r,
               "--outdir", work_dir,
               "--bam", bam_file,
               "--intervals", intervals]
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                              utils.get_R_exports(env="r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN coverage")
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed to calculate coverage")
        logger.debug("Saved PureCN coverage files to " + result_file)
    return result_file
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data), mirbase, data['config'])
    else:
        logger.debug("No annotation file from miRBase.")
    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,
                                              op.join(dd.get_work_dir(data), "mirdeep2", "novel"),
                                              data['config'])
    if "trna" in tools:
        data['trna'] = _trna_annotation(data)
    data = spikein.counts_spikein(data)
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq.
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapters found in %s; this may be related to"
                        " a lack of small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq.
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def query_gsm(gsm, out_file, config={}):
    gsm = gsm[0]
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra\&term={0}\&retmode=json".format(gsm)
    cmd = "curl {0}".format(url)
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    out = process.stdout.read()
    data = json.loads(out)
    ids = data.get("esearchresult", {}).get("idlist", [])
    logger.debug("Get id sample for %s" % gsm)
    if ids:
        gsm_info = _query_info("sra", ids[-1])
        logger.debug("gsm_info:%s" % gsm_info)
        srrall = []
        for srr in gsm_info:
            srrall.append(_create_link(srr))
        logger.debug("Get FTP link for %s : %s" % (ids[-1], srrall))
        outs = []
        for srx in srrall:
            sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
            srafiles = _download_srx(srx, sra_dir)
            if srafiles:
                logger.debug("Get SRA for %s: %s" % (gsm, " ".join(srafiles)))
                for sra in srafiles:
                    fastq_fn = _convert_fastq(sra, out_dir)
                    if fastq_fn:
                        outs.extend(fastq_fn)
        logger.debug("Get FASTQ for %s: %s" % (gsm, " ".join(outs)))
        if outs:
            files = combine_pairs(outs)
            out_file = fastq.merge(files, out_file, config)
            return out_file
def prepare_sample(data):
    """Prepare a sample to be run, potentially converting from BAM to FASTQ
    and/or downsampling the number of reads for a test run.
    """
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    data["files"] = get_fastq_files(data)
    return [[data]]
def process_lane(lane, pruned_fc, rawdata_fc, analysis_fc):
    """Models bcbio process lane"""
    multiplex = lane.get_samples()
    logger.info("Processing lane %s; reference genome %s" %
                (lane.get_name(), lane.get_genome_build()))
    if multiplex:
        logger.debug("Project %s is multiplexed as: %s" % (lane.get_name(), multiplex))
    fq = _get_barcoded_fastq_files(lane, multiplex, pruned_fc.get_fc_date(),
                                   pruned_fc.get_fc_name(), pruned_fc.get_fc_dir())
    ## Move data along with fastq files
    fc_data_dir = rawdata_fc.get_fc_dir()
    _make_dir(fc_data_dir, "data delivery directory")
    if options.install_data:
        data, fastqc = _get_analysis_results(pruned_fc, lane)
        _deliver_data(data, fastqc, analysis_fc.get_fc_dir())
    fastq_targets = list()
    for fqpair in fq:
        for fastq_src in fqpair:
            fastq_tgt = fastq_src
            if options.customer_delivery or options.barcode_id_to_name:
                fastq_tgt = _convert_barcode_id_to_name(multiplex, rawdata_fc.get_fc_name(), fastq_src)
                fastq_tgt = fastq_tgt.replace("_nophix_", "_")
            _deliver_fastq_file(fastq_src, os.path.basename(fastq_tgt), fc_data_dir)
            fastq_targets.append(os.path.join(fc_data_dir, os.path.basename(fastq_tgt)))
    lane.set_files(fastq_targets)
    return lane
def process_intervals(data):
    """Prepare intervals file"""
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None
    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r,
           "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                          utils.get_R_exports(env="r36"),
                                                          " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.debug("Combining fastq and BAM files %s" % str(data["name"]))
    config = config_utils.update_w_custom(data["config"], data["info"])
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"], data["dirs"]["work"], config)
    else:
        fastq1, fastq2 = None, None
    out_file = os.path.join(data["dirs"]["work"],
                            data["info"]["rgnames"]["sample"] + ".bam")
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"], config, out_file=out_file)
    return [[{"name": data["name"],
              "metadata": data["info"].get("metadata", {}),
              "info": data["info"],
              "genome_build": data["genome_build"],
              "sam_ref": data["sam_ref"],
              "work_bam": sort_bam,
              "fastq1": fastq1,
              "fastq2": fastq2,
              "dirs": data["dirs"],
              "config": config,
              "config_file": data["config_file"]}]]
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
def priority_total_coverage(data, out_dir):
    """
    Calculate coverage at 10 depth intervals in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
def long_term_storage(remote_info, config_file):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be setup so that no password entry
    is necessary. Fabric is used to manage setting up copies on the remote
    storage server.
    """
    import fabric.api as fabric
    import fabric.contrib.files as fabric_files
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    logger.info("Copying run data over to remote storage: %s" % config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    base_dir = config["store_dir"]
    fabric.env.host_string = "%s@%s" % (config["store_user"], config["store_host"])
    fc_dir = os.path.join(base_dir, os.path.basename(remote_info['directory']))
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)
    for fcopy in remote_info['to_copy']:
        target_loc = os.path.join(fc_dir, fcopy)
        if not fabric_files.exists(target_loc):
            target_dir = os.path.dirname(target_loc)
            if not fabric_files.exists(target_dir):
                fabric.run("mkdir -p %s" % target_dir)
            cl = ["scp", "-r",
                  "%s@%s:%s/%s" % (remote_info["user"], remote_info["hostname"],
                                   remote_info["directory"], fcopy),
                  target_loc]
            fabric.run(" ".join(cl))
def _run_purecn_normaldb(paired, out):
    """Run PureCN with normaldb and native segmentation.

    paired is one t/n pair or a tumor-only sample.
    """
    sample = utils.to_single_data(paired.tumor_data)
    bed_file = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # germline and somatic - just annotated and filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(["config", "algorithm", "background", "cnv_reference", "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mappingbiasfile", mappingbiasfile,
           "--intervals", intervals,
           "--snpblacklist", simple_repeat_bed,
           "--genome", genome,
           "--force", "--postoptimize",
           "--seed", "123",
           "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use a matched normal sample in PureCN analysis,
    # because then it skips the PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(env="r36"),
                                                              utils.get_R_exports(env="r36"),
                                                              " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as msg:
            logger.info("PureCN failed")
    out_base, out, all_files = _get_purecn_files(paired, work_dir, require_exist=True)
    return out
def variants(data, out_dir):
    """Variants QC metrics"""
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print >>out_handle, "CG\tdepth\tsample"
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # These files will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
def variants(data, out_dir):
    """Variants QC metrics"""
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print >>out_handle, "CG\tdepth\tsample"
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # These files will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq.
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapters found in %s; this may be related to"
                        " a lack of small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def _dnapi_prediction(fn, out_dir):
    end_file = _prepare_file(fn, out_dir)
    iterative_result = iterative_adapter_prediction(end_file, [1.2, 1.3, 1.4, 1.7, 2], [7, 11], 500000)
    max_score = iterative_result[1][1]
    adapters = list()
    for a in iterative_result:
        if a[1] > max_score * 0.40:
            logger.debug("Adding adapter to the list: %s with score %s" % (a[0], a[1]))
            adapters.append(a[0])
    return adapters
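# Illustrative sketch of the selection rule above: keep every predicted adapter
# whose score exceeds 40% of a reference score (the maximum here; the function
# above uses the score of the second-ranked result). The tuples are made-up
# example values, not real DNApi output.
def select_adapters(predictions, fraction=0.40):
    reference = max(score for _, score in predictions)
    return [seq for seq, score in predictions if score > reference * fraction]

# select_adapters([("TGGAATTC", 95.0), ("TGGAATT", 60.0), ("AAAAAAA", 5.0)])
# -> ["TGGAATTC", "TGGAATT"]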
def _remove_transferred_files(remote_info, config):
    """Remove the files transferred in a previous test.
    """
    copy_to = os.path.realpath("../transfer_data/copy_to")
    with fabric.settings(host_string="%s@%s" % (config["store_user"], config["store_host"])):
        rm_str = "rm -r %s/%s" % (copy_to, os.path.split(remote_info["directory"])[1])
        logger.debug(rm_str)
        fabric.run(rm_str)
def _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name):
    sambamba_depth_file = regions_coverage(data, bed_file, bam_file, "sv_regions")
    out_file = os.path.join(work_dir, sample_name + '-coverage.tsv')
    if not utils.file_exists(out_file):
        logger.debug('Converting sambamba depth output to cov2lr.pl input in ' + sample_name)
        with file_transaction(data, out_file) as tx_out_file:
            _sambabma_depth_to_seq2cov(sambamba_depth_file, tx_out_file, sample_name)
        logger.debug("Saved to " + out_file)
    return out_file
def long_term_storage(remote_info, config_file):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be setup so that no password entry
    is necessary. Fabric is used to manage setting up copies on the remote
    storage server.
    """
    config = load_config(config_file)
    logger.info("Copying run data over to remote storage: %s" % config["store_host"])
    logger.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    _copy_for_storage(remote_info, config)
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP               {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP               TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(alt_counts)  AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(alt_counts)  AD = ref_count,alt_counts
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = cyvcf2.VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = cyvcf2.Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:,0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format('TIR')[:,0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:,0]
                elif rec.format("AD") is not None:  # germline?
                    alt_counts = rec.format('AD')[:,1:]  # AD=REF,ALT1,ALT2,...
                    dp = np.sum(rec.format('AD')[:,0:], axis=1)[:, None]
                else:  # germline gVCF record
                    alt_counts, dp = (None, None)
                if dp is not None:
                    with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
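# Hedged sketch of the AF arithmetic above using numpy only: divide per-sample alt
# counts by depth and map division-by-zero or NaN results to 0.0, as the function
# does. The arrays are example values, not real Strelka2 output.
import numpy as np

alt_counts = np.array([[4.], [0.]])   # one alt count per sample
dp = np.array([[20.], [0.]])          # matching per-sample depths
with np.errstate(divide="ignore", invalid="ignore"):
    af = np.true_divide(alt_counts, dp)
af[~np.isfinite(af)] = 0.0            # -inf/inf/NaN -> 0.0
# af is now [[0.2], [0.0]]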
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        print >>out_handle, "CG\tdepth\tsample"
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug("parsing coverage: %s" % sample)
    return data
def prepare_sample(data):
    """Prepare a sample to be run, potentially converting from BAM to FASTQ
    and/or downsampling the number of reads for a test run.
    """
    data = utils.to_single_data(data)
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    data["files"] = get_fastq_files(data)
    # get_fastq_files swaps over quality scores to standard, unless trimming
    if not dd.get_trim_reads(data):
        data = dd.set_quality_format(data, "standard")
    return [[data]]