def split(args):
    """Chunk the genome into overlapping loci for parallel processing.

    Returns a martian chunk-def dict; each chunk gets a set of loci and a
    fixed memory reservation.
    """
    assert (args.min_len >= 2 * args.bin_size)
    # We only need the bam to get the chromosome names and lengths.
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    target_regions = None
    if args.targets is not None:
        with open(args.targets, 'r') as f:
            target_regions = tk_io.get_target_regions(f)
    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    all_loci = []
    for chrom_name, chrom_size in zip(in_bam.references, in_bam.lengths):
        if chrom_name not in primary_contigs:
            continue
        # The chunks will overlap by min_len. This will ensure that we don't
        # miss any regions of low depth that span chunk boundaries.
        all_loci.extend(
            generate_chrom_loci(target_regions, chrom_name, chrom_size,
                                PARALLEL_LOCUS_SIZE / 2,
                                overlap=args.min_len))
    in_bam.close()
    # Group loci
    locus_sets = pack_loci(all_loci)
    return {'chunks': [{'loci': loci, '__mem_gb': 16} for loci in locus_sets]}
def do_blacklist_filtering(events, blacklist_map):
    """Flag every event against each named blacklist BED file.

    blacklist_map maps a blacklist name to a BED file path; each event's
    check_blacklist method records whether it hits that blacklist.
    """
    for bl_name in blacklist_map:
        with open(blacklist_map[bl_name]) as bl_handle:
            blacklist = tk_io.get_target_regions(bl_handle)
        for event in events:
            event.check_blacklist(blacklist, bl_name)
def main(args, outs):
    """Write per-window confident-region coverage fractions to a BED-like file.

    For each input locus, tiles it with windows of args.window_size and
    writes `chrom  start  end  fraction_confident` rows, where the fraction
    is the portion of the window covered by args.confident_regions.

    Fixes over the original: the confident-regions file handle was opened
    and never closed, and the output file was not closed on an exception --
    both now use `with`.
    """
    args.coerce_strings()
    outs.coerce_strings()
    if args.confident_regions is None:
        confident_regions = None
    else:
        with open(args.confident_regions) as conf_file:
            confident_regions = tk_io.get_target_regions(conf_file)
    with open(outs.confident_windows, "w") as outfile:
        for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
            conf_regions = get_conf_regions(chrom, confident_regions)
            location = start
            while location < end:
                # Intersect one window with the confident regions to get the
                # confidently-covered fraction of that window.
                region = tk_regions.Regions(
                    regions=[(location, location + args.window_size)])
                isect = region.intersect(conf_regions)
                size = isect.get_total_size()
                percent = tk_stats.robust_divide(float(size),
                                                 float(args.window_size))
                row = [chrom, location, location + args.window_size, percent]
                outfile.write("\t".join(map(str, row)) + "\n")
                location += args.window_size
def get_coverage_regions(args):
    """Return the high-coverage-excluded Regions for the locus chromosome.

    Returns an empty Regions when the chromosome has no entry in the BED.

    Fixes over the original: `regions == None` compared a freshly
    constructed Regions object to None, which never matched -- the intent
    (visible in the `regions = Regions()` fallback) was to guard against
    `.get(chrom)` returning None, so test that lookup result directly.
    The BED file handle is also closed now instead of leaking.
    """
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    with open(args.high_coverage_excluded_bed) as bed_file:
        chrom_regions = tk_io.get_target_regions(bed_file).get(chrom)
    if chrom_regions is None:
        return Regions()
    return Regions(chrom_regions)
def annotate_bed_info(counts, bed_file):
    """Mark each count record with whether its position falls inside bed_file.

    Mutates every dict in `counts`, adding an 'in_bed' boolean keyed on the
    record's 'chrom' and 'pos'.

    Fixes over the original: the BED file handle leaked (never closed), and
    membership used the long-deprecated `dict.has_key`.
    """
    with open(bed_file) as f:
        regs = tk_io.get_target_regions(f)
    for c in counts:
        chrom_regs = regs.get(c['chrom'])
        in_bed = False
        if chrom_regs is not None:
            in_bed = chrom_regs.contains_point(c['pos'])
        c['in_bed'] = in_bed
def merge(bed1, bed2, bedOut):
    """Union the regions of two BED files and write the result to bedOut.

    When bed2 is missing/empty, bed1 is copied through unchanged.
    """
    if not bed2:
        shutil.copyfile(bed1, bedOut)
        return
    with open(bed1) as f:
        merged = tk_io.get_target_regions(f)
    with open(bed2) as f:
        extra = tk_io.get_target_regions(f)
    for chrom in extra:
        # Lazily create the chromosome entry, then add every extra region.
        for start, end in extra[chrom]:
            if chrom not in merged:
                merged[chrom] = tk_regions.Regions([])
            merged[chrom].add_region((start, end))
    writeOut(merged, bedOut)
def no_overlap(bed1, bed2, bedOut):
    """Write the regions of bed1 that overlap nothing in bed2 to bedOut.

    When bed2 is missing/empty, bed1 is copied through unchanged. Every
    chromosome present in bed1 appears in the output, even if all of its
    regions were filtered away.
    """
    if not bed2:
        shutil.copyfile(bed1, bedOut)
        return
    with open(bed1) as f:
        source = tk_io.get_target_regions(f)
    with open(bed2) as f:
        mask = tk_io.get_target_regions(f)
    kept = {}
    for chrom in source:
        kept.setdefault(chrom, tk_regions.Regions([]))
        for start, end in source[chrom]:
            blocked = chrom in mask and mask[chrom].overlaps_region(start, end)
            if not blocked:
                kept[chrom].add_region((start, end))
    writeOut(kept, bedOut)
def intersect(bed1, bed2, bedOut):
    """Write the region-wise intersection of two BED files to bedOut.

    When bed2 is missing/empty, bed1 is copied through unchanged. Only
    chromosomes present in both inputs appear in the output.
    """
    if not bed2:
        shutil.copyfile(bed1, bedOut)
        return
    with open(bed1) as f:
        left = tk_io.get_target_regions(f)
    with open(bed2) as f:
        right = tk_io.get_target_regions(f)
    common = {}
    for chrom in left:
        if chrom in right:
            common[chrom] = left[chrom].intersect(right[chrom])
    writeOut(common, bedOut)
def main(args, outs):
    """Build a sparse peak x barcode count matrix from a fragments file.

    Counts, for every barcode on the whitelist, how many fragment endpoints
    (start and stop separately) fall inside each peak, then saves the result
    as a CountMatrix h5.

    Fix over the original: the inner loop tested `contig in full_peaks.keys()`
    for every fragment endpoint -- in Python 2 `.keys()` builds a fresh list
    and membership is a linear scan. Replaced with a single dict lookup per
    fragment.
    """
    args.coerce_strings()
    outs.coerce_strings()
    outs.raw_matrix_mex = None
    if args.fragments is None:
        outs.raw_matrix = None
        return
    with open(args.peaks, 'r') as infile:
        full_peaks = tk_bio.get_target_regions(infile)
    # Map each peak's "chrom:start-end" label to its row index, in file order.
    with open(args.peaks, 'r') as pfile:
        peaks_dict = OrderedDict(
            ("{}:{}-{}".format(*peak.strip("\n").split("\t")), num)
            for num, peak in enumerate(pfile))
    # Map each whitelisted barcode to its column index, in file order.
    with open(args.barcodes, 'r') as barcode_file:
        barcodes_dict = OrderedDict(
            (bc.strip('\n'), num) for num, bc in enumerate(barcode_file))
    if len(barcodes_dict) == 0:
        outs.raw_matrix = None
        return

    # get matrix counts
    peak_bc_counts = Counter()
    for contig, start, stop, barcode, _ in open_fragment_file(args.fragments):
        if barcode not in barcodes_dict:
            continue
        contig_peaks = full_peaks.get(contig)
        if contig_peaks is None:
            continue
        # Both fragment endpoints are counted independently.
        for pos in (start, stop):
            peak = contig_peaks.get_region_containing_point(pos)
            if peak is not None:
                peak_bc_counts[barcodes_dict[barcode],
                               peaks_dict['{}:{}-{}'.format(
                                   contig, peak[0], peak[1])]] += 1

    data, col, row = (), (), ()
    if len(peak_bc_counts) > 0:
        data, col, row = zip(*[(val, key[0], key[1])
                               for key, val in peak_bc_counts.iteritems()])
    sp_matrix = csc_matrix(
        coo_matrix((data, (row, col)),
                   shape=(len(peaks_dict), len(barcodes_dict)),
                   dtype=int))

    # save as a CountMatrix
    genomes = utils.generate_genome_tag(args.reference_path)
    peaks_def = atac_feature_ref.from_peaks_bed(args.peaks, genomes)
    raw_matrix = cr_matrix.CountMatrix(peaks_def, barcodes_dict.keys(),
                                       sp_matrix)
    raw_matrix.save_h5_file(outs.raw_matrix,
                            sw_version=martian.get_pipelines_version())
def subtract(bed1, bed2, bedOut):
    """Write bed1 minus the intervals of bed2 to bedOut.

    Each region of bed1 is clipped by every overlapping region of bed2 via
    interval_subtract; the surviving pieces are written out. When bed2 is
    missing/empty, bed1 is copied through unchanged. Every chromosome of
    bed1 appears in the output, even when fully subtracted away.
    """
    if not bed2:
        shutil.copyfile(bed1, bedOut)
        return
    with open(bed1) as f:
        minuend = tk_io.get_target_regions(f)
    with open(bed2) as f:
        subtrahend = tk_io.get_target_regions(f)
    remaining = {}
    for chrom in minuend:
        remaining.setdefault(chrom, tk_regions.Regions([]))
        for start, end in minuend[chrom]:
            if chrom in subtrahend:
                hits = subtrahend[chrom].overlapping_regions(start, end)
            else:
                hits = []
            for piece in interval_subtract(start, end, hits):
                remaining[chrom].add_region(piece)
    writeOut(remaining, bedOut)
def split(args):
    """Partition the genome into loci and group them into parallel chunks.

    Returns a martian chunk-def dict with a fixed memory reservation per
    chunk.
    """
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    # Load pull-down targets, when a targets BED was supplied.
    target_regions = None
    if args.targets is not None:
        with open(args.targets, 'r') as f:
            target_regions = tk_io.get_target_regions(f)
    all_loci = []
    for chrom_name, chrom_size in zip(in_bam.references, in_bam.lengths):
        loci = generate_chrom_loci(target_regions, chrom_name, chrom_size,
                                   PARALLEL_LOCUS_SIZE)
        all_loci.extend(loci)
    in_bam.close()
    locus_sets = pack_loci(all_loci)
    return {'chunks': [{'loci': loci, '__mem_gb': 16}
                       for loci in locus_sets]}
def main_report_basic(args, outs):
    """Compute basic read statistics over one BAM chunk and pickle them.

    Fixes over the original:
    - `math.ceil(n_mapped / args.n_chunks)` performed Python 2 integer
      division first, so the ceil was applied to an already-floored value
      and n_chunk was underestimated whenever the division wasn't exact;
      the quotient is now computed in float.
    - the targets file handle was opened and never closed; it now uses
      `with`.
    """
    bam_in = pysam.Samfile(args.input, check_sq=False)
    targets_filename = args.targets_file
    references = bam_in.references

    if args.input_pos is not None:
        bam_in_pos = tk_bam.create_bam_infile(args.input_pos)
        n_mapped = bam_in_pos.mapped
        # Round up so the chunks cover all mapped reads.
        n_chunk = math.ceil(float(n_mapped) / args.n_chunks)
        bam_in_pos.close()
    else:
        n_mapped = 0
        n_chunk = 0

    if targets_filename is None or targets_filename == '':
        target_regions = None
    else:
        with open(targets_filename, 'r') as targets_file:
            target_regions = tk_io.get_target_regions(targets_file)

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    bam_slice = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))

    # do basic counting
    misc_sm, qual_sms = \
        compute_basic_stats(bam_slice, target_regions, n_chunk, references,
                            barcode_whitelist)

    misc_sm.save(outs.misc_sm)
    with open(outs.qual_sms, 'wb') as out_handle:
        pickle.dump(qual_sms, out_handle)
def main_report_single_partition(args, outs):
    """Summarize per-barcode fragments for one BAM chunk into HDF5 tables.

    Groups reads by barcode, summarizes each barcode's fragments, and writes
    sorted, tabix-indexed fragment and barcode tables.

    Fix over the original: the targets file handle was opened inline and
    never closed; it now uses `with`.
    """
    # Bail out if there no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.fragments = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)
    if args.targets_file is None:
        target_regions = None
    else:
        with open(args.targets_file) as targets_file:
            target_regions = tk_io.get_target_regions(targets_file)

    # Bail out if we're on a small genome
    if sum(bam_in.lengths) < 3e6:
        outs.fragments = None
        return

    bam_chunk = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt,
                                     lambda x: tk_io.get_read_barcode(x))

    bc_data = (summarize_barcode(bc, list(reads), args.read_link_distance,
                                 bam_in.references, target_regions)
               for (bc, reads) in bc_read_iter)
    bc_data_filt = (x for x in bc_data if x is not None)

    frag_tbl, bc_tbl = make_output_dataframes(bc_data_filt)
    if frag_tbl is not None:
        # Sort and index fragment table, so that we can combine the fragments
        # files per-chromosome to reduce memory consumption
        frag_tbl.sort(['chrom', 'start_pos'], inplace=True)
        tenkit.hdf5.write_data_frame(outs.fragments, frag_tbl)
        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos',
                                       'end_pos')
    if bc_tbl is not None:
        tenkit.hdf5.write_data_frame(outs.barcodes, bc_tbl)
def main(args, outs):
    """Build deletion filter tracks and ground-truth call sets.

    Exome mode (not args.wgsmode): curate a blacklist map (LowCov, SEGDUP
    and 1000G accessibility whitelists), derive filtered het / h**o
    deletion query regions with the bedtools helpers, and -- when a WGS
    deletion ground truth file is supplied -- write out the TIER=1
    HET/HOM calls that land in those query regions.

    WGS mode: null out the exome-only outputs and write every TIER=1
    ground-truth call unfiltered.
    """
    if not args.wgsmode:
        # blacklist curation
        if args.blacklist_map:
            # If we got an explicit blacklist, use it
            blacklist_map = args.blacklist_map
        else:
            # We did not get an explicit blacklist -- in this case, combine
            # the built-in segdup file and the homo_del_blacklist
            blacklist_map = {}

        # Combine the homo_del_blacklist with the internal segdup file to
        # get the filter set
        if not "LowCov" in blacklist_map:
            blacklist_map["LowCov"] = args.low_coverage_blacklist

        genome = tk_reference.get_genome(args.reference_path)

        if not "SEGDUP" in blacklist_map:
            cnv_segdups = tenkit.constants.find_sv_file(
                genome, "cnv_segdup_filter.bed")
            blacklist_map["SEGDUP"] = cnv_segdups

        if not "whitelist" in blacklist_map:
            blacklist_map["whitelist"] = {}

        if not "h**o" in blacklist_map["whitelist"]:
            pilot_accs = tenkit.constants.find_sv_file(
                genome, "20141020.pilot_mask.whole_genome.bed")
            blacklist_map["whitelist"]["h**o"] = pilot_accs

        if not "het" in blacklist_map["whitelist"]:
            strict_accs = tenkit.constants.find_sv_file(
                genome, "20141020.strict_mask.whole_genome.bed")
            blacklist_map["whitelist"]["het"] = strict_accs

        outs.blacklist_map = blacklist_map

        # generate filtered target regions for het del and h**o del callers
        ## h**o
        if "LowCov" in blacklist_map:
            bedtools.no_overlap(args.target_regions, blacklist_map["LowCov"],
                                outs.hom_del_query_region + "_tmp.bed")
        else:
            shutil.copyfile(args.target_regions,
                            outs.hom_del_query_region + "_tmp.bed")
        bedtools.overlap(outs.hom_del_query_region + "_tmp.bed",
                         blacklist_map["whitelist"]["h**o"],
                         outs.hom_del_query_region)
        #shutil.copyfile(outs.hom_del_query_region+"_tmp.bed", outs.hom_del_query_region)

        ## het
        bedtools.no_overlap(outs.hom_del_query_region + "_tmp.bed",
                            blacklist_map["SEGDUP"],
                            outs.het_del_query_region + "_tmp.bed")
        bedtools.overlap(outs.het_del_query_region + "_tmp.bed",
                         blacklist_map["whitelist"]["het"],
                         outs.het_del_query_region)

        # ##
        # with open(blacklist_map["whitelist"]["h**o"]) as f:
        #     accs_1000g_pilot = tk_io.get_target_regions(f)
        # with open(blacklist_map["whitelist"]["het"]) as f:
        #     accs_1000g_strict = tk_io.get_target_regions(f)

        if args.wgs_deletions_gt:
            ## het events
            with open(outs.het_del_query_region) as f:
                bed_target = tk_io.get_target_regions(f)
            fhet_sen = open(outs.het_gt_sen, "w")
            fhet_ppv = open(outs.het_gt_ppv, "w")
            with open(args.wgs_deletions_gt) as f:
                for line in f:
                    if line[0] == "#":
                        continue
                    # NOTE(review): columns 0/1/5 are read as chrom/start/end
                    # of the call -- confirm against the ground-truth format.
                    infos = line.strip().split()
                    chrom, start, end = infos[0], int(infos[1]), int(infos[5])
                    if chrom in bed_target:
                        overlappings = bed_target[chrom].overlapping_regions(
                            start, end)
                        # Total overlap between the call and the het query
                        # regions decides whether the call is kept.
                        overlap_size = 0
                        for s, e in overlappings:
                            overlap_size += (min(e, end) - max(s, start))
                        if overlap_size >= args.min_overlap: #and \
                            #chrom in accs_1000g_strict and\
                            #accs_1000g_strict[chrom].overlaps_region(start, end):
                            record = "\t".join(
                                (infos[0], infos[1], infos[5])) + "\n"
                            if ("HET" in line) and ("TIER=1" in line):
                                fhet_sen.write(record)
                                fhet_ppv.write(record)

            ## h**o events
            with open(outs.hom_del_query_region) as f:
                bed_target = tk_io.get_target_regions(f)
            fhom_sen = open(outs.hom_gt_sen, "w")
            fhom_ppv = open(outs.hom_gt_ppv, "w")
            with open(args.wgs_deletions_gt) as f:
                for line in f:
                    if line[0] == "#":
                        continue
                    infos = line.strip().split()
                    chrom, start, end = infos[0], int(infos[1]), int(infos[5])
                    if chrom in bed_target:
                        overlappings = bed_target[chrom].overlapping_regions(
                            start, end)
                        # A h**o call qualifies only if it fully contains at
                        # least one query region, with 1bp slack on each side.
                        has_full_exon = False
                        for s, e in overlappings:
                            if start <= s + 1 and end >= e - 1:
                                print start, end, s, e
                                has_full_exon = True
                                break
                        if has_full_exon: #and \
                            #chrom in accs_1000g_pilot and\
                            #accs_1000g_pilot[chrom].overlaps_region(start, end):
                            record = "\t".join(
                                (infos[0], infos[1], infos[5])) + "\n"
                            if ("HOM" in line) and ("TIER=1" in line):
                                fhom_sen.write(record)
                                fhom_ppv.write(record)

            fhet_sen.flush()
            fhet_sen.close()
            fhet_ppv.flush()
            fhet_ppv.close()
            fhom_sen.flush()
            fhom_sen.close()
            fhom_ppv.flush()
            fhom_ppv.close()
        else:
            # No ground truth supplied: null out the ground-truth outputs.
            outs.het_gt_sen = None
            outs.het_gt_ppv = None
            outs.hom_gt_sen = None
            outs.hom_gt_ppv = None
    else:
        # WGS mode: exome-only outputs do not apply.
        outs.hom_gt_sen = None
        outs.hom_gt_ppv = None
        outs.het_del_query_region = None
        outs.hom_del_query_region = None
        outs.blacklist_map = None
        # NOTE(review): these handles are opened before args.wgs_deletions_gt
        # is checked; when no ground truth is supplied they are left open and
        # the output paths are nulled below -- confirm this is intended.
        fhet_sen = open(outs.het_gt_sen, "w")
        fhet_ppv = open(outs.het_gt_ppv, "w")
        if args.wgs_deletions_gt:
            with open(args.wgs_deletions_gt) as f:
                for line in f:
                    if line[0] == "#":
                        continue
                    infos = line.strip().split()
                    record = "\t".join((infos[0], infos[1], infos[5])) + "\n"
                    # In WGS mode every TIER=1 call is kept, HET and HOM alike.
                    if "TIER=1" in line:
                        fhet_sen.write(record)
                        fhet_ppv.write(record)
            fhet_sen.flush()
            fhet_sen.close()
            fhet_ppv.flush()
            fhet_ppv.close()
        else:
            outs.het_gt_sen = None
            outs.het_gt_ppv = None
def do_homo_whiltelist_filtering(events, whitelist_file, whitelist_name):
    """Flag every event against a whitelist BED file.

    Each event's check_whitelist method records whether it falls inside
    the named whitelist.
    """
    with open(whitelist_file) as handle:
        whitelist = tk_io.get_target_regions(handle)
    for event in events:
        event.check_whitelist(whitelist, whitelist_name)
def read_bed_file(bed):
    """Parse a BED file into a chromosome-indexed dict of tk_regions.Regions."""
    with open(bed) as handle:
        return tk_io.get_target_regions(handle)
def main(args, outs): if args.fragments is None: outs.bc_cnv = None outs.bc_large_cnv = None return rust_env = os.environ.copy() rust_env["RUST_BACKTRACE"] = "1" final_blacklist = lr_gt.get_genomic_track(args.blacklist, "terminal_cnv", args.reference_path, "default_blacklist.bed") if final_blacklist is None: final_blacklist = args.possorted_bam + "_tmp" open(final_blacklist, 'w').close() if args.subcommand == "bc" and args.fragments: frag_csv = outs.bc_cnv + ".csv" bin_size, frag_version = convert_fragments_to_csv( args.fragments, frag_csv, args.bin_size, args.allow_bin_size_adj) cnv_args = [ 'hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam, final_blacklist, outs.bc_cnv, "--fragver", str(frag_version), "--binsize", str(bin_size), "--probchange", str(args.status_change_penalty), "--minprob", str(args.min_prob) ] elif args.subcommand == "read": cnv_args = [ 'hmm-bc-cnv', args.subcommand, args.possorted_bam, final_blacklist, outs.bc_cnv, "--binsize", str(args.bin_size), "--probchange", str(args.status_change_penalty) ] elif args.subcommand == "asread": frag_csv = outs.bc_cnv + ".csv" bin_size, frag_version = convert_fragments_to_csv( args.fragments, frag_csv, args.bin_size, args.allow_bin_size_adj) cnv_args = [ 'hmm-bc-cnv', args.subcommand, frag_csv, args.possorted_bam, final_blacklist, outs.bc_cnv, "--fragver", str(frag_version), "--binsize", str(bin_size), "--probchange", str(args.status_change_penalty), "--minprob", str(args.min_prob) ] print cnv_args subprocess.check_call(cnv_args, env=rust_env) outs.final_bin_size = bin_size chroms = [] starts1 = [] end1 = [] starts2 = [] end2 = [] info_strs = [] quals = [] primary_contigs = tk_reference.load_primary_contigs(args.reference_path) spikes = tk_io.get_target_regions(open(args.spikes)) with open(outs.bc_cnv) as fin: for line in fin.readlines(): if line.startswith('#') or line.startswith( 'browser') or line.startswith('track') or line.startswith( '-browser') or line.startswith('-track'): continue infos = 
line.strip().split("\t") cp = int(infos[3]) ch = infos[0] s = int(infos[1]) e = int(infos[2]) # Some basic filtering if primary_contigs and ch not in primary_contigs: continue if cp == 2 or (e - s) < args.minimal_cnv_size: continue if cp > 2: if ch not in spikes: continue overlaps = spikes[ch].overlapping_regions( max(0, s - bin_size), e + bin_size) ln = len(overlaps) if ln > 0 and \ overlap(s-bin_size, s+bin_size, overlaps[0][0], overlaps[0][1]) and \ overlap(e-bin_size, e+bin_size, overlaps[ln-1][0], overlaps[ln-1][1]): continue chroms.append(infos[0]) starts1.append(s) end1.append(s + 1) starts2.append(e) end2.append(e + 1) pval = float(infos[4]) #if pval > args.max_pval: # continue if pval < 1e-100: qual = 1000 else: qual = int(-log10(pval) * 10) quals.append(qual) if cp > 2: info_strs.append('TYPE=DUP;COPY=%d' % cp) elif cp < 2: info_strs.append('TYPE=DEL;COPY=%d' % cp) sv_df = tk_sv_io.create_sv_df(chroms, starts1, end1, chroms, starts2, end2, np.arange(len(chroms)), quals, info_strs=info_strs) tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.bc_large_cnv)