def filter_itxs(feature):
    n = len(feature.fields) / 2
    del_interval_idp = map(int, feature.fields[7].split("-"))
    del_interval_itx_1 = map(int, feature.fields[n + 7].split(",")[0].split("-"))
    del_interval_itx_2 = map(int, feature.fields[n + 7].split(",")[1].split("-"))
    if filter(lambda x: abs(x[0] - del_interval_idp[0]) +
              abs(x[1] - del_interval_idp[1]) == 0,
              [del_interval_itx_1, del_interval_itx_2]) and \
            "LowQual" not in feature.fields[n + 4]:
        return None
    return pybedtools.Interval(feature.chrom, feature.start, feature.end,
                               name=feature.name, score=feature.score,
                               otherfields=feature.fields[6:n])
def chop(chr, start, end, wsize=5):
    """ writes some sort of phastcons thing..., not quite sure

    For circos plotting, not used
    add back if we want more output later, ignore for now
    """
    file = open("phastcons.txt", 'w')
    i = start
    while i < end:
        x = pybedtools.Interval(chr, i, (i + wsize))
        p = get_mean_phastcons(x, species="hg19")
        file.write("\t".join(map(str, [chr, i, (i + wsize - 1), p])) + "\n")
        i += wsize
    file.close()
def __getitem__(self, key):
    iterator = self.fileobj.fetch(str(key.chrom), key.start, key.stop)
    for r in iterator:
        start = r.pos
        curr_end = r.pos
        for op, bp in r.cigar:
            start = curr_end
            curr_end += bp
            if op == 0:
                interval = pybedtools.Interval(
                    self.fileobj.references[r.rname],
                    start,
                    curr_end,
                    strand=strand_lookup[r.flag & 0x0010])
                interval.file_type = 'bed'
                yield interval
def add_weighted_score(in_bed, score_bed):
    out_bed = in_bed.intersect(score_bed, wao=True).saveas(
        os.path.join(args.tmpdir, "score.bed"))
    bed_array = []
    last_interval = pybedtools.Interval("", 0, 0)
    map_value = 0.0
    for interval in out_bed:
        if interval.chrom != last_interval.chrom or \
                interval.start != last_interval.start or \
                interval.end != last_interval.end:
            if last_interval.chrom:
                bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))
            map_value = 0.0
        last_interval = interval
        if float(interval.fields[-1]) > 0:
            map_value += float(interval.fields[-1]) * \
                float(interval.fields[-2]) / float(interval.length)
    if last_interval.chrom:
        bed_array.append(tuple(last_interval.fields[:-5]) + (str(map_value),))
    return pybedtools.BedTool(bed_array)
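# A worked toy example (not from the original code) of the weighted-score
# accumulation above. With `bedtools intersect -wao`, each output row ends with
# the number of overlapping bases, and, assuming score_bed keeps its score in
# its last column, that score sits just before it; the weighted value for an
# interval of length L is then sum(score * overlap_bp / L) over its rows.
interval_length = 100.0
rows = [
    {"score": 0.8, "overlap_bp": 60},   # hypothetical overlaps for one interval
    {"score": 0.2, "overlap_bp": 40},
]
map_value = sum(r["score"] * r["overlap_bp"] / interval_length for r in rows)
print(map_value)  # 0.56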
def find_idp(feature, wiggle):
    n = len(feature.fields) / 2
    if feature.chrom != feature.fields[n]:
        return None
    start_dup = feature.start
    end_dup = feature.end
    start_del = int(feature.fields[n + 1])
    end_del = int(feature.fields[n + 2])
    if abs(start_del - end_del) > (abs(start_dup - end_dup) - wiggle):
        return None
    dist_ends = [abs(start_del - start_dup), abs(end_del - end_dup)]
    if min(dist_ends) > wiggle:
        return None
    del_pos = start_del if dist_ends[0] > dist_ends[1] else end_del
    name = "%s,%s" % (feature.name, feature.fields[n + 3])
    score = "%s,%s" % (feature.score, feature.fields[n + 4])
    return pybedtools.Interval(feature.chrom, feature.start, feature.end,
                               name=name, score=score,
                               otherfields=["%d" % del_pos,
                                            "%d-%d" % (start_del, end_del)])
def extract_candidate_split_regions(
        work, filtered_candidates_vcfs, split_regions, ensemble_beds,
        reference, matrix_base_pad, merge_d_for_short_read):
    logger = logging.getLogger(extract_candidate_split_regions.__name__)

    candidates_split_regions = []
    for i, (filtered_vcf, split_region_) in enumerate(
            zip(filtered_candidates_vcfs, split_regions)):
        candidates_region_file = os.path.join(
            work, "candidates_region_{}.bed".format(i))
        candidates_bed = pybedtools.BedTool(filtered_vcf).each(
            lambda x: pybedtools.Interval(x[0], int(x[1]),
                                          int(x[1]) + len(x[3]))).sort().slop(
            g=reference + ".fai",
            b=matrix_base_pad + 3).merge(d=merge_d_for_short_read)
        if ensemble_beds:
            candidates_bed = candidates_bed.cat(
                ensemble_beds[i], postmerge=False).sort().merge(
                d=merge_d_for_short_read)
        candidates_bed.intersect(split_region_).sort().saveas(
            candidates_region_file)
        candidates_split_regions.append(candidates_region_file)
    return candidates_split_regions
def check_duplicates(interval1, interval2, max_dist=10):
    if interval1.chrom != interval2.chrom or \
            abs(interval1.start - interval2.start) > max_dist or \
            abs(interval1.end - interval2.end) > max_dist or \
            interval1.fields[3].split(",")[1] != interval2.fields[3].split(",")[1]:
        return None
    info1 = json.loads(base64.b64decode(interval1.fields[3].split(",")[0]))
    info2 = json.loads(base64.b64decode(interval2.fields[3].split(",")[0]))
    svmethods = sorted(list(set(info1["SVMETHOD"] + info2["SVMETHOD"])))
    sources = []
    if "SOURCES" in info1:
        sources.append(info1["SOURCES"])
    if "SOURCES" in info2:
        sources.append(info2["SOURCES"])
    sources = ",".join(sources)
    if sources:
        info1["SOURCES"] = sources
    if "PASS" in [interval1.fields[7], interval2.fields[7]] or \
            ("AS" not in svmethods and len(set(svmethods) - {"SC", "AS"}) > 1):
        sv_filter = "PASS"
    else:
        sv_filter = "LowQual"
    end = max(interval1.end, interval2.end)
    start = min(interval1.start, interval2.start)
    info1.update({
        "END": end,
        "SVMETHOD": svmethods,
        "NUM_SVMETHODS": len(svmethods)
    })
    return pybedtools.Interval(
        interval1.chrom, start, end,
        name="%s,%s,%d,%s" % (base64.b64encode(json.dumps(info1)),
                              info1["SVTYPE"], end - start,
                              ";".join(svmethods)),
        score=interval1.score,
        otherfields=[interval1.fields[6], sv_filter])
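# A minimal sketch (not part of the original code) of the name encoding used
# above: interval names pack a base64-encoded JSON info dict together with the
# SV type, length, and calling methods, separated by commas. The example dict
# is hypothetical; the encode()/decode() calls are Python 3 syntax, whereas the
# original snippets pass Python 2 str objects directly.
import base64
import json

info = {"SVTYPE": "DEL", "SVMETHOD": ["BD", "CNVnator"], "END": 12500}
name = "%s,%s,%d,%s" % (base64.b64encode(json.dumps(info).encode()).decode(),
                        info["SVTYPE"], 500, ";".join(info["SVMETHOD"]))

# Decoding reverses the first comma-separated field back into the dict.
decoded = json.loads(base64.b64decode(name.split(",")[0]))
assert decoded["SVTYPE"] == "DEL"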
def _regions_for_coverage(data, region, out_file):
    """Retrieve BedTool iterator over regions we need to calculate coverage in.
    """
    variant_regions = utils.get_in(data, ("config", "algorithm", "variant_regions"))
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    if not ready_region:
        return get_ref_bedtool(data["sam_ref"], data["config"])
    elif os.path.isfile(ready_region):
        return pybedtools.BedTool(ready_region).intervals
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        return [pybedtools.Interval(c, s, e)]
    else:
        assert isinstance(ready_region, basestring)
        out = []
        for r in [x for x in get_ref_bedtool(data["sam_ref"], data["config"])
                  if x.chrom == ready_region]:
            # If we have variant regions but none in this region, don't calculate coverage
            r.attrs["no_coverage"] = variant_regions is not None
            out.append(r)
        return out
def add_pairwise_bedtool_track(pairs_bedpe_bt, track, action, binsize):
    """
    Extract feature values (as list) for all pairs vs. a BedTool (as BEDPE)
    """
    # Multiple actions all start with computing bedtools pairtopair -a both
    if action in 'count-pairs pairwise-coverage any-pairwise-overlap'.split():
        track_hits = pairs_bedpe_bt.pairtopair(b=track, type='both')
        if action in 'count-pairs any-pairwise-overlap'.split():
            counts_only = True
        else:
            counts_only = False
        hits_per_bin = _split_pairtopair_by_binpairs(track_hits, pairs_bedpe_bt,
                                                     counts_only=counts_only)

    if action == 'count-pairs':
        values = list(hits_per_bin.values())
    elif action == 'any-pairwise-overlap':
        values = [min([1, k]) for k in hits_per_bin.values()]
    elif action == 'pairwise-coverage':
        values = []
        for pair, hits in hits_per_bin.items():
            if len(hits) > 0:
                fields = pair.split('_')
                pair_interval = pbt.Interval(fields[0], int(fields[1]), int(fields[2]))
                pairbt = _split_pairs(pair_interval, binsize)
                covdf_names = 'chr start end items bp total frac'.split()
                covdf = pairbt.coverage(hits).to_dataframe(names=covdf_names)
                values.append(covdf.bp.sum() / covdf.total.sum())
            else:
                values.append(0)
    else:
        from sys import exit
        exit('INPUT ERROR: --action {0} not recognized.'.format(action))

    return values
def plot_multiple_regions_coverage(samples, out_file, region_bed=None, stem_bed=None):
    """
    given a list of bcbio samples and a bed file or BedTool of regions,
    makes a plot of the coverage in the regions for the set of samples

    if given a bed file or BedTool of locations in stem_bed with a label,
    plots lollipops at those locations
    """
    mpl.use('Agg', force=True)
    PAD = 100
    if file_exists(out_file):
        return out_file
    in_bams = [dd.get_align_bam(x) for x in samples]
    samplenames = [dd.get_sample_name(x) for x in samples]
    if isinstance(region_bed, six.string_types):
        region_bed = pybedtools.BedTool(region_bed)
    if isinstance(stem_bed, six.string_types):
        stem_bed = pybedtools.BedTool(stem_bed)
    if stem_bed is not None:  # tabix indexed bedtools eval to false
        stem_bed = stem_bed.tabix()
    plt.clf()
    plt.cla()
    with file_transaction(out_file) as tx_out_file:
        with backend_pdf.PdfPages(tx_out_file) as pdf_out:
            sns.despine()
            for line in region_bed:
                for chrom, start, end in _split_regions(line.chrom,
                                                        max(line.start - PAD, 0),
                                                        line.end + PAD):
                    df = _combine_regional_coverage(in_bams, samplenames, chrom,
                                                    start, end,
                                                    os.path.dirname(tx_out_file))
                    plot = sns.tsplot(df, time="position", unit="chrom",
                                      value="coverage", condition="sample")
                    if stem_bed is not None:  # tabix indexed bedtools eval to false
                        interval = pybedtools.Interval(chrom, start, end)
                        _add_stems_to_plot(interval, stem_bed, samples, plot)
                    plt.title("{chrom}:{start}-{end}".format(**locals()))
                    pdf_out.savefig(plot.get_figure())
                    plt.close()
    return out_file
def build_chr2_ins(feature, thr_top=0.15):
    sc_chr2_str = feature.fields[6]
    if sc_chr2_str == ".":
        return []
    sub_str = map(lambda x: [x.split(";")[0], map(int, x.split(";")[1:])],
                  sc_chr2_str.split(","))
    chr2_dict = {}
    for chr2, poses in sub_str:
        if chr2 not in chr2_dict:
            chr2_dict[chr2] = []
        chr2_dict[chr2].append(poses)
    chr2_dict = {k: [sum(map(lambda x: x[0], v)),
                     min(map(lambda x: x[1], v)),
                     max(map(lambda x: x[2], v))]
                 for k, v in chr2_dict.iteritems()}
    sorted_chr2 = sorted(chr2_dict.items(), key=lambda x: x[1][0], reverse=True)
    n_reads = sum(map(lambda x: x[1][0], sorted_chr2))
    top_chr2s = filter(lambda x: x[1][0] > (thr_top * n_reads) and
                       x[0] not in ["-1", feature.chrom], sorted_chr2)
    if not top_chr2s:
        return []
    ctx_intervals = []
    for chr2, [cnt, start, end] in top_chr2s:
        ctx_intervals.append(pybedtools.Interval(chr2, start, end,
                                                 name=feature.name,
                                                 score=feature.score))
    return ctx_intervals
def test_indexing():
    """
    Indexing into BedTools
    """
    a = pybedtools.example_bedtool('a.bed')

    # This is the first line
    interval = pybedtools.Interval('chr1', 1, 100, 'feature1', '0', '+')

    # just to make sure
    assert interval == iter(a).next()

    # test slice behavior
    results = list(a[0:2])
    assert len(results) == 2
    assert results[0] == interval

    # test single-integer indexing
    assert a[0] == interval

    # only slices and integers allowed....
    assert_raises(ValueError, a.__getitem__, 'key')
def find_itx(feature, wiggle):
    n = len(feature.fields) / 2
    start_idp1 = feature.start
    end_idp1 = feature.end
    start_idp2 = int(feature.fields[n + 1])
    end_idp2 = int(feature.fields[n + 2])
    dist_ends = [abs(start_idp1 - start_idp2), abs(end_idp1 - end_idp2)]
    if min(dist_ends) > wiggle:
        return None
    del_pos1 = int(feature.fields[6])
    del_pos2 = int(feature.fields[n + 6])
    if abs(del_pos1 - del_pos2) > wiggle:
        return None
    del_interval1 = map(int, feature.fields[7].split("-"))
    del_interval2 = map(int, feature.fields[n + 7].split("-"))
    lr_1 = 1 if abs(del_pos1 - del_interval1[0]) < abs(del_pos1 - del_interval1[1]) else 0
    lr_2 = 1 if abs(del_pos2 - del_interval2[0]) < abs(del_pos2 - del_interval2[1]) else 0
    if lr_1 == lr_2 or lr_2 < lr_1:
        return None
    del_id_2 = feature.name.split(",")[-1]
    del_filter_2 = feature.score.split(",")[-1]
    name = "%s,%s" % (feature.name, del_id_2)
    score = "%s,%s" % (feature.score, del_filter_2)
    return pybedtools.Interval(feature.chrom, feature.start, feature.end,
                               name=name, score=score,
                               otherfields=["%d" % ((del_pos1 + del_pos2) / 2),
                                            "%d-%d,%d-%d" % (del_interval1[0],
                                                             del_interval1[1],
                                                             del_interval2[0],
                                                             del_interval2[1])])
def get_wg_coverage(self):
    """Generator that takes as input a sorted bam and a merged bam of the circles
    in the whole genome and returns a numpy array for every interval with the coverage"""
    reference_contigs = self.bam.header['SQ']

    header_dict = {}
    for reference in reference_contigs:
        header_dict[reference['SN']] = reference['LN']

    merged_bed = self.bed.sort().merge()

    for interval in merged_bed:
        coverage_dict = {}
        if interval.start - self.ext < 0:
            start = 0
        else:
            start = interval.start - self.ext
        if header_dict[interval.chrom] < (interval.end + self.ext):
            end = interval.end + self.ext
        else:
            end = interval.end
        cov = self.bam.count_coverage(contig=interval.chrom, start=start, end=end,
                                      quality_threshold=self.mapq)
        summarized_cov = np.array([cov[0], cov[1], cov[2], cov[3]]).sum(axis=0)
        # save memory, convert to uint32.
        summ_cov = np.uint32(summarized_cov)
        print("Computing coverage on interval %s:%s-%s" %
              (interval.chrom, interval.start, interval.end))
        coverage_dict[bt.Interval(interval.chrom, start, end)] = summ_cov
        yield (coverage_dict, header_dict)
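# A minimal standalone sketch (file name and region are hypothetical) of the
# per-base coverage computation used in the generator above: pysam's
# count_coverage() returns four arrays (one per nucleotide, A/C/G/T), which are
# stacked and summed to get total depth per position.
import numpy as np
import pysam

bam = pysam.AlignmentFile("sample.sorted.bam", "rb")   # hypothetical path
cov = bam.count_coverage(contig="chr1", start=10000, end=10100,
                         quality_threshold=10)
depth = np.array(cov).sum(axis=0).astype(np.uint32)    # one value per base
print(depth.mean(), depth.max())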
def merged_interval_features(feature, bam_handle):
    support_list = feature.name.split(",")
    locations = sorted(map(int, support_list[0::2]))
    num_unique_locations = len(set(locations))
    count_str = ",".join(["%s,%s" % (i, c) for (i, c) in
                          collections.Counter(locations).items()])
    plus_support = len([i for i in support_list[1::2] if i == "+"])
    minus_support = len(locations) - plus_support
    locations_span = max(locations) - min(locations)
    name = "%s,INS,0,SC,%d,%d,%d,%d,%s" % (base64.b64encode(json.dumps(dict())),
                                           plus_support, minus_support,
                                           locations_span, num_unique_locations,
                                           count_str)
    interval_readcount = bam_handle.count(reference=feature.chrom,
                                          start=feature.start, end=feature.end)
    return pybedtools.Interval(feature.chrom, feature.start, feature.end,
                               name=name, score=feature.score,
                               otherfields=[str(interval_readcount)])
def parallel_array_test(self):
    features = [(0, 20), (61, 81), (200, 220)]
    features = [pybedtools.Interval('chr2L', *i) for i in features]
    arr0 = self.m.array(features, bins=20, fragment_size=5)
    assert np.all(arr0 == np.array(
        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]))

    arr1 = self.m.array(features, bins=20, fragment_size=5, chunksize=1,
                        processes=PROCESSES)
    assert np.all(arr0 == arr1)

    # mix up the chunksize and processes
    arr2 = self.m.array(features, bins=20, fragment_size=5, chunksize=3,
                        processes=PROCESSES)
    assert np.all(arr0 == arr2)

    # use more features and test for identity again
    features *= 1000
    print len(features)
    arr0 = self.m.array(features, bins=20, fragment_size=5)
    arr1 = self.m.array(features, bins=20, fragment_size=5, chunksize=5,
                        processes=PROCESSES)
    print arr0.shape
    print arr1.shape
    print(arr0 != arr1)
    assert np.all(arr0 == arr1)
def coords_to_bedtool(self, single_exon_coords):
    """Convert exon coordinates to bedtool intervals

    Assumes that the coordinates are in the exact same order as the
    original miso ids.

    Parameters
    ----------
    single_exon_coords : list
        List of (chrom, start, stop, strand) tuples of a single exon's
        coordinates

    Returns
    -------
    bedtool : pybedtools.BedTool
        A bedtool object of the exon intervals
    """
    if len(single_exon_coords) != len(self.miso_ids):
        raise ValueError("Number of coordinates must equal the number of "
                         "original miso ids")
    intervals = []
    for miso_id, exon in zip(self.miso_ids, single_exon_coords):
        chrom, start, stop, strand = exon
        # Base-0-ify
        start = int(start) - 1
        stop = int(stop)
        intervals.append(
            pybedtools.Interval(chrom, start, stop, strand=strand,
                                name=miso_id, score='1000'))
    return pybedtools.BedTool(intervals)
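# A small self-contained sketch (hypothetical coordinates and IDs) of the
# 1-based to 0-based conversion performed above before building a BedTool:
# BED intervals are half-open and zero-based, so the start is shifted by one.
import pybedtools

coords = [("chr1", 100, 200, "+"), ("chr2", 500, 650, "-")]  # 1-based, inclusive
ids = ["exon:chr1:100-200:+", "exon:chr2:500-650:-"]

intervals = [
    pybedtools.Interval(chrom, int(start) - 1, int(stop),
                        name=name, score="1000", strand=strand)
    for (chrom, start, stop, strand), name in zip(coords, ids)
]
bedtool = pybedtools.BedTool(intervals)
for iv in bedtool:
    print(iv)  # BED line: chrom, 0-based start, end, name, score, strand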
def parse(BAM, BED):
    '''
    Parse BAM and get overall and queries-specific statistics
    '''
    S_dict = dict()
    S_dict['BAM_PRIM'] = 0  # reads
    S_dict['BAM_SUPP'] = 0  # reads
    S_dict['BAM_SEC'] = 0  # reads
    S_dict['BAM_UNMAP'] = 0  # reads
    S_dict['BAM_ONTARGET'] = 0  # reads
    S_dict['BAM_OFFTARGET'] = 0  # reads
    S_dict['BAM_CSOFT_CLIP'] = 0  # bps
    S_dict['BAM_CMATCH'] = 0  # bps
    S_dict['BAM_CINS'] = 0  # bps
    S_dict['BAM_CDEL'] = 0  # bps
    S_dict['BAM_CDIFF'] = 0  # bps
    S_dict['BAM_LEN'] = []  # all lengths in list
    S_dict['BAM_QUAL'] = []  # all qualities in list
    S_dict['BAM_PID'] = []  # all PID in list

    bamfile = pysam.AlignmentFile(BAM, 'rb')

    try:
        bedfile = pybedtools.BedTool(BED)
        bedsrtd = bedfile.sort()
    except:
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + ']' + '[Error] Invalid BED file format')
        sys.exit(1)

    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + ']' + '[Message] Parsing BAM file')

    ivf = bedsrtd.as_intervalfile()

    for read in bamfile.fetch():
        if read.has_tag('MD') and read.has_tag('NM'):
            if not read.is_unmapped:
                if not read.is_supplementary:
                    if not read.is_secondary:
                        S_dict['BAM_PRIM'] += 1
                        query = pybedtools.Interval(read.reference_name,
                                                    read.reference_start,
                                                    read.reference_end)
                        if ivf.any_hits(query) >= 1:
                            # parse CIGAR only in targeted regions
                            S_dict['BAM_ONTARGET'] += 1
                            Cdict = tuptodict(read.cigartuples)
                            MD = read.get_aligned_pairs(with_seq=True)
                            S_dict['BAM_CSOFT_CLIP'] += Cdict[4]
                            S_dict['BAM_CINS'] += Cdict[1]
                            S_dict['BAM_CDEL'] += Cdict[2]
                            S_dict['BAM_CMATCH'] += sum(1 for x, y, z in MD
                                                        if x is not None and z is not None
                                                        and z[0].isupper())
                            S_dict['BAM_CDIFF'] += sum(1 for x, y, z in MD
                                                       if x is not None and z is not None
                                                       and z[0].islower())
                            S_dict['BAM_QUAL'].append(np.mean(read.query_qualities))
                            NM = read.get_tag('NM')
                            refcoords = read.get_reference_positions()
                            # reference spans from first aligned to last aligned
                            reflen = refcoords[-1] - refcoords[0]
                            # this is the read length
                            seqlen = len(read.get_reference_positions(full_length=True))
                            S_dict['BAM_LEN'].append(seqlen)
                            PID = 100 - 100 * NM / max(reflen, seqlen)
                            S_dict['BAM_PID'].append(PID)
                        else:
                            S_dict['BAM_OFFTARGET'] += 1
                    else:
                        S_dict['BAM_SEC'] += 1
                else:
                    S_dict['BAM_SUPP'] += 1
            else:
                S_dict['BAM_UNMAP'] += 1
        else:
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[Error] BAM misses the required MD/NM tags')
            bamfile.close()
            sys.exit(1)

    # calculate coverage in regions from BED
    for query in bedsrtd:
        key = query.chrom + ':' + str(query.start) + '-' + str(query.end)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + ']' + '[Message] Calculating coverage in region ' + key)
        query_arr = bamfile.count_coverage(query.chrom, query.start, query.end,
                                           quality_threshold=0,
                                           read_callback=check_read)
        perbasecov = np.sum(query_arr, axis=0).tolist()
        S_dict[key] = perbasecov

    bamfile.close()

    return S_dict
def split_region(work, region_bed_file, num_splits, max_region=1000000,
                 min_region=20, shuffle_intervals=False):
    logger.info("-----------------------------------------------------------")
    logger.info("Split region")
    logger.info("-----------------------------------------------------------")
    regions_bed = pybedtools.BedTool(region_bed_file).sort().merge(d=0)
    intervals = []
    for region in regions_bed:
        chrom, start, end = region.chrom, region.start, region.end
        if region.length + 1 > max_region:
            for i in range(start, end + 1, max_region):
                intervals.append(
                    pybedtools.Interval(chrom, i, min(end, i + max_region - 1)))
        else:
            intervals.append(region)
    if shuffle_intervals:
        shuffle(intervals)
    regions_bed = pybedtools.BedTool(intervals)
    total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, regions_bed))
    logger.info("Total length: {}".format(total_len))
    split_len = total_len / num_splits
    split_regions = []
    current_regions = []
    sofar_len = 0
    current_len = 0
    split_lens = []
    for region in regions_bed:
        chrom, start, end = region[0:3]
        start, end = int(start), int(end)
        s = start
        e = -1
        while current_len < split_len:
            s = max(s, e + 1)
            e = min(s + split_len - current_len - 1, end)
            if (e - s + 1) < 2 * min_region:
                e = min(s + 2 * min_region - 1, end)
            if (end - e) < 2 * min_region:
                e = end
            current_regions.append(pybedtools.Interval(chrom, s, e))
            current_len += e - s + 1
            if current_len >= split_len:
                sofar_len += current_len
                split_lens.append(current_len)
                current_len = 0
                split_regions.append(current_regions)
                current_regions = []
                if split_len < (total_len - sofar_len) < 1.5 * split_len:
                    split_len = total_len - sofar_len
            if e >= end:
                break
    if current_regions:
        split_lens.append(current_len)
        split_regions.append(current_regions)
    split_region_files = []
    for i, split_region_ in enumerate(split_regions):
        split_region_file = os.path.join(work, "region_{}.bed".format(i))
        pybedtools.BedTool(split_region_).saveas(split_region_file)
        logger.info("Split {}: {}".format(i, split_lens[i]))
        split_region_files.append(split_region_file)
    sum_len = sum(split_lens)
    logger.info("Total splitted length: {}".format(sum_len))
    return split_region_files
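# A standalone sketch (hypothetical region and sizes) of the first step above:
# an oversized merged region is chopped into windows of at most max_region bp
# before the windows are distributed across the requested number of splits.
import pybedtools

chrom, start, end = "chr1", 0, 2500000   # hypothetical region
max_region = 1000000

windows = [pybedtools.Interval(chrom, i, min(end, i + max_region - 1))
           for i in range(start, end + 1, max_region)]
for w in windows:
    print(w.chrom, w.start, w.end)
# chr1 0 999999 / chr1 1000000 1999999 / chr1 2000000 2500000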
giremi_to_vcf(editor_pred, vcf_file)
editor_bed = vcf_to_bed(vcf_file, all_otherfields=True)
cmd = "java -jar %s vcfcompare -true_vcf %s -prefix %s.NISTHCnonDB %s" % (
    varsim_jar, NIST_HC_nonDB, pred_file, vcf_file)
if not os.path.exists("%s.NISTHCnonDB_TP.vcf" % (pred_file)):
    a = os.system(cmd)
    print cmd
    if a != 0:
        print a

# In[17]:

pred_edited = {}
edit_bed = pybedtools.BedTool([
    pybedtools.Interval(x[1], int(x[2]) - 1, int(x[2]), x[17],
                        find_er(x[17][0], x[17][1], x[3], x[18:22]))
    for x in editor_pred if int(x[22]) > 0
])
for region, region_bed in [["Alu", Alu_regions],
                           ["nonAlu-reps", nonAlu_rep_regions],
                           ["nonreps", ""],
                           ["all", ""]]:
    if region in ["Alu", "nonAlu-reps"]:
        my_edit_bed = edit_bed.window(region_bed, w=0, u=True)
    elif region == "nonreps":
        my_edit_bed = edit_bed.window(Alu_regions, w=0, v=True)
        my_edit_bed = my_edit_bed.window(nonAlu_rep_regions, w=0, v=True)
    elif region == "all":
        my_edit_bed = edit_bed.sort()
    edit_types = [x[3] for x in my_edit_bed]
    edit_ratios = [x[4] for x in my_edit_bed]
def get_insertion_breakpoints(age_records, intervals, window=20, start=0):
    func_logger = logging.getLogger("%s-%s" % (get_insertion_breakpoints.__name__,
                                               multiprocessing.current_process()))
    bedtools_intervals = [pybedtools.Interval("1", interval[0], interval[1])
                          for interval in sorted(intervals)]
    func_logger.info("bedtools_intervals %s" % (str(bedtools_intervals)))
    if not bedtools_intervals:
        return []

    potential_breakpoints = sorted(
        list(set([interval.start for interval in bedtools_intervals] +
                 [interval.end for interval in bedtools_intervals])))

    breakpoints = []
    for breakpoint in potential_breakpoints[1:-1]:
        # Check if the breakpoint is within window distance of a validated breakpoint
        if min([window + 1] + [abs(b[0] - breakpoint) for b in breakpoints]) <= window:
            continue
        func_logger.info("\tExamining potential breakpoint %d for support" % breakpoint)
        left_support = [interval[0] for interval in intervals
                        if abs(interval[0] - breakpoint) <= window]
        right_support = [interval[1] for interval in intervals
                         if abs(interval[1] - breakpoint) <= window]
        counter_examples = [age_record for age_record in age_records
                            if age_record.has_long_ref_flanks() and
                            (age_record.has_ref_deletion(window) or
                             age_record.has_insertion(min_diff=1, max_diff=49)) and
                            age_record.breakpoint_match(breakpoint, window)]
        if counter_examples:
            counter_example_ends = [age_record.start1_end1s
                                    for age_record in counter_examples]
            func_logger.info("\t\tSkipping breakpoint %d due to %s" %
                             (breakpoint, str(counter_example_ends)))
            continue
        if left_support:
            func_logger.info("\t\tLeft support %s" % (str(left_support)))
        if right_support:
            func_logger.info("\t\tRight support %s" % (str(right_support)))
        if (left_support and right_support) and \
                min([window + 1] + [abs(b[0] - breakpoint) for b in breakpoints]) > window:
            both_support = [age_record for age_record in age_records
                            if age_record.has_insertion(min_diff=50, max_diff=1000000000) and
                            age_record.breakpoint_match(breakpoint, window)]
            if both_support:
                func_logger.info("\t\tboth_support = %s" % (str(both_support)))
                func_logger.info("\t\tinsertion lengths = %s" %
                                 (str([age_record.insertion_length()
                                       for age_record in both_support])))
            insertion_length = max([0] + [age_record.insertion_length()
                                          for age_record in both_support])
            func_logger.info("\t\tInsertion length = %d" % insertion_length)
            breakpoints.append((breakpoint, insertion_length))
    func_logger.info("Gathered breakpoints as %s" % (str(breakpoints)))
    return [(start + b[0], b[1]) for b in breakpoints]
def segments_as_bedtool_intervals(self, segments, name='.'):
    yield from (pybedtools.Interval(s.contig, s.start, s.stop,
                                    strand=s.strand, name=name)
                for s in segments)
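# A brief usage sketch with a hypothetical Segment namedtuple standing in for
# whatever segment objects the caller provides; only the contig/start/stop/strand
# attributes used above are assumed.
from collections import namedtuple
import pybedtools

Segment = namedtuple("Segment", "contig start stop strand")
segments = [Segment("chr1", 100, 200, "+"), Segment("chr1", 400, 450, "-")]

intervals = [pybedtools.Interval(s.contig, s.start, s.stop,
                                 strand=s.strand, name="peak")
             for s in segments]
bedtool = pybedtools.BedTool(intervals)
for iv in bedtool:
    print(iv)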
def CR(stranded_bam, ivf):
    '''
    Extract Crick reads from strand-seq BAM. Switch Watson and Crick if hit in ivf
    '''
    CR = defaultdict(lambda: [None, None])

    for read in stranded_bam.fetch(until_eof=True):
        if read.is_proper_pair and not read.is_secondary and not read.is_supplementary:
            if ivf is None:  # no need to check for intervals match
                if (read.is_read1 and read.is_reverse) or \
                        (read.is_read2 and not read.is_reverse):  # read2 forward and read1 reverse
                    read.set_tag('OS', 'C', 'Z')  # used for debugging
                    if read.query_name not in CR:
                        if read.is_read1:
                            CR[read.query_name][0] = read
                        else:
                            CR[read.query_name][1] = read
                    else:
                        if read.is_read1:
                            yield read, CR[read.query_name][1]
                        else:
                            yield CR[read.query_name][0], read
                        del CR[read.query_name]
            else:  # there is a region to perform W-C switch in
                query = pybedtools.Interval(read.reference_name,
                                            read.reference_start,
                                            read.reference_end)
                if ivf.any_hits(query) >= 1:  # yield Watson as Crick
                    if (read.is_read1 and not read.is_reverse) or \
                            (read.is_read2 and read.is_reverse):
                        read.set_tag('OS', 'W', 'Z')  # used for debugging
                        if read.query_name not in CR:
                            if read.is_read1:
                                CR[read.query_name][0] = read
                            else:
                                CR[read.query_name][1] = read
                        else:
                            if read.is_read1:
                                yield read, CR[read.query_name][1]
                            else:
                                yield CR[read.query_name][0], read
                            del CR[read.query_name]
                else:  # Classic Crick Read
                    if (read.is_read1 and read.is_reverse) or \
                            (read.is_read2 and not read.is_reverse):  # read2 forward and read1 reverse
                        read.set_tag('OS', 'C', 'Z')  # used for debugging
                        if read.query_name not in CR:
                            if read.is_read1:
                                CR[read.query_name][0] = read
                            else:
                                CR[read.query_name][1] = read
                        else:
                            if read.is_read1:
                                yield read, CR[read.query_name][1]
                            else:
                                yield CR[read.query_name][0], read
                            del CR[read.query_name]
def select_read_pair_one_overlap_TE_annot(self, TE_annot, int_size, min_mapq, db, bin_size=50000000): """ output bam file of read pairs where exactly one read overlaps with an annotation in supplied gff file\ also returns a BedTools object """ # print "selecting discordant reads that overlap with a TE in annotation " + TE_annot + " ..." #use pysam to open the bam file because it has better object definition for the reads valid_discordant_bam = pysam.Samfile(self.bam_file_name, "rb") #file to save the discordant read pairs where exactly one read overlaps a TE annotation #overlap_TE_bam_file = pysam.Samfile(self.prefix + ".one_read_overlap_TE.bam", mode="wb", referencenames=valid_discordant_bam.references, referencelengths=valid_discordant_bam.lengths) #use pybedtools to look up the overlap of the Interval defined by the read with the Intervals defined by the gff file TE_annot_intervals = pybedtools.IntervalFile(TE_annot) #make a list of AlignedReadPair objects for each read pair in the list that has exactly one read overlapping a TE read_pairs_xor_overlap_TE = [] print '######' print db print '######' read_pair_database = y_serial.Main(db) bin_list = list() try: read1 = valid_discordant_bam.next() read2 = valid_discordant_bam.next() except StopIteration: print "ERROR: no reads are found in %s, exiting" % (bam_file_name) sys.exit(2) while 1: #if verbose: # print read1 # print read2 #check that the reads are truly a pair: #if not, scoot down one in the iteration if read1.qname != read2.qname: print "unmatched pair in valid discordant reads. Problem!!" #sys.exit(2) read1 = read2 try: read2 = valid_discordant_bam.next() except StopIteration: break continue read_pair = AlignedReadPair(read1, read2) #see if read1 is a TE read1_all_mappings = get_all_mapping_pos(read1, valid_discordant_bam) for (chr, start, end) in read1_all_mappings: map_interval = pybedtools.Interval(chr, start, end, strand='+') overlapping_TE_annots = TE_annot_intervals.all_hits( map_interval) if len(overlapping_TE_annots) > 0: read_pair.TE_annot_attr_list.extend([ gff_interval.attrs for gff_interval in overlapping_TE_annots ]) read_pair.TE_map_gff_list.extend([ str(gff_interval) for gff_interval in overlapping_TE_annots ]) read_pair.read1_is_TE = True #print "read1 TE" #if read1 is TE, then read2 is the anchor, so set the interval chr to the chr of read2 read_pair.interval_chr = valid_discordant_bam.getrname( read2.rname) #see if read2 is a TE read2_all_mappings = get_all_mapping_pos(read2, valid_discordant_bam) for (chr, start, end) in read2_all_mappings: map_interval = pybedtools.Interval(chr, start, end, strand='+') overlapping_TE_annots = TE_annot_intervals.all_hits( map_interval) if len(overlapping_TE_annots) > 0: read_pair.TE_annot_attr_list.extend([ gff_interval.attrs for gff_interval in overlapping_TE_annots ]) read_pair.TE_map_gff_list.extend([ str(gff_interval) for gff_interval in overlapping_TE_annots ]) read_pair.read2_is_TE = True #print "read2 TE" #if read2 is TE, then read1 is the anchor, so set the interval chr to the chr of read1 read_pair.interval_chr = valid_discordant_bam.getrname( read1.rname) #only add the AlignedRead to the list if exactly one read maps to a TE location, and the anchor is not repetitive if read_pair.read1_is_TE and not read_pair.read2_is_TE and not is_mapped_mult_times( read2): if min_mapq: if read2.mapq >= min_mapq: read_pair.calculate_outside_interval( int_size, read1, read2) read_pair.calc_anchor_is_softclipped(read1, read2) ###### TODO: this is where the Aligned ReadPair objects are 
deleted #read_pair.read1 = None #read_pair.read2 = None #print read_pair.read1 ############# #read_pairs_xor_overlap_TE.append(read_pair) tabname = "[c%s_%d_%d_%s]" % ( read_pair.interval_chr, bin_size * (int(read_pair.interval_start) / bin_size), bin_size * (1 + int(read_pair.interval_start) / bin_size), read_pair.interval_direction) #read_pair_database.insert(read_pair,'',tabname) read_pairs_xor_overlap_TE.append([read_pair, tabname]) ##overlap_TE_bam_file.write(read_pair.read1) ##overlap_TE_bam_file.write(read_pair.read2) else: read_pair.calculate_outside_interval( int_size, read1, read2) read_pair.calc_anchor_is_softclipped(read1, read2) #read_pairs_xor_overlap_TE.append(read_pair) tabname = "[c%s_%d_%d_%s]" % ( read_pair.interval_chr, bin_size * (int(read_pair.interval_start) / bin_size), bin_size * (1 + int(read_pair.interval_start) / bin_size), read_pair.interval_direction) #read_pair_database.insert(read_pair,'',tabname) read_pairs_xor_overlap_TE.append([read_pair, tabname]) elif read_pair.read2_is_TE and not read_pair.read1_is_TE and not is_mapped_mult_times( read1): if min_mapq: if read1.mapq >= min_mapq: read_pair.calculate_outside_interval( int_size, read1, read2) read_pair.calc_anchor_is_softclipped(read1, read2) #read_pairs_xor_overlap_TE.append(read_pair) tabname = "[c%s_%d_%d_%s]" % ( read_pair.interval_chr, bin_size * (int(read_pair.interval_start) / bin_size), bin_size * (1 + int(read_pair.interval_start) / bin_size), read_pair.interval_direction) #read_pair_database.insert(read_pair,'',tabname) read_pairs_xor_overlap_TE.append([read_pair, tabname]) ##overlap_TE_bam_file.write(read_pair.read1) ##overlap_TE_bam_file.write(read_pair.read2) else: read_pair.calculate_outside_interval( int_size, read1, read2) read_pair.calc_anchor_is_softclipped(read1, read2) #read_pairs_xor_overlap_TE.append(read_pair) tabname = "[c%s_%d_%d_%s]" % ( read_pair.interval_chr, bin_size * (int(read_pair.interval_start) / bin_size), bin_size * (1 + int(read_pair.interval_start) / bin_size), read_pair.interval_direction) #read_pair_database.insert(read_pair,'',tabname) read_pairs_xor_overlap_TE.append([read_pair, tabname]) if len(read_pairs_xor_overlap_TE) >= 100000: read_pair_database.ingenerator(read_pairs_xor_overlap_TE, 'read_pairs') #try to get the unique tmp = list() tmp = [bin_ for pair, bin_ in read_pairs_xor_overlap_TE] bin_list.extend(list(set(tmp))) del read_pairs_xor_overlap_TE read_pairs_xor_overlap_TE = list() #shift to next pair try: read1 = valid_discordant_bam.next() read2 = valid_discordant_bam.next() except StopIteration: break if len(read_pairs_xor_overlap_TE) > 0: read_pair_database.ingenerator(read_pairs_xor_overlap_TE, 'read_pairs') #try to get the unique tmp = list() tmp = [bin_ for pair, bin_ in read_pairs_xor_overlap_TE] bin_list.extend(list(set(tmp))) del read_pairs_xor_overlap_TE read_pairs_xor_overlap_TE = list() bin_list = list(set(bin_list)) read_pair_database.insert(bin_list, '', 'bin_list') print bin_list print "number discordant read pairs with exactly one read overlapping a TE: %d" % len( read_pairs_xor_overlap_TE) #print "\n".join(pair.str() for pair in read_pairs_xor_overlap_TE) #overlap_TE_bam_file.close() valid_discordant_bam.close() return read_pairs_xor_overlap_TE
def _local_coverage(reader, features, read_strand=None, fragment_size=None, shift_width=0, bins=None, use_score=False, accumulate=True, preserve_total=False, method=None, function="mean", zero_inf=True, zero_nan=True, processes=None, stranded=True, verbose=False): """ Returns a binned vector of coverage. Computes a 1D vector of coverage at the coordinates for each feature in `features`, extending each read by `fragmentsize` bp. Some arguments cannot be used for bigWig files due to the structure of these files. The parameters docstring below indicates whether or not an argument can be used with bigWig files. Depending on the arguments provided, this method can return a vector containing values from a single feature or from concatenated features. An example of the flexibility afforded by the latter case: `features` can be a 3-tuple of pybedtools.Intervals representing (TSS + 1kb upstream, gene, TTS + 1kb downstream) and `bins` can be [100, 1000, 100]. This will return a vector of length 1200 containing the three genomic intervals binned into 100, 1000, and 100 bins respectively. Note that is up to the caller to construct the right axes labels in the final plot! Parameters ---------- features : str, interval-like object, or list Can be a single interval or an iterable yielding intervals. Interval-like objects must have chrom, start, and stop attributes, and optionally a strand attribute. One exception to this that if `features` is a single string, it can be of the form "chrom:start-stop" or "chrom:start-stop[strand]". If `features` is a single interval, then return a 1-D array for that interval. If `features` is an iterable of intervals, then return a 1-D array that is a concatenation of signal for these intervals. Available for bigWig. bins : None, int, list If `bins` is None, then each value in the returned array will correspond to one bp in the genome. If `features` is a single Interval, then `bins` is an integer or None. If `features` is an iterable of Intervals, `bins` is an iterable of integers of the same length as `features`. Available for bigWig. fragment_size : None or int If not None, then each item from the genomic signal (e.g., reads from a BAM file) will be extended `fragment_size` bp in the 3' direction. Higher fragment sizes will result in smoother signal. Not available for bigWig. shift_width : int Each item from the genomic signal (e.g., reads from a BAM file) will be shifted `shift_width` bp in the 3' direction. This can be useful for reconstructing a ChIP-seq profile, using the shift width determined from the peak-caller (e.g., modeled `d` in MACS). Not available for bigWig. read_strand : None or str If `read_strand` is one of "+" or "-", then only items from the genomic signal (e.g., reads from a BAM file) on that strand will be considered and reads on the opposite strand ignored. Useful for plotting genomic signal for stranded libraries. Not available for bigWig. stranded : bool If True, then the profile will be reversed for features whose strand attribute is "-". use_score : bool If True, then each bin will contain the sum of the *score* attribute of genomic features in that bin instead of the *number* of genomic features falling within each bin. Not available for bigWig. accumulate : bool If False, then only record *that* there was something there, rather than acumulating reads. This is useful for making matrices with called peaks. Available for bigWig. 
preserve_total : bool If True, re-scales the returned value so that each binned row's total is equal to the sum of the original, un-binned data. The units of the returned array will be in "total per bin". This is useful for, e.g., counting reads in features. If `preserve_total` is False, then the returned array will have units of "density"; this is more generally useful and is the default behavior. Available for bigWig, but not when using method="ucsc_summarize". method : str; one of [ "summarize" | "get_as_array" | "ucsc_summarize" ] Only used for bigWig. The method specifies how data are extracted from the bigWig file. "summarize" is the default. It's quite fast, but may yield slightly different results when compared to running this same function on the BAM file from which the bigWig was created. "summarize" uses bx-python. The values returned will not be exactly the same as the values returned when local_coverage is called on a BAM, BED, or bigBed file, but they will be close. This method is quite fast, and is the default when bins is not None. "get_as_array" uses bx-python, but does a separate binning step. This can be slower than the other two methods, but the results are exactly the same as those from a BAM, BED, or bigBed file. This method is always used if bins=None. "ucsc_summarize" is an alternative version of "summarize". It uses the UCSC program `bigWigSummary`, which must already installed and on your path. function : str; one of ['sum' | 'mean' | 'min' | 'max' | 'std'] Determine the nature of the values returned. Only valid if `method` is "summarize" or "ucsc_summarize", which also implies bigWig. Default is "mean". If `method="ucsc_summarize", then there is an additional option for function, "coverage", which returns the percent of region that is covered. zero_inf, zero_nan : bool Only used for bigWig. If either are True, sets any missing or inf values to zero before returning. If `method="ucsc_summarize"`, missinv values are always reported as zero. If `method="get_as_array"`, missing values always reported as nan. Values can be -inf, inf, or nan for missing values when `method="summarize"` according to the following table: ========== ======================== `function` missing values appear as ========== ======================== "sum" 0 "mean" nan "min" inf "max" -inf "std" nan ========== ======================== processes : int or None The feature can be split across multiple processes. Returns ------- 1-d NumPy array Notes ----- If a feature has a "-" strand attribute, then the resulting profile will be *relative to a minus-strand feature*. That is, the resulting profile will be reversed. Returns arrays `x` and `y`. `x` is in genomic coordinates, and `y` is the coverage at each of those coordinates after extending fragments. The total number of reads is guaranteed to be the same no matter how it's binned. (with ideas from http://www-huber.embl.de/users/anders/HTSeq/doc/tss.html) """ # bigWig files are handled differently, so we need to know if we're working # with one; raise exeception if a kwarg was supplied that's not supported. 
if isinstance(reader, filetype_adapters.BigWigAdapter): is_bigwig = True defaults = ( ('read_strand', read_strand, None), ('fragment_size', fragment_size, None), ('shift_width', shift_width, 0), ('use_score', use_score, False), ('preserve_total', preserve_total, False), ) for name, check, default in defaults: if (((default is None) and (check is not default)) or (check != default)): raise ArgumentError("Argument '%s' not supported for bigWig" % name) if method == 'ucsc_summarize': if preserve_total: raise ArgumentError( "preserve_total=True not supported when using " "method='ucsc_summarize'") else: is_bigwig = False if isinstance(reader, filetype_adapters.BamAdapter): if use_score: raise ArgumentError("Argument 'use_score' not supported for " "bam") # e.g., features = "chr1:1-1000" if isinstance(features, basestring): features = helpers.tointerval(features) if not ((isinstance(features, list) or isinstance(features, tuple))): if bins is not None: if not isinstance(bins, int): raise ArgumentError("bins must be an int, got %s" % type(bins)) features = [features] bins = [bins] else: if bins is None: bins = [None for i in features] if not len(bins) == len(features): raise ArgumentError("bins must have same length as feature list") # nomenclature: # "window" is region we're getting data for # "alignment" is one item in that region # profiles = [] xs = [] for window, nbin in zip(features, bins): window = helpers.tointerval(window) chrom = window.chrom start = window.start stop = window.stop strand = window.strand if not is_bigwig: # Extend the window to catch reads that would extend into the # requested window _fs = fragment_size or 0 padded_window = pybedtools.Interval( chrom, max(start - _fs - shift_width, 0), stop + _fs + shift_width, ) window_size = stop - start # start off with an array of zeros to represent the window profile = np.zeros(window_size, dtype=float) for interval in reader[padded_window]: if read_strand: if interval.strand != read_strand: continue # Shift interval by modeled distance, if specified. if shift_width: if interval.strand == '-': interval.start -= shift_width interval.stop -= shift_width else: interval.start += shift_width interval.stop += shift_width # Extend fragment size from 3' if fragment_size: if interval.strand == '-': interval.start = interval.stop - fragment_size else: interval.stop = interval.start + fragment_size # Convert to 0-based coords that can be used as indices into # array start_ind = interval.start - start # If the feature goes out of the window, then only include the # part that's inside the window start_ind = max(start_ind, 0) # Same thing for stop stop_ind = interval.stop - start stop_ind = min(stop_ind, window_size) # Skip if the feature is shifted outside the window. This can # happen with large values of `shift_width`. 
if start_ind >= window_size or stop_ind < 0: continue # Finally, increment profile if use_score: score = float(interval.score) else: score = 1 if accumulate: if preserve_total: profile[start_ind:stop_ind] += (score / float( (stop_ind - start_ind))) else: profile[start_ind:stop_ind] += score else: profile[start_ind:stop_ind] = score else: # it's a bigWig profile = reader.summarize( window, method=method, function=function, bins=(nbin or len(window)), zero_inf=zero_inf, zero_nan=zero_nan, ) # If no bins, return genomic coords if (nbin is None): x = np.arange(start, stop) # Otherwise do the downsampling; resulting x is stll in genomic # coords else: if preserve_total: total = float(profile.sum()) if not is_bigwig or method == 'get_as_array': xi, profile = rebin(x=np.arange(start, stop), y=profile, nbin=nbin) if not accumulate: nonzero = profile != 0 profile[profile != 0] = 1 x = xi else: x = np.linspace(start, stop - 1, nbin) # Minus-strand profiles should be flipped left-to-right. if stranded and strand == '-': profile = profile[::-1] xs.append(x) if preserve_total: scale = profile.sum() / total profile /= scale profiles.append(profile) stacked_xs = np.hstack(xs) stacked_profiles = np.hstack(profiles) del xs del profiles return stacked_xs, stacked_profiles
def scan_alignments(work, scan_alignments_binary, input_bam, regions_bed_file,
                    reference, num_threads, window_size, maf, min_mapq,
                    max_dp, restart=True, split_region_files=[], calc_qual=True):

    logger = logging.getLogger(scan_alignments.__name__)

    logger.info("-------------------Scan Alignment BAM----------------------")

    if not split_region_files:
        if regions_bed_file:
            regions_bed = pybedtools.BedTool(regions_bed_file).sort().merge(d=0)
        else:
            intervals = []
            with pysam.AlignmentFile(input_bam, "rb") as samfile:
                for chrom, length in zip(samfile.references, samfile.lengths):
                    intervals.append(pybedtools.Interval(chrom, 1, length - 1))
            regions_bed = pybedtools.BedTool(intervals)
        if not os.path.exists(work):
            os.mkdir(work)
        total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, regions_bed))
        if not restart:
            split_region_files = glob.glob(os.path.join(work, "region_*.bed"))
            split_total_len = sum(
                map(lambda x: sum([y.length for y in pybedtools.BedTool(x)]),
                    split_region_files))
            if split_total_len >= 0.98 * total_len:
                split_region_files = sorted(
                    split_region_files,
                    key=lambda x: int(os.path.basename(x).split(".bed")[0].split("_")[1]))
        if not split_region_files:
            regions_bed_file = os.path.join(work, "all_regions.bed")
            regions_bed.saveas(regions_bed_file)

            num_split = max(
                int(np.ceil((total_len / 10000000) / num_threads) * num_threads),
                num_threads)
            split_region_files = split_region(work, regions_bed_file, num_split,
                                              min_region=window_size,
                                              max_region=1e20)
    else:
        logger.info("split_regions to be used (will ignore region_bed): {}".format(
            " ".join(split_region_files)))

    map_args = []
    all_outputs = [[]] * len(split_region_files)
    not_done = []
    for i, split_region_file in enumerate(split_region_files):
        if restart or not os.path.exists(os.path.join(work, "work.{}".format(i), "region.bed")) \
                or not os.path.exists(os.path.join(work, "work.{}".format(i), "candidates.vcf")) \
                or not os.path.exists(os.path.join(work, "work.{}".format(i), "count.bed.gz")):
            work_ = os.path.join(work, "work.{}".format(i))
            if os.path.exists(work_):
                shutil.rmtree(work_)
            map_args.append((os.path.join(work, "work.{}".format(i)), reference,
                             scan_alignments_binary, split_region_file, input_bam,
                             window_size, maf, min_mapq, max_dp, calc_qual, 1))
            not_done.append(i)
        else:
            all_outputs[i] = [
                os.path.join(work, "work.{}".format(i), "candidates.vcf"),
                os.path.join(work, "work.{}".format(i), "count.bed.gz"),
                os.path.join(work, "work.{}".format(i), "region.bed")
            ]

    pool = multiprocessing.Pool(num_threads)
    try:
        outputs = pool.map_async(run_scan_alignments, map_args).get()
        pool.close()
    except Exception as inst:
        pool.close()
        logger.error(inst)
        traceback.print_exc()
        raise Exception

    for o in outputs:
        if o is None:
            raise Exception("scan_alignments failed!")

    for i, output in zip(not_done, outputs):
        all_outputs[i] = output

    return all_outputs
def find_resolved_variants((chrom, start, end, variants, input_bam, reference)):
    thread_logger = logging.getLogger(
        "{} ({})".format(find_resolved_variants.__name__,
                         multiprocessing.current_process().name))
    try:
        ref = pysam.FastaFile(reference)
        out_variants = []
        start, end = map(int, [start, end])
        region = [chrom, start, end]
        vartypes = map(lambda x: x[-1], variants)
        scores = map(lambda x: x[5], variants)
        if len(set(vartypes)) > 1:
            out_variants.extend(
                map(lambda x: [x[0], int(x[1]), x[3], x[4], x[10], x[5]], variants))
        else:
            vartype = vartypes[0]
            score = max(scores)
            if vartype == "DEL":
                intervals = []
                dels = []
                with pysam.AlignmentFile(input_bam) as samfile:
                    for record in samfile.fetch(chrom, start, end):
                        if record.cigarstring and "D" in record.cigarstring:
                            dels.extend(extract_del(record))
                dels = filter(lambda x: (start <= x[1] <= end) or start <= x[2] <= end,
                              dels)
                if dels:
                    intervals = map(lambda x: pybedtools.Interval(x[0], x[1], x[2]), dels)
                    bed = pybedtools.BedTool(intervals)
                    del_strs = map(lambda x: "---".join(x[0:3]), bed)
                    uniq_dels = list(set(del_strs))
                    uniq_dels_count = {}
                    for del_ in uniq_dels:
                        uniq_dels_count[del_] = del_strs.count(del_)
                    max_count = max(uniq_dels_count.values())
                    for del_ in uniq_dels:
                        if uniq_dels_count[del_] <= max_count * 0.5:
                            del uniq_dels_count[del_]
                    new_bed = pybedtools.BedTool(
                        map(lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[2])),
                            map(lambda x: x.split("---"), uniq_dels_count.keys())))
                    new_bed = new_bed.sort().merge(c=[1], o="count")
                    out_variants.extend(
                        map(lambda x: [x[0], int(x[1]),
                                       ref.fetch(x[0], int(x[1]) - 1, int(x[2])),
                                       ref.fetch(x[0], int(x[1]) - 1, int(x[1])),
                                       "0/1", score],
                            new_bed))
            elif vartype == "INS":
                intervals = []
                inss = []
                with pysam.AlignmentFile(input_bam) as samfile:
                    for record in samfile.fetch(chrom, start, end):
                        if record.cigarstring and "I" in record.cigarstring:
                            inss.extend(extract_ins(record))
                inss = filter(lambda x: (start <= x[1] <= end) or start <= x[2] <= end,
                              inss)
                if inss:
                    intervals = map(lambda x: pybedtools.Interval(x[0], x[1], x[2], x[3]),
                                    inss)
                    bed = pybedtools.BedTool(intervals)
                    ins_strs = map(lambda x: "---".join(x[0:4]), bed)
                    uniq_inss = list(set(ins_strs))
                    uniq_inss_count = {}
                    for ins_ in uniq_inss:
                        uniq_inss_count[ins_] = ins_strs.count(ins_)
                    max_ins, max_count = sorted(uniq_inss_count.items(),
                                                key=lambda x: x[1])[-1]
                    max_pos = int(max_ins.split("---")[1])
                    for ins_ in uniq_inss:
                        if uniq_inss_count[ins_] <= max_count * 0.5 or \
                                0 < abs(int(ins_.split("---")[1]) - max_pos) < 4:
                            del uniq_inss_count[ins_]
                    new_bed = pybedtools.BedTool(
                        map(lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[2]), x[3]),
                            map(lambda x: x.split("---"), uniq_inss_count.keys()))).sort()
                    out_variants.extend(
                        map(lambda x: [x[0], int(x[1]),
                                       ref.fetch(x[0], int(x[1]) - 1, int(x[1])),
                                       ref.fetch(x[0], int(x[1]) - 1, int(x[1])) + x[3],
                                       "0/1", score],
                            new_bed))
        return out_variants
    except Exception as ex:
        thread_logger.error(traceback.format_exc())
        thread_logger.error(ex)
        return None
def resolve_for_IDP_ITX_CTX(vcf_records, fasta_file, pad=0, wiggle=10, overlap_ratio=0.9):
    del_records = filter(lambda x: (x.INFO["SVTYPE"] == "DEL"), vcf_records)
    dup_records = filter(lambda x: (x.INFO["SVTYPE"] == "DUP"), vcf_records)
    ins_records = filter(lambda x: (x.INFO["SVTYPE"] == "INS"), vcf_records)
    other_records = filter(lambda x: (x.INFO["SVTYPE"] not in ["DEL", "DUP", "INS"]),
                           vcf_records)

    del_bedtool = pybedtools.BedTool(
        [pybedtools.Interval(x.CHROM, x.POS, (x.POS + abs(x.INFO["SVLEN"])),
                             name="DEL_%d" % i, score=x.FILTER[0])
         for i, x in enumerate(del_records)])
    dup_bedtool = pybedtools.BedTool(
        [pybedtools.Interval(x.CHROM, x.POS, (x.POS + abs(x.INFO["SVLEN"])),
                             name="DUP_%d" % i, score=x.FILTER[0])
         for i, x in enumerate(dup_records)])
    ins_bedtool = pybedtools.BedTool(
        [pybedtools.Interval(x.CHROM, x.POS, (x.POS + 1),
                             name="INS_%d" % i, score=x.FILTER[0],
                             otherfields=[x.INFO["SC_CHR2_STR"]
                                          if "SC_CHR2_STR" in x.INFO else "."])
         for i, x in enumerate(ins_records)])

    chr2_intervals = []
    for interval in ins_bedtool:
        chr2_intervals.extend(build_chr2_ins(interval))
    chr2_ins_bedtool = pybedtools.BedTool(chr2_intervals).sort()

    idp_bedtool = dup_bedtool.window(del_bedtool, w=wiggle).each(
        partial(find_idp, wiggle=wiggle)).sort()
    remained_dup_bedtool = dup_bedtool.intersect(idp_bedtool, f=0.95, r=True,
                                                 wa=True, v=True).sort()
    remained_del_bedtool = del_bedtool.intersect(
        idp_bedtool.each(partial(extract_del_interval)).sort(),
        f=0.95, r=True, wa=True, v=True)

    itx_bedtool = idp_bedtool.window(idp_bedtool, w=wiggle).each(
        partial(find_itx, wiggle=wiggle)).sort()
    remained_idp_bedtool_1 = idp_bedtool.window(itx_bedtool, w=wiggle).each(
        partial(filter_itxs)).sort()
    remained_idp_bedtool_2 = idp_bedtool.window(itx_bedtool, w=wiggle, c=True).filter(
        lambda x: x.fields[-1] == "0").sort()

    ctx_bedtool = remained_del_bedtool.intersect(
        chr2_ins_bedtool, r=True, f=overlap_ratio, wa=True, wb=True).each(
        partial(find_ctx, overlap_ratio=overlap_ratio)).sort()
    remained_del_bedtool = remained_del_bedtool.intersect(ctx_bedtool, f=0.95, r=True,
                                                          wa=True, v=True).sort()

    if len(remained_idp_bedtool_2) > 0:
        remained_idp_bedtool_2 = remained_idp_bedtool_2.cut(
            range(idp_bedtool.field_count())).sort()

    recoverd_pass_del_dup_ins = []
    removed_pass_del_dup_ins = []
    for bed in remained_idp_bedtool_1, remained_idp_bedtool_2, itx_bedtool, ctx_bedtool:
        recoverd_pass_del_dup_ins.append(
            ",".join(map(lambda y: y.name,
                         filter(lambda x: "LowQual" in x.score, bed))))
        removed_pass_del_dup_ins.append(
            ",".join(map(lambda y: y.name,
                         filter(lambda x: "LowQual" not in x.score, bed))))
    recoverd_pass_del_dup_ins = set((",".join(recoverd_pass_del_dup_ins)).split(",")) - set([''])
    removed_pass_del_dup_ins = set((",".join(removed_pass_del_dup_ins)).split(",")) - set([''])
    recoverd_pass_del_dup_ins = recoverd_pass_del_dup_ins - removed_pass_del_dup_ins

    recoverd_dups = list(set([x.name for x in remained_dup_bedtool]) |
                         set(filter(lambda x: "DUP" in x, recoverd_pass_del_dup_ins)))
    recoverd_dels = list(set([x.name for x in remained_del_bedtool]) |
                         set(filter(lambda x: "DEL" in x, recoverd_pass_del_dup_ins)))
    recoverd_inss = list(set([x.name for x in ins_bedtool]) -
                         (set(filter(lambda x: "INS" in x, removed_pass_del_dup_ins))))

    vcf_records = other_records + \
        [dup_records[int(x.split("_")[-1])] for x in recoverd_dups] + \
        [del_records[int(x.split("_")[-1])] for x in recoverd_dels] + \
        [ins_records[int(x.split("_")[-1])] for x in recoverd_inss] + \
        [merge_idp_itx(fasta_file, dup_records[int(x.name.split(",")[0].split("_")[-1])],
                       [del_records[int(x.name.split(",")[1].split("_")[-1])]],
                       int(x.fields[6]), x.fields[7], x.score, "IDP")
         for x in remained_idp_bedtool_1] + \
        [merge_idp_itx(fasta_file, dup_records[int(x.name.split(",")[0].split("_")[-1])],
                       [del_records[int(x.name.split(",")[1].split("_")[-1])]],
                       int(x.fields[6]), x.fields[7], x.score, "IDP")
         for x in remained_idp_bedtool_2] + \
        [merge_idp_itx(fasta_file, dup_records[int(x.name.split(",")[0].split("_")[-1])],
                       [del_records[int(x.name.split(",")[1].split("_")[-1])],
                        del_records[int(x.name.split(",")[2].split("_")[-1])]],
                       int(x.fields[6]), x.fields[7], x.score, "ITX")
         for x in itx_bedtool] + \
        [merge_ctx(fasta_file, del_records[int(x.name.split(",")[0].split("_")[-1])],
                   ins_records[int(x.name.split(",")[1].split("_")[-1])],
                   x.score)
         for x in ctx_bedtool]

    vcf_records = sorted(map(lambda x: remove_info_fields(x, ["SC_CHR2_STR"]), vcf_records),
                         key=lambda x: (x.CHROM, x.POS))
    return vcf_records
def filter_candidates( (candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r)): logger.info("-----------------------------------------------------------") logger.info("Filter Candidates") logger.info("-----------------------------------------------------------") records = {} with open(candidates_vcf) as v_f: for line in v_f: if line[0] == "#": continue if len(line.strip().split()) != 10: raise RuntimeError( "Bad VCF line (<10 fields): {}".format(line)) chrom, pos, _, ref, alt, _, _, info_, _, info = line.strip().split( ) pos = int(pos) loc = "{}.{}".format(chrom, pos) dp, ro, ao = map(int, info.split(":")[1:4]) info_dict = dict( map(lambda x: x.split("="), filter(lambda x: x, info_.split(";")))) mq_ = safe_read_info_dict(info_dict, "MQ", int, -100) bq_ = safe_read_info_dict(info_dict, "BQ", int, -100) nm_ = safe_read_info_dict(info_dict, "NM", int, -100) as_ = safe_read_info_dict(info_dict, "AS", int, -100) xs_ = safe_read_info_dict(info_dict, "XS", int, -100) pr_ = safe_read_info_dict(info_dict, "PR", int, -100) cl_ = safe_read_info_dict(info_dict, "CL", int, -100) st_ = safe_read_info_dict(info_dict, "ST", str, "-100,-100") ls_ = safe_read_info_dict(info_dict, "LS", int, -100) rs_ = safe_read_info_dict(info_dict, "RS", int, -100) if ao < min(ro, min_ao): continue if loc not in records: records[loc] = [] if ref == "N" or "\t".join(line.split()[0:5]) \ not in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]): records[loc].append([ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_, line ]) elif "\t".join(line.split()[0:5]) \ in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]): for i, x in enumerate(records[loc]): if "\t".join(line.split()[0:5]) == "\t".join(x[-1].split()[0:5]) \ and ao / float(ro + 0.0001) > x[6] / float(x[5] + 0.0001): records[loc][i] = [ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_, line ] break fasta_file = pysam.Fastafile(reference) good_records = [] dels = [] for loc, rs in sorted(records.iteritems(), key=lambda x: x[1][0:2]) + \ [["", [["", 0, "", "", 0, 0, 0, ""]]]]: ins = filter(lambda x: x[2] == "N", rs) if len(ins) > 1: # emit ins afs = map(lambda x: x[6] / float(x[5] + x[6]), ins) max_af = max(afs) ins = filter( lambda x: x[6] / float(x[5] + x[6]) >= (max_af * merge_r), ins) chrom, pos, ref = ins[0][0:3] dp = max(map(lambda x: x[4], ins)) ro = max(map(lambda x: x[5], ins)) ao = max(map(lambda x: x[6], ins)) mq_ = max(map(lambda x: x[7], ins)) bq_ = max(map(lambda x: x[8], ins)) st_ = "{},{}".format( max(map(lambda x: int(x[9].split(",")[0]), ins)), max(map(lambda x: int(x[9].split(",")[1]), ins))) ls_ = max(map(lambda x: x[10], ins)) rs_ = max(map(lambda x: x[11], ins)) nm_ = max(map(lambda x: x[12], ins)) as_ = max(map(lambda x: x[13], ins)) xs_ = max(map(lambda x: x[14], ins)) pr_ = max(map(lambda x: x[15], ins)) cl_ = max(map(lambda x: x[16], ins)) alt = "".join(map(lambda x: x[3], ins)) if (max_af >= ins_merge_min_af) or (ao >= good_ao): ins = [[ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ ]] else: ins = [] elif len(ins) == 1: # emit 1-base ins dp, ro, ao = ins[0][4:7] if (ao / float(ro + ao) < (ins_min_af) and ao < good_ao) or dp <= 5: ins = [] else: ins = [ins[0][:-1]] good_records.extend(ins) if dels and (ins or filter(lambda x: x[3] != "N" and x[2] != "N", rs)): # emit del if len(dels) == 1: ro = 
dels[0][5]
                ao = dels[0][6]
                chrom, pos, ref = dels[0][0:3]
                if ao / float(ro + ao) >= del_min_af or ao >= good_ao:
                    good_records.extend(dels)
            else:
                # multiple leftover deletions: keep those whose AF is within
                # merge_r of the strongest one and collapse them into one call
                afs = map(lambda x: x[6] / float(x[5] + x[6]), dels)
                max_af = max(afs)
                merge_r_thr = merge_r * max_af
                dels = filter(
                    lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels)
                chrom, pos = dels[0][0:2]
                dp = max(map(lambda x: x[4], dels))
                ro = max(map(lambda x: x[5], dels))
                ao = max(map(lambda x: x[6], dels))
                mq_ = max(map(lambda x: x[7], dels))
                bq_ = max(map(lambda x: x[8], dels))
                st_ = "{},{}".format(
                    max(map(lambda x: int(x[9].split(",")[0]), dels)),
                    max(map(lambda x: int(x[9].split(",")[1]), dels)))
                ls_ = max(map(lambda x: x[10], dels))
                rs_ = max(map(lambda x: x[11], dels))
                nm_ = max(map(lambda x: x[12], dels))
                as_ = max(map(lambda x: x[13], dels))
                xs_ = max(map(lambda x: x[14], dels))
                pr_ = max(map(lambda x: x[15], dels))
                cl_ = max(map(lambda x: x[16], dels))
                ref = "".join(map(lambda x: x[2], dels))
                alt = "N"
                good_records.append([
                    chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_,
                    ls_, rs_, nm_, as_, xs_, pr_, cl_])
            dels = []
        if not loc:
            continue
        for record in rs:
            dp = record[4]
            if dp <= min_dp:
                continue
            ro, ao = record[5:7]
            if record[2] != "N" and record[3] != "N" and record[2] != record[3]:
                bq = record[8]
                if (ao / float(ro + ao) >= snp_min_af or ao >= snp_min_ao) and bq >= snp_min_bq:
                    # emit SNP
                    good_records.append(record[:-1])
            elif record[2] != "N" and record[3] == "N":
                if ao / float(ro + ao) >= del_merge_min_af or ao >= good_ao:
                    chrom, pos = record[0:2]
                    if dels and pos - dels[-1][1] != 1:
                        # emit del
                        if len(dels) == 1:
                            ro = dels[0][5]
                            ao = dels[0][6]
                            chrom, pos, ref = dels[0][0:3]
                            pos = int(pos)
                            if ao / float(ro + ao) >= del_min_af:
                                good_records.extend(dels)
                        else:
                            # merge adjacent 1-bp deletions whose AF is within
                            # merge_r of the strongest one into a single call
                            afs = map(lambda x: x[6] / float(x[5] + x[6]), dels)
                            max_af = max(afs)
                            merge_r_thr = merge_r * max_af
                            dels = filter(
                                lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels)
                            chrom, pos = dels[0][0:2]
                            dp = max(map(lambda x: x[4], dels))
                            ro = max(map(lambda x: x[5], dels))
                            ao = max(map(lambda x: x[6], dels))
                            mq_ = max(map(lambda x: x[7], dels))
                            bq_ = max(map(lambda x: x[8], dels))
                            st_ = "{},{}".format(
                                max(map(lambda x: int(x[9].split(",")[0]), dels)),
                                max(map(lambda x: int(x[9].split(",")[1]), dels)))
                            ls_ = max(map(lambda x: x[10], dels))
                            rs_ = max(map(lambda x: x[11], dels))
                            nm_ = max(map(lambda x: x[12], dels))
                            as_ = max(map(lambda x: x[13], dels))
                            xs_ = max(map(lambda x: x[14], dels))
                            pr_ = max(map(lambda x: x[15], dels))
                            cl_ = max(map(lambda x: x[16], dels))
                            ref = "".join(map(lambda x: x[2], dels))
                            alt = "N"
                            good_records.append([
                                chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_,
                                ls_, rs_, nm_, as_, xs_, pr_, cl_])
                        dels = []
                    # accumulate dels
                    dels.append(record[:-1])

    # convert the retained candidates into VCF-style output records
    final_records = []
    dels = []
    for i, record in enumerate(good_records):
        chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ = record
        ref = ref.upper()
        alt = alt.upper()
        info_str = ""
        if st_ != "-100,-100":
            info_str += ";ST={}".format(st_)
        if ls_ != -100:
            info_str += ";LS={}".format(ls_)
        if rs_ != -100:
            info_str += ";RS={}".format(rs_)
        if nm_ != -100:
            info_str += ";NM={}".format(nm_)
        if as_ != -100:
            info_str += ";AS={}".format(as_)
        if xs_ != -100:
            info_str += ";XS={}".format(xs_)
        if pr_ != -100:
            info_str += ";PR={}".format(pr_)
        if cl_ != -100:
            info_str += ";CL={}".format(cl_)
        if mq_ != -100:
            info_str += ";MQ={}".format(mq_)
        if bq_ != -100:
            info_str += ";BQ={}".format(bq_)
        af = np.round(ao / float(ao + ro), 4)
        info_str += ";AF={}".format(af)
        if ref != "N" and alt != "N":
            line = "\t".join([
                chrom, str(pos), ".", ref, alt, "100", ".",
                "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)])
            final_records.append([chrom, pos, ref, alt, line])
        elif alt == "N":
            # deletion: prepend the preceding reference base and report at POS - 1
            ref = fasta_file.fetch(chrom, pos - 2, pos + len(ref) - 1).upper()
            alt = fasta_file.fetch(chrom, pos - 2, pos - 1).upper()
            line = "\t".join([
                chrom, str(pos - 1), ".", ref, alt, "100", ".",
                "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)])
            final_records.append([chrom, pos - 1, ref, alt, line])
        elif ref == "N":
            # insertion: anchor on the preceding reference base and report at POS - 1
            ref = fasta_file.fetch(chrom, pos - 2, pos - 1).upper()
            alt = ref + alt
            line = "\t".join([
                chrom, str(pos - 1), ".", ref, alt, "100", ".",
                "DP={};RO={};AO={}".format(dp, ro, ao) + info_str,
                "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)])
            final_records.append([chrom, pos - 1, ref, alt, line])
    final_records = sorted(final_records, key=lambda x: x[0:2])

    if dbsnp:
        # keep a candidate unless a dbSNP entry matches its position, REF, and ALT exactly
        filtered_bed = pybedtools.BedTool(
            map(lambda x: pybedtools.Interval(x[1][0], int(x[1][1]), int(x[1][1]) + 1,
                                              x[1][2], x[1][3], str(x[0])),
                enumerate(final_records))).sort()
        dbsnp = pybedtools.BedTool(dbsnp).each(
            lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[1]) + 1, x[3], x[4])).sort()
        non_in_dbsnp_1 = filtered_bed.window(dbsnp, w=0, v=True)
        non_in_dbsnp_2 = filtered_bed.window(dbsnp, w=0).filter(
            lambda x: x[1] != x[7] or x[3] != x[9] or x[4] != x[10]).sort()
        non_in_dbsnp_ids = []
        for x in non_in_dbsnp_1:
            non_in_dbsnp_ids.append(int(x[5]))
        for x in non_in_dbsnp_2:
            non_in_dbsnp_ids.append(int(x[5]))
        final_records = map(
            lambda x: x[1],
            filter(lambda x: x[0] in non_in_dbsnp_ids, enumerate(final_records)))

    with open(filtered_candidates_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for record in final_records:
            o_f.write(record[-1] + "\n")
    return filtered_candidates_vcf
def convert_metasv_bed_to_vcf(bedfile=None, vcf_out=None, workdir=None, vcf_template_file=vcf_template,
                              sample=None, reference=None, pass_calls=True):
    func_logger = logging.getLogger("%s" % (convert_metasv_bed_to_vcf.__name__))

    if not os.path.exists(workdir):
        os.makedirs(workdir)

    intervals = []
    if bedfile:
        # Encode the per-interval annotation in the name field (base64 JSON info, SV type,
        # SV length, calling methods) and merge back-to-back duplicate calls
        for interval in pybedtools.BedTool(bedfile):
            interval_info = get_interval_info(interval, pass_calls)
            if interval_info:
                updated_interval = pybedtools.Interval(interval.chrom, interval_info["pos"], interval_info["end"],
                                                       name="%s,%s,%d,%s" % (
                                                           base64.b64encode(json.dumps(interval_info["info"])),
                                                           interval_info["sv_type"], interval_info["sv_length"],
                                                           ";".join(interval_info["svmethods"])),
                                                       score=interval.score,
                                                       otherfields=[interval_info["genotype"],
                                                                    interval_info["sv_filter"]])
                if not intervals:
                    intervals.append(updated_interval)
                else:
                    merged_interval = check_duplicates(updated_interval, intervals[-1])
                    if merged_interval:
                        func_logger.info("Merging intervals: %s and %s" % (updated_interval, intervals[-1]))
                        intervals.pop()
                        intervals.append(merged_interval)
                    else:
                        intervals.append(updated_interval)
            else:
                func_logger.info("Skip interval: %s" % (interval))

    nonfilterd_bed = os.path.join(workdir, "final_nonfilterd.bed")
    filterd_bed = os.path.join(workdir, "final_filterd.bed")
    bedtool = pybedtools.BedTool(intervals).sort().moveto(nonfilterd_bed)
    filterd_bed = filter_confused_INS_calls(nonfilterd_bed, filterd_bed)

    vcf_template_reader = vcf.Reader(open(vcf_template_file, "r"))
    # The following are hacks to ensure sample name and contig names are put in the VCF header
    vcf_template_reader.samples = [sample]
    contigs = []
    fasta_file = None
    if reference:
        contigs = fasta_utils.get_contigs(reference)
        contigs_order_dict = {contig.name: index for (index, contig) in enumerate(contigs)}
        vcf_template_reader.contigs = OrderedDict([(contig.name, (contig.name, contig.length)) for contig in contigs])
        vcf_template_reader.metadata["reference"] = reference
        fasta_file = pysam.Fastafile(reference)
    vcf_template_reader.metadata["fileDate"] = str(datetime.date.today())
    vcf_template_reader.metadata["source"] = [" ".join(sys.argv)]
    vcf_writer = vcf.Writer(open(vcf_out, "w"), vcf_template_reader)

    vcf_records = []
    if filterd_bed:
        # Build PyVCF records from the filtered BED intervals
        bedtool = pybedtools.BedTool(filterd_bed)
        for interval in bedtool:
            name_split = interval.name.split(",")
            info = json.loads(base64.b64decode(name_split[0]))
            sv_type = name_split[1]
            sv_id = "."
            ref = fasta_file.fetch(str(interval.chrom), interval.start, interval.start + 1) if fasta_file else "."
            alt = [vcf.model._SV(sv_type)]
            qual = "."
            sv_filter = [interval.fields[7]]
            genotype = interval.fields[6]
            sv_format = "GT"
            sample_indexes = [0]
            vcf_record = vcf.model._Record(interval.chrom, interval.start, sv_id, ref, alt, qual, sv_filter,
                                           info, sv_format, sample_indexes)
            vcf_record.samples = vcf_template_reader._parse_samples([genotype], "GT", vcf_record)
            vcf_records.append(vcf_record)

    if contigs:
        vcf_records.sort(key=lambda x: (contigs_order_dict[x.CHROM], x.POS))
    else:
        vcf_records.sort(key=lambda x: (x.CHROM, x.POS))

    resolved_vcf_records = resolve_for_IDP_ITX_CTX(vcf_records, fasta_file)
    for vcf_record in resolved_vcf_records:
        vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    func_logger.info("Tabix compressing and indexing %s" % vcf_out)
    pysam.tabix_index(vcf_out, force=True, preset="vcf")
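
# Hypothetical usage sketch for convert_metasv_bed_to_vcf (not part of the
# original module). The paths and sample name below are placeholders; the
# default template pointed to by `vcf_template` is assumed to exist.
def _example_convert_metasv_bed_to_vcf():
    convert_metasv_bed_to_vcf(bedfile="/path/to/final.bed",
                              vcf_out="/path/to/sample.metasv.vcf",
                              workdir="/path/to/workdir",
                              sample="SAMPLE1",
                              reference="/path/to/genome.fa",
                              pass_calls=True)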