def ProcessRegion(RawGEMs):
    """Collapse fragments into one summary row per multi-fragment group.

    ``RawGEMs`` is grouped on columns 1, 2, 3 and 5; column 5 is counted and
    columns 6, 7 and 8 are collapsed.  Groups whose count is not greater than
    one are discarded.

    Each output row is, whitespace separated::

        chrom  first_start  first_end  id  n_fragments  n_P  middle  last_start  last_end

    where ``n_P`` is the number of ``'P'`` characters in the collapsed
    column 6 and ``middle`` is ``"s,e,"`` for every fragment between the
    first and the last (after sorting starts and ends independently),
    always terminated by ``"-1,-1"``.

    Parameters
    ----------
    RawGEMs : BedTool
        Fragments, already sorted on the grouping columns.

    Returns
    -------
    BedTool
        One interval per retained group.
    """
    grouped = RawGEMs.groupby(g=[1, 2, 3, 5], c=[5, 6, 7, 8],
                              o=['count', 'collapse', 'collapse', 'collapse'])
    multi_fragment = grouped.filter(lambda F: int(F[4]) > 1)
    # saveas() is needed to keep the result of the lazy filter on disk.
    # These temp files can be removed manually after the job.
    saved = BedTool(multi_fragment).saveas()

    rows = []
    # Iterate once instead of indexing saved[i] repeatedly.
    for gem in saved:
        # The builtin int replaces the removed np.int alias.
        starts = np.fromstring(gem[6], dtype=int, sep=', ')
        ends = np.fromstring(gem[7], dtype=int, sep=', ')
        starts.sort()
        ends.sort()
        n_fragments = len(starts)
        if n_fragments < 2:
            # The filter above should make this impossible; a row with a
            # single fragment cannot be written in the first/last layout.
            continue
        p_count = gem[5].count('P')
        head = ' '.join([gem[0], str(starts[0]), str(ends[0]), gem[3],
                         str(n_fragments), str(p_count)])
        middle = ''.join(str(s) + ',' + str(e) + ','
                         for s, e in zip(starts[1:-1], ends[1:-1]))
        tail = '-1,-1' + ' ' + str(starts[-1]) + ' ' + str(ends[-1])
        rows.append(head + ' ' + middle + tail + '\n')

    return BedTool(''.join(rows), from_string=True)
def compute_all_overlaps(*peak_calls):
    """
    Compute overlaps for every combination of the input peak calls.

    Each peak_call in peak_calls should be a namedtuple with fields
    ``name`` and ``file``; ``file`` is a bed file defining peak regions.

    For every group of two or more peak calls (2-pairs, 3-groups, ...) the
    overlap is obtained from ``compute_overlap``.  For a single peak call
    the number of intervals in its bed file is reported instead.

    Returns a list of ``Group(peak_calls, overlap)`` namedtuples.  For a
    single-member group ``peak_calls`` is the bare name; otherwise it is the
    list of member names.  With no input the list is empty.
    """
    Group = namedtuple("Group", ["peak_calls", "overlap"])
    result = []
    # Start at 1: the size-0 combination is the empty tuple, which has no
    # member to report on.
    for num in range(1, len(peak_calls) + 1):
        for group in combinations(peak_calls, num):
            if len(group) > 1:
                count = compute_overlap(group)
                names = [x.name for x in group]
                result.append(Group(names, count))
            else:
                # only one group member
                pbed = BedTool(group[0].file)
                count = pbed.count()
                result.append(Group(group[0].name, count))
    return result
def bed_intersection_scores(region_bedtool, feature_bedtool,
                            f=0.5, F=0.5, e=True, score_index=4, **kwargs):
    """
    Intersect regions with a feature bed and return one score per region.

    A region gets 0 if no intersection was found, otherwise the maximum of
    the feature score column over its intersecting features.

    Parameters
    ----------
    region_bedtool : filename or BedTool
    feature_bedtool : filename, BedTool or None
    f, F, e :
        Overlap options forwarded to both the labelling call and the
        intersect call, so the two always agree on what counts as a hit.
    score_index : int
        1-based column of the score within the feature bed.
    **kwargs :
        Accepted for signature compatibility; not used.

    Returns
    -------
    numpy array of shape (n_regions, 1).  All entries are -1 when
    ``feature_bedtool`` is None.
    """
    region_bedtool = BedTool(region_bedtool)
    if feature_bedtool is None:
        return -1 * np.ones((region_bedtool.count(), 1))

    # Get intersection labels and the intersecting region subset from a
    # sorted feature bedtool.
    feature_bedtool = BedTool(feature_bedtool).sort()
    print("Feature bedtool sorted")
    intersect_labels = bed_intersection_labels(
        region_bedtool, feature_bedtool, f=f, F=F, e=e)

    # Score array starts at zero; only rows labelled 1 are filled in.
    scores = np.zeros(intersect_labels.shape)
    hit_rows = np.where(intersect_labels == 1)[0]
    if hit_rows.size == 0:
        # Nothing intersects: skip the bedtools calls on an empty subset.
        return scores

    intersecting_regions = region_bedtool.at(hit_rows)
    # For the intersecting subset, get matched intersecting feature regions.
    matched_intersects = intersecting_regions.intersect(
        feature_bedtool, wao=True, f=f, F=F, e=e)
    # The score column of the feature bed is shifted by the number of
    # columns in the region bed in the -wao output.
    groupby_col_index = score_index + intersecting_regions.field_count()
    grouped_matched_intersects = matched_intersects.groupby(
        g=[1, 2, 3], c=groupby_col_index, o="max")
    intersection_scores = [interval.fields[-1]
                           for interval in grouped_matched_intersects]
    scores[intersect_labels == 1] = np.array(intersection_scores, dtype=float)
    return scores
def vcf_intersect(vcf_path, bed_panel):
    """Use pybedtools ``a.intersect(b)`` to extract from a VCF the variants
    mapping to the gene panel intervals.

    Args:
        vcf_path: path to a valid VCF file to extract variants from.
        bed_panel: bed file containing the genetic intervals of interest.

    Returns:
        A tuple containing:
            1) a VCF object built from the intervals-filtered VCF
            2) the number of original intervals in the bed panel
            3) the number of variants mapping to these intervals
        or ``False`` if any step raised (the exception is logged).
    """
    try:
        vcf_file = BedTool(vcf_path)
        # The gene panel doesn't need to be sorted by chrom and position.
        gene_panel = BedTool(bed_panel)

        LOG.info('Computing intersections between interval filter and VCF file..')
        # Do the actual filtering: a mini VCF with only the variants that
        # fall in the bed file's intervals.
        intersections = vcf_file.intersect(gene_panel, header=True)

        panel_intervals = gene_panel.count()
        intersected_vars = intersections.count()
        LOG.info('Extracting %s intervals from the %s total entries of the VCF file.',
                 panel_intervals, vcf_file.count())
        LOG.info('Number of variants found in the intervals:%s', intersected_vars)

        # The context manager closes the temporary file even when saveas()
        # or VCF() raises.
        with NamedTemporaryFile('w+t', dir=os.getcwd()) as temp_intersections_file:
            intersections.saveas(temp_intersections_file.name)
            mini_VCF = VCF(temp_intersections_file.name)

        return (mini_VCF, panel_intervals, intersected_vars)

    except Exception as e:
        LOG.critical(e)
        return False
def bed_intersection_labels(region_bedtool, feature_bedtool,
                            f=0.5, F=0.5, e=True, **kwargs):
    """
    Intersect regions with a feature bed and return binary labels.

    Parameters
    ----------
    region_bedtool : filename or BedTool
    feature_bedtool : filename, BedTool or None
    f, F, e, **kwargs :
        Forwarded to ``BedTool.intersect`` together with ``c=True``.

    Returns
    -------
    int numpy array of shape (n_regions, 1): 1 where the overlap count is
    positive, else 0.  When ``feature_bedtool`` is None every entry is
    ``AMBIG_LABEL``.
    """
    region_bedtool = BedTool(region_bedtool)
    if feature_bedtool is None:
        return (AMBIG_LABEL
                * np.ones((region_bedtool.count(), 1))).astype(int)

    def count_overlaps(features):
        # One overlap count per region, in region order.
        return [interval.count for interval in region_bedtool.intersect(
            features, c=True, f=f, F=F, e=e, **kwargs)]

    try:
        overlap_counts = count_overlaps(BedTool(feature_bedtool))
    except Exception:
        # Handle unexpected field numbers in the feature bedtool by
        # truncating it to bed3 and retrying.  ``Exception`` (rather than a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        feature_df = BedTool(feature_bedtool).to_dataframe()
        feature_bedtool = BedTool.from_dataframe(
            feature_df.iloc[:, [0, 1, 2]])
        overlap_counts = count_overlaps(feature_bedtool)

    labels = np.array(overlap_counts) > 0
    return labels.astype(int)[:, np.newaxis]
def comp_score(self, gap_penalty):
    '''Compute (and cache) the score obtained with a given gap penalty.

    Results are memoised per ``gap_penalty`` in ``self.__cache``; a cached
    value is returned without recomputation.

    Segments whose p-value is below ``self.pval_lim`` are taken as
    potential peaks.  With no such segment the score is 0.0.  Otherwise the
    score is ``peak_EIB_ratio ** 5 * global_EIB_coverage`` computed from
    the stats of the bins intersecting the peaks.

    Returns the score as a float.
    '''
    if gap_penalty in self.__cache:
        return self.__cache[gap_penalty]['score']

    gb = self.orig_bins.scale_neg_scores(gap_penalty)
    observed_result = gb.max_segments()
    mc_res = MonteCarlo.run_simulation(gb.chrom_scores,
                                       niter=self.mc_trials,
                                       nprocs=self.nprocs)
    tester = IntervalTest(observed_result, mc_res)
    segments = [segment for (segment, pval) in tester.pvalues()
                if pval < self.pval_lim]

    if not segments:
        # No potential peaks found.  Adjacent literals keep the message on
        # one line without stray continuation characters.
        log.notice('Gap penalty of %.2f gives a score of 0.0 '
                   '(0 potential peaks with 0.00MB coverage)' % gap_penalty)
        self.__cache[gap_penalty] = {'score': 0.00}
        return 0.0

    # TODO use bx.python instead of pybedtools
    peaks_sb = StringIO.StringIO()
    tester.segments_to_bedstream(segments, peaks_sb)
    peaks = BedTool(peaks_sb.getvalue(), from_string=True)

    d = self.count_stats(self.bins_bedtool.intersect(peaks))
    d['gap-penalty'] = gap_penalty
    try:
        d['peak_EIB_ratio'] = d['EIB'] / float(d['EIB'] + d['DIB'])
    except ZeroDivisionError:
        # no peaks found
        d['peak_EIB_ratio'] = 0.0
    d['global_EIB_coverage'] = d['EIB'] / float(self.genome_wide_stats['EIB'])
    d['score'] = d['peak_EIB_ratio']**5 * d['global_EIB_coverage']

    peak_cov = sum(x.end - x.start for x in peaks) / 1e6
    log.notice('Gap penalty of %.2f gives a score of %.3f '
               '(%d potential peaks with %.2fMB coverage)' %
               (gap_penalty, d['score'], peaks.count(), peak_cov))
    self.__cache[gap_penalty] = d
    return d['score']
def comp_score(self, gap_penalty):
    '''Compute (and cache) the score obtained with a given gap penalty.

    Results are memoised per ``gap_penalty`` in ``self.__cache``; a cached
    value is returned without recomputation.

    Segments whose p-value is below ``self.pval_lim`` are taken as
    potential peaks.  With no such segment the score is 0.0.  Otherwise the
    score is ``peak_EIB_ratio ** 5 * global_EIB_coverage`` computed from
    the stats of the bins intersecting the peaks.

    Returns the score as a float.
    '''
    if gap_penalty in self.__cache:
        return self.__cache[gap_penalty]['score']

    gb = self.orig_bins.scale_neg_scores(gap_penalty)
    observed_result = gb.max_segments()
    mc_res = MonteCarlo.run_simulation(gb.chrom_scores,
                                       niter=self.mc_trials,
                                       nprocs=self.nprocs)
    tester = IntervalTest(observed_result, mc_res)
    segments = [segment for (segment, pval) in tester.pvalues()
                if pval < self.pval_lim]

    if not segments:
        # No potential peaks found.  Adjacent literals keep the message on
        # one line without stray continuation characters.
        log.notice('Gap penalty of %.2f gives a score of 0.0 '
                   '(0 potential peaks with 0.00MB coverage)' % gap_penalty)
        self.__cache[gap_penalty] = {'score': 0.00}
        return 0.0

    # TODO use bx.python instead of pybedtools
    peaks_sb = StringIO.StringIO()
    tester.segments_to_bedstream(segments, peaks_sb)
    peaks = BedTool(peaks_sb.getvalue(), from_string=True)

    d = self.count_stats(self.bins_bedtool.intersect(peaks))
    d['gap-penalty'] = gap_penalty
    try:
        d['peak_EIB_ratio'] = d['EIB'] / float(d['EIB'] + d['DIB'])
    except ZeroDivisionError:
        # no peaks found
        d['peak_EIB_ratio'] = 0.0
    d['global_EIB_coverage'] = d['EIB'] / float(self.genome_wide_stats['EIB'])
    d['score'] = d['peak_EIB_ratio']**5 * d['global_EIB_coverage']

    peak_cov = sum(x.end - x.start for x in peaks) / 1e6
    log.notice('Gap penalty of %.2f gives a score of %.3f '
               '(%d potential peaks with %.2fMB coverage)' %
               (gap_penalty, d['score'], peaks.count(), peak_cov))
    self.__cache[gap_penalty] = d
    return d['score']
def main():
    """Print how many error sites fall in each annotation track.

    Loads the error-site bed file and a fixed set of annotation files from
    the gemini data directory, then prints the total number of error sites
    followed by, for each track, the count of ``error_site + track``.
    """
    gemini_data_dir = "/home/kwoklab-user/Shared_resources/gemini/data"
    error_site = BedTool('/home/kwoklab-user/Error_exome_all.bed')

    # (report label, file name inside gemini_data_dir), in report order.
    # cse track source: https://code.google.com/p/discovering-cse/
    # Not loaded: gencode15.protein_coding.20130131.hg19.bed,
    # GRCh37-gms-mappability.vcf.gz
    track_files = [
        ("error in cse site", 'cse-hiseq-8_4-2013-02-20.bed.gz'),
        ("error in dbsnp137 site", 'dbsnp.137.vcf.gz'),
        ("errors in repeat mask region", 'hg19.rmsk.bed.gz'),
        ("errors in segdup", 'hg19.segdup.bed.gz'),
        ("errors in Clinvar", 'clinvar_20130118.vcf.gz'),
        ("errors in dgv", 'hg19.dgv.bed.gz'),
        ("errors in CpG", 'hg19.CpG.bed.gz'),
    ]
    # Open every track before printing anything, so a bad path is reported
    # before any partial output.
    tracks = [(label, BedTool(path.join(gemini_data_dir, file_name)))
              for label, file_name in track_files]

    # Single-argument print() emits the same text under Python 2 and 3.
    print("total error sites %d" % error_site.count())
    for label, track in tracks:
        print("%s %d" % (label, (error_site + track).count()))
def mainfunc2(path1, path2, savebedpath, savecsvpath, tmpfilepath,
              RegInterval, cr_id, Thread, Length=4000):
    """Count complexes for each 'S' region and write them to a csv file.

    path1: path for GEMs (i.e. ___ALL.region.PEanno)
    path2: path for Region (i.e. ____PETcnt_G9.motifannot)
    savebedpath: path for saving extracted GEMs in .bed
    savecsvpath: path for saving summary table in .csv
    tmpfilepath: directory for the temp files produced by pybedtools
    RegInterval: iterable of region indices, or 'All' for every region
        after the first
    cr_id: used in the csv file name
    Thread: used in the name of the saved intersection file (i.e. '0')
    Length: length of extension, default 4000 (not used in this function)
    """
    pybedtools.helpers.cleanup()
    pybedtools.set_tempdir(tmpfilepath)

    # GEM fragments and the anchors/regions they are matched against.
    fragments = BedTool(path1)
    regions = BedTool(path2)

    region_total = regions.count()
    if RegInterval == 'All':
        RegInterval = range(1, region_total)

    # The first region is the reference; find every fragment touching it.
    anchor_region = BedTool(regions[0:1]).saveas()
    overlapping = fragments.intersect(anchor_region, wa=True)
    # Move the original start/end to the last two columns (technical
    # purpose: later groupby).
    rearranged = [(frag[0], '0', '0', frag[3], frag[4], frag[5],
                   frag[1], frag[2]) for frag in overlapping]
    # Sort on the grouping key, otherwise the later groupby does not work
    # as intended.
    sorted_hits = BedTool(rearranged).sort(chrThenScoreA=True).saveas(
        'dummyfiles/Intersection' + Thread)

    summary_rows = []
    for idx in RegInterval:
        current = BedTool(regions[idx:idx + 1]).saveas()
        gem_id = current[0][3]
        if gem_id[-1] != 'S':
            continue
        mid_region = np.array([current[0][5], current[0][2]]).astype(int)
        complexes = Find2side(sorted_hits, anchor_region[0], savebedpath,
                              Thread, mid_region, gem_id[:-1])
        summary_rows.append([gem_id[:-1], complexes])

    table = pd.DataFrame(summary_rows,
                         columns=['crID_M:x', '# of complexes'])
    table.to_csv(savecsvpath + 'List3_' + cr_id + '.csv', index=False)
def mainfunc(path1, path2, savebedpath, savecsvpath, tmpfilepath, RegInterval, cr_id, Thread, Length=4000):
    """Classify the GEMs of each region and write two summary csv files.

    path1: path for GEMs (i.e. ___ALL.region.PEanno)
    path2: path for Region (i.e. ____PETcnt_G9.motifannot)
    savebedpath: path for saving extracted GEMs in .bed
    savecsvpath: path for saving summary table in .csv
    tmpfilepath: path for saving tmpfiles produced by pybedtool, a directory
    RegInterval: iterable of region indices, or 'All' for every region
    cr_id: used in the csv file names
    Thread: passed to ProcessRegion
    Length: Length of extension. Default = 4000 (int)

    Writes 'List1_<cr_id>.csv' (crID, orientation, # of complexes) and
    'List2_<cr_id>.csv' (crID, anchorcomp, middleID, loadcomp).

    NOTE(review): the 'S' and 'E' branches read CRID / TempList / Count_mid
    set by earlier iterations, so regions are presumably expected in the
    order 'SE' row, then its 'S' row, then its 'E' row -- confirm against
    the input file.
    NOTE(review): the nesting of the List2 bookkeeping was reconstructed;
    it is placed in the 'E' branch because MID and Count_mid are only
    defined on the 'S'/'E' paths -- confirm.
    """
    pybedtools.helpers.cleanup()
    pybedtools.set_tempdir(tmpfilepath)
    # Specify for the path of ___ALL.region.PEanno and import it (GEMs)
    # path1 = 'Minji_data/SHG0180-181-182NR_hg38_cohesin_FDR_0.1_ALL_motifext4kbboth.region.PEanno'
    ChIA_Drop = BedTool(path1)
    # Specify for the path of ____PETcnt_G9.motifannot and import it (anchors, regions)
    # path2 = 'Minji_data/LHG0052H.e500.clusters.cis.bothanchint_G250.PETcnt_G9.motifannot.sorted.domains'
    Region_short = BedTool(path2)
    Max_iter = Region_short.count()
    if RegInterval == 'All':
        RegInterval = range(Max_iter)
    List1 = []  # rows: [crID, orientation, count]
    List2 = []  # rows: [crID, anchor count, middle ID, middle count]
    for i in RegInterval:
        # NowRegion: chrom, start_min, end_max, loop id, ...
        NowRegion = BedTool(Region_short[i:i + 1]).saveas()
        # Find all fragments that intersect with NowRegion
        Intersection = ChIA_Drop.intersect(NowRegion, wa=True)
        # Append original start/end. Technical purpose for using groupby...
        results = [(f[0], '0', '0', f[3], f[4], f[5], f[1], f[2]) for f in Intersection]
        Intersection = BedTool(results)
        # Sort the grouping key! Otherwise the later groupby doesn't work as intended.
        Intersection = Intersection.sort(chrThenScoreA=True)
        # Extract the valid GEMs
        # NOTE(review): called with two arguments here -- confirm this
        # matches the ProcessRegion signature in use.
        FinalGEMs = ProcessRegion(Intersection, Thread)
        # Classify + sort + save, dispatching on the suffix of the region id
        # (column 4). 'SE' must be tested first: such ids also end in 'E'.
        if NowRegion[0][3][-2:] == 'SE':
            Count_L0, Count_L1 = SortGEM(FinalGEMs, NowRegion[0], 'Left', Length, savebedpath)
            Count_R0, Count_R1 = SortGEM(FinalGEMs, NowRegion[0], 'Right', Length, savebedpath)
            Count_B0, Count_B1 = SortGEM(FinalGEMs, NowRegion[0], 'Both', Length, savebedpath)
            Count_L = Count_L0 + Count_L1
            Count_R = Count_R0 + Count_R1
            Count_B = Count_B0 + Count_B1
            # Id without its last three characters; reused by later iterations.
            CRID = NowRegion[0][3][:-3]
            List1.append([NowRegion[0][3][:-3], 'S_to_E', Count_L])
            List1.append([NowRegion[0][3][:-3], 'E_to_S', Count_R])
            List1.append([NowRegion[0][3][:-3], 'S_and_E', Count_B])
            # Prefix of the List2 rows produced by later 'E' regions.
            TempList = [NowRegion[0][3][:-3], Count_L + Count_R + Count_B]
        elif NowRegion[0][3][-1] == 'S':
            Count_R0, Count_R1 = SortGEM(FinalGEMs, NowRegion[0], 'Right', Length, savebedpath)
            Count_R = Count_R0 + Count_R1
            # Part of the id between "<CRID>" plus one separator character
            # and the trailing letter.
            MID = NowRegion[0][3][len(CRID) + 1:-1]
            List1.append([CRID, MID + '_to_S', Count_R])
            # Start of the running middle count; the 'E' branch adds to it.
            Count_mid = Count_R
        elif NowRegion[0][3][-1] == 'E':
            Count_L0, Count_L1 = SortGEM(FinalGEMs, NowRegion[0], 'Left', Length, savebedpath)
            Count_L = Count_L0 + Count_L1
            MID = NowRegion[0][3][len(CRID) + 1:-1]
            List1.append([CRID, MID + '_to_E', Count_L])
            Count_mid += Count_L
            # Copy so every List2 row owns its list.
            NowList = TempList.copy()
            NowList.extend([MID, Count_mid])
            List2.append(NowList)
    DF1 = pd.DataFrame(List1, columns=['crID', 'orientation', '# of complexes'])
    DF2 = pd.DataFrame(List2, columns=['crID', 'anchorcomp', 'middleID', 'loadcomp'])
    DF1.to_csv(savecsvpath + 'List1_' + cr_id + '.csv', index=False)
    DF2.to_csv(savecsvpath + 'List2_' + cr_id + '.csv', index=False)
def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance, merge_close_peaks, keep_highest_close_peak, max_length, generate_ID, output_name, delete_overlap_bed):
    """Call peaks from a bedgraph (verbose / debugging variant, Python 2).

    Steps visible in the code: keep intervals whose column-4 value is at
    least `threshold`; merge them (d=10) summing column 4; keep merged
    regions whose length lies in [min_length, max_length]; then either merge
    peaks closer than `inter_peak_distance` or keep the highest of close
    peaks; optionally drop peaks overlapping `delete_overlap_bed`; finally
    save to a bed file, optionally with an 'id<N>' column.

    `merge_close_peaks`, `keep_highest_close_peak` and `output_name` are
    compared against the strings 'True' / 'None', not booleans / None.

    Intermediate files are written to the working directory:
    temp.bed, temp2.bed, temp3.bed, temp_input.bed, test_output.bed and
    '<bedgraph>filtered.bedgraph'.

    NOTE(review): `sys` is used but not imported here -- presumably a
    module-level import; confirm.
    NOTE(review): the block nesting was reconstructed from a flattened
    source; in particular the position of the `sys.exit()` after the
    keep-highest loop should be confirmed.
    """
    import pybedtools
    import glob
    from pybedtools import BedTool
    import pandas as pd
    import csv
    # The two modes are mutually exclusive; identical flags abort the run.
    if merge_close_peaks == keep_highest_close_peak:
        print 'Exiting... merge_close_peaks and keep_highest_close_peak set the same'
        sys.exit()
    # generate name for output
    bedgraph_name = glob.glob(bedgraph)
    filtered_name = bedgraph_name[0].replace('.bedgraph', 'filtered.bedgraph')
    if output_name != 'None':
        filename = output_name
    elif output_name == 'None':
        filename = bedgraph_name[0].replace('.bedgraph', '_peaks.bed')
    print 'input bedgraph file: ' + bedgraph_name[0]
    print 'output filename: ' + filename
    # import data as BedTool
    data = BedTool(bedgraph)
    print 'total sites read: ',
    print len(data)
    # retains intervals above threshold (b.name is column 4 of the bedgraph)
    above_thresh = data.filter(
        lambda b: float(b.name) >= float(threshold)).saveas(filtered_name)
    print 'sites above threshold: ',
    print len(above_thresh)
    if len(above_thresh) == 0:
        print 'no regions are above the threashold\n'
        sys.exit()
    # merge adjacent above threshold regions and sum bedgraph scores
    # (assumes bedgraph score in col 4)
    # d: max distance between merged peaks, c: column modified
    merge_regions = above_thresh.merge(d=10, c=4, o='sum').saveas('temp.bed')
    # filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(lambda x: len(x) >= min_length and len(x) <= max_length)).saveas('temp2.bed')
    print 'number of regions identified: ' + str(peaks.count())
    if merge_close_peaks == 'True':
        # NOTE(review): merging happens only when there are NO peaks, and
        # any peaks switch the run to keep-highest mode -- the two
        # conditions look inverted; confirm intent before relying on this.
        if len(peaks) == 0:
            # merge the bonafide peaks if they are closer than the inter
            # peak distance, sum scores and sort
            print 'merging peaks that are closer than: ' + str(
                inter_peak_distance)
            merge_peaks = peaks.merge(d=inter_peak_distance, c=4,
                                      o='sum').sort().saveas('temp3.bed')
        if len(peaks) > 0:
            print 'no regions can be merged'
            merge_close_peaks = 'False'
            keep_highest_close_peak = 'True'
    if keep_highest_close_peak == 'True':
        # need to read each line to find close peaks and throw away the one
        # with the lowest score out of the two
        print 'entering loop'
        # if len(peaks) > 0:
        peaks.saveas('temp_input.bed')
        # Last row of the input, stringified via a DataFrame, used below to
        # decide whether to flush the final candidate.
        last_line = [
            str(item)
            for item in (BedTool('temp_input.bed').to_dataframe().tail(
                n=1).iloc[0, :].tolist())
        ]
        with open('temp_input.bed') as myfile:
            with open('test_output.bed', 'w') as output:
                file_output = csv.writer(output, delimiter='\t')
                # prev_line is the current best candidate not yet written.
                prev_line = None
                for line in csv.reader(myfile, delimiter='\t'):
                    print 'testing line: ' + str(line)
                    if prev_line is None:
                        prev_line = line
                        print
                    elif float(prev_line[2]) + float(
                            inter_peak_distance) <= float(line[1]):
                        # Far enough apart: emit the candidate, start anew.
                        print 'prev_line: ' + str(prev_line)
                        print 'line: ' + str(line)
                        print 'features far apart, so adding'
                        print
                        file_output.writerow(prev_line)
                        prev_line = line
                    else:
                        # Close together: keep the row with the larger
                        # column-4 value.
                        print 'prev_line: ' + str(prev_line)
                        print 'line: ' + str(line)
                        print 'features must be close'
                        print
                        if float(prev_line[3]) < float(line[3]):
                            prev_line = line
                            print 'prev_line smaller, so new prev_line'
                            print 'prev_line: ' + str(prev_line)
                            print
                print 'finished reading lines'
                print line
                print last_line
                # The final candidate is written only if the csv row equals
                # the stringified DataFrame row.
                if line == last_line:
                    print 'must be last line'
                    file_output.writerow(prev_line)
        merge_peaks = BedTool('test_output.bed')
        # NOTE(review): looks like a leftover debugging stop -- it ends the
        # process before any output file is saved.
        sys.exit()
    print 'number of peaks found: ' + str(merge_peaks.count())
    if delete_overlap_bed != None:
        print 'delete_overlap_bed provided: ' + delete_overlap_bed
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print 'number of peaks retained: ' + str(merge_peaks.count())
    if not generate_ID:
        print 'saving sorted peak bed file with no ID'
        merge_peaks.saveas(filename)
    if generate_ID:
        print 'saving sorted peak bed file with ID names'
        # change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()
        # insert new column with id: 1.... # of peaks
        DF_peaks.insert(
            3, 'id',
            ['id' + str(item) for item in range(1, (len(DF_peaks) + 1))])
        # The next expression has no effect (its value is discarded).
        ['id' + str(item) for item in range(1, 5)]
        # save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)
    return 'Finished'
def _intersection_label_matrix(bins, feature_bedtools, n_jobs, **overlap_kwargs):
    """Return the (n_bins, n_features) label matrix of bins vs. each feature set."""
    if n_jobs == 1:
        labels_list = [
            bed_intersection_labels(bins, bedtool, **overlap_kwargs)
            for bedtool in feature_bedtools]
    else:
        # Multiprocess bed intersections.  Workers receive file names; None
        # entries (missing data) are passed through as None so that
        # bed_intersection_labels can label them.  Any n_jobs other than 1
        # is validated by joblib itself.
        feature_fnames = [
            bedtool.fn if bedtool is not None else None
            for bedtool in feature_bedtools]
        labels_list = Parallel(n_jobs=n_jobs)(
            delayed(bed_intersection_labels)(bins.fn, fname, **overlap_kwargs)
            for fname in feature_fnames)
    return np.concatenate(labels_list, axis=1)


def get_tf_predictive_setup(true_feature_bedtools, region_bedtool=None,
                            ambiguous_feature_bedtools=None,
                            bin_size=200, flank_size=400, stride=50,
                            n_jobs=1, genome='hg19',
                            min_bin_distance_to_chrom_edge=5000,
                            filter_flank_overlaps=False):
    """
    Implements the tf (and general) imputation data setup for a single sample.

    TODOs
        support chrom.sizes file for personal genomes

    Parameters
    ----------
    true_feature_bedtools : list of filenames, BedTools or None items
        None items are treated as missing data.
    region_bedtool : filename or BedTool, optional
        If not set, union of true_feature_bedtools is used.
    ambiguous_feature_bedtools : list of filenames, BedTools or None items, optional
        Must have the same length as true_feature_bedtools.
    bin_size, stride : int
        Passed to bin_bed.
    flank_size : int
        Bins are extended by this much on both sides in the returned bedtool.
    n_jobs : int, default: 1
        1 runs intersections serially; other values are passed to joblib.
    genome : str, default: 'hg19'
        Can be any genome name supported by pybedtools.
    min_bin_distance_to_chrom_edge : int, default: 5000
        Passed to filter_by_chrom_sizes.
    filter_flank_overlaps : bool, default: False
        Labels negative bins that overlap target regions (any overlap)
        as ambiguous.

    Returns
    -------
    bins_and_flanks : BedTool
    true_labels : numpy array of shape (n_bins, n_true_features)
    """
    # initialize feature bedtools
    true_feature_bedtools = [
        BedTool(bedtool) if bedtool is not None else None
        for bedtool in true_feature_bedtools]
    # sanity checks
    if ambiguous_feature_bedtools is not None:
        assert len(ambiguous_feature_bedtools) == len(true_feature_bedtools)
        ambiguous_feature_bedtools = [
            BedTool(bedtool) if bedtool is not None else None
            for bedtool in ambiguous_feature_bedtools]

    # merge and bin region_bedtools
    if region_bedtool is not None:
        print(region_bedtool)
        region_bedtool = BedTool(region_bedtool).sort()
        print("Made Bedtool")
        region_bedtool = region_bedtool.merge()
    else:
        # use union of true peak bedtools
        bedtools_to_merge = [
            bedtool for bedtool in true_feature_bedtools
            if bedtool is not None]
        region_bedtool = BedTool.cat(
            *bedtools_to_merge, postmerge=True, force_truncate=True)
    bins = bin_bed(region_bedtool, bin_size=bin_size, stride=stride)

    # throw out bins too close to a chromosome edge
    genome_chrom_sizes = getattr(genome_registry, genome)
    bins = bins.each(filter_by_chrom_sizes, genome_chrom_sizes,
                     min_bin_distance_to_chrom_edge)
    # filter bins to chr1-22,X,Y
    chrom_list = ["chr%i" % (i) for i in range(1, 23)]
    chrom_list += ["chrX", "chrY"]
    bins = BedTool(bins).each(filter_interval_by_chrom, chrom_list)
    # save to temp file so the filtered stream can be read more than once
    bins = bins.saveas()
    bins = bins.set_chromsizes(genome)

    # intersect bins and true features for true labels
    true_labels = _intersection_label_matrix(
        bins, true_feature_bedtools, n_jobs)

    bins_and_flanks = bins.slop(b=flank_size)

    if filter_flank_overlaps:
        # NOTE(review): the intersection uses `bins`, not `bins_and_flanks`,
        # although the option name suggests flank overlap -- confirm which
        # is intended.  Behaviour is kept as it was.
        flank_labels = _intersection_label_matrix(
            bins, true_feature_bedtools, n_jobs, f=10**-9, F=10**-9)
        # negative bins with any such overlap become ambiguous
        true_labels[(true_labels == 0) * (flank_labels == 1)] = AMBIG_LABEL

    if ambiguous_feature_bedtools is not None:
        # intersect bins and ambiguous features for ambiguous labels
        ambiguous_labels = _intersection_label_matrix(
            bins, ambiguous_feature_bedtools, n_jobs)
        # negative bins that overlap an ambiguous feature become ambiguous
        true_labels[(true_labels == 0) * (ambiguous_labels == 1)] = AMBIG_LABEL
        # TODO: do we want to also filter based on any flank overlap with
        # ambiguous features??

    return bins_and_flanks, true_labels
line_count = 0 for interval in bt: try: if filter_function(interval): intervals.append(interval) # if line_count % 10000 == 0 and verbose: # print("Processed " + str(line_count) + " intervals ["+str(round(100*float(line_count)/float(initial_count), 2))+"%]") except ValueError, e: print(filter_name + " filtering failed for line #" + str(line_count)) print(str(e)) print(traceback.format_exc()) print(sys.exc_info()[0]) print(str(interval)) for i in range(1, 8): print("Field[" + str(i) + "]=" + str(interval[i])) sys.exit(1) line_count += 1 # print("Building bedtool from " + str(len(intervals)) + " intervals..") rval = BedTool(fn=intervals) # print("Built bedtool.") if verbose: info_string = "Initial: " + str(initial_count) + ", removed " + \ str(initial_count-rval.count()) + ", " + str(rval.count()) + " left." if filter_name is not None: info_string = "[" + filter_name + "] " + info_string print(info_string) return rval
def determine_sex(work_dir, bam_fpath, ave_depth, genome, target_bed=None):
    """Guess the sample sex from coverage of chrY key regions.

    The key regions are looked up in ``chry_key_regions_by_genome`` by the
    first key contained in ``genome``.  If ``target_bed`` is given, the key
    regions are restricted to the target; too small a remainder (fewer
    intervals than ``MALE_TARGET_REGIONS_FACTOR`` times the original count)
    means the sex cannot be determined.

    The mean depth over the key regions is compared with ``ave_depth``:
    zero Y depth, or a sample/Y depth ratio above
    ``FEMALE_Y_COVERAGE_FACTOR``, gives 'F'; otherwise 'M'.

    Returns 'F', 'M', or None when the sex cannot be determined (no key
    regions for the genome, too few target key regions, or sample depth
    below ``AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX``).
    """
    info()
    info('Determining sex')

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = male_bed.count()
    info('Male region total size: ' + str(male_area_size))

    if target_bed:
        male_bed = BedTool(target_bed).intersect(male_bed).merge()
        target_male_area_size = male_bed.count()
        min_target_size = male_area_size * MALE_TARGET_REGIONS_FACTOR
        if target_male_area_size < min_target_size:
            info('Target male region total size is ' + str(target_male_area_size) +
                 ', which is less than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(min_target_size) + ') - cannot determine sex')
            return None
        else:
            info('Target male region total size is ' + str(target_male_area_size) +
                 ', which is higher than the ' +
                 'checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(min_target_size) + '). ' +
                 'Determining sex based on coverage in those regions.')
    else:
        info('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_cov_output_fpath = sambamba_depth(work_dir, male_bed, bam_fpath, [])
    chry_mean_coverage = get_mean_cov(chry_cov_output_fpath)
    info('Y key regions average depth: ' + str(chry_mean_coverage))
    ave_depth = float(ave_depth)
    info('Sample average depth: ' + str(ave_depth))
    if ave_depth < AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        info('Sample average depth is too low (less then ' +
             str(AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX) + ') - cannot determine sex')
        return None

    # The messages below used the invalid escape "\s"; the apostrophe was
    # intended.
    if chry_mean_coverage == 0:
        info("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = ave_depth / chry_mean_coverage
        info('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:
            # mean target coverage much higher than chrY coverage
            info('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                 " times higher than Y depth - it's female")
            sex = 'F'
        else:
            info('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                 " times higher than Y depth - it's male")
            sex = 'M'
    info('Sex is ' + sex)
    info()
    return sex
# Scratch script: cut a merged bed file into 100 bp windows, save them and
# print every window followed by the number of windows.
split_num = 100
split_region_list = [[]*5]
print(split_region_list)

bed = BedTool('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed')
# Sort and merge the input, then tile it with windows of width 100.
bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100))
bed.all_hits()
# x = BedTool().window_maker(genome='hg38', w=1000000)
bed.saveas('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed')

# Never split into more parts than there are windows.
if bed.count() < split_num:
    split_num = bed.count()
print(bed.count()/split_num)
# print bed.split(10, 'out')
# print x

n = 0
for n, region in enumerate(bed, 1):
    # print region.length
    print(str(region).strip())
print(n)
# Scratch script: tile a merged bed file with 100 bp windows, write the
# windows to disk, then list them and report how many there are.
split_num = 100
split_region_list = [[] * 5]
print(split_region_list)

bed = BedTool(
    '/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed')
# sort -> merge -> windows of width 100 over the merged intervals
bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100))
bed.all_hits()
# x = BedTool().window_maker(genome='hg38', w=1000000)
bed.saveas(
    '/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed')

# Cap the number of parts at the number of windows.
if bed.count() < split_num:
    split_num = bed.count()
print(bed.count() / split_num)
# print bed.split(10, 'out')
# print x

n = 0
for region in bed:
    # print region.length
    print(str(region).strip())
    n = n + 1
print(n)
def _keep_highest_close_peaks(in_path, out_path, inter_peak_distance):
    """Copy peaks from in_path to out_path, keeping only the highest of close neighbours."""
    import csv

    min_gap = float(inter_peak_distance)
    with open(in_path) as src, open(out_path, 'w') as dst:
        # '\n' terminator: csv's default '\r\n' would put carriage returns
        # into the bed file.
        writer = csv.writer(dst, delimiter='\t', lineterminator='\n')
        best = None  # current candidate, not yet written
        for row in csv.reader(src, delimiter='\t'):
            if best is None:
                best = row
            elif row[0] != best[0] or float(best[2]) + min_gap <= float(row[1]):
                # Different chromosome or far enough apart: the candidate
                # is final, start a new one.
                writer.writerow(best)
                best = row
            elif float(best[3]) < float(row[3]):
                # Close together: keep whichever has the higher score.
                best = row
        # Always flush the last candidate; it used to be written only when
        # the final csv row matched a stringified DataFrame row, which
        # silently dropped the last peak on any formatting difference.
        if best is not None:
            writer.writerow(best)


def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance,
                    merge_close_peaks=True, keep_highest_close_peak=False,
                    max_length=10000, generate_ID=True, output_name=None,
                    delete_overlap_bed=None):
    """
    Call peaks from a bedgraph and write them to a bed file.

    - need to install a more up-to-date version of bedtools before invoking
      Jupyter; type: module load bedtools/2.21.0

    (1) filters bedgraph based on threshold;
    (2) merges adjacent basepairs that are over threshold;
    (3) retains peaks that satisfy min/max length criteria;
    (4) merges any peaks that are closer than the inter-peak distance
        cutoff -or- alternatively keeps just the highest peak (this is
        beta functionality)

    - max length is typically defaulted to be very large
    - outputs a bed file (default col4 is the sum of the bedgraph scores;
      sorted by chrom;start;stop)
    - generate_ID: auto generates 'id1' ... 'id<number of peaks>' as
      column 4; the bedgraph scores are shifted to column 5 as per
      standard bed format
    - note the peak score for a merged peak is *just* the sum of the
      individual peaks, not the total score in the merged region (i.e.
      sub-threshold scores in the intervening space are not included)
    - assumes bedgraph in standard format <chr> <start> <stop> <score>
    - output_name: user defined output name; otherwise the first file
      matching `bedgraph` with '.bg' replaced by '_peaks.bed'
    - delete_overlap_bed: path to a bed file (as string); any peaks that
      overlap this bed file will be discarded

    merge_close_peaks and keep_highest_close_peak are mutually exclusive;
    if they are equal nothing is done.

    The keep-highest mode writes 'temp_input.bed' and 'test_output.bed' in
    the working directory.

    Returns 'Finished' on success, or an 'Exiting...' message when the two
    mode flags are equal.
    """
    # Validate before importing anything so bad flags fail fast.
    if merge_close_peaks == keep_highest_close_peak:
        return 'Exiting... merge_close_peaks and keep_highest_close_peak set the same'

    import glob
    from pybedtools import BedTool

    # generate name for output
    bedgraph_name = glob.glob(bedgraph)
    if output_name is not None:
        filename = output_name
    else:
        filename = bedgraph_name[0].replace('.bg', '_peaks.bed')
    print('input bedgraph file: ' + bedgraph_name[0])
    print('output filename: ' + filename)

    # import data as BedTool
    data = BedTool(bedgraph)
    # retain intervals above threshold (b.name is column 4, the score)
    above_thresh = data.filter(lambda b: float(b.name) >= threshold)
    # merge adjacent above-threshold regions and sum bedgraph scores
    # (assumes bedgraph score in col 4)
    merge_regions = above_thresh.merge(d=0, c=4, o='sum')
    # filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(lambda x: min_length <= len(x) <= max_length))

    if merge_close_peaks:
        # merge the bonafide peaks if they are closer than the inter peak
        # distance, sum scores and sort
        print('merging peaks that are closer than: ' + str(inter_peak_distance))
        merge_peaks = peaks.merge(d=inter_peak_distance, c=4, o='sum').sort()
    elif keep_highest_close_peak:
        # read each line to find close peaks and throw away the one with
        # the lowest score out of the two
        print('entering loop')
        peaks.saveas('temp_input.bed')
        print('before keeping highest, number of regions identified: ' +
              str(BedTool('temp_input.bed').count()))
        _keep_highest_close_peaks('temp_input.bed', 'test_output.bed',
                                  inter_peak_distance)
        merge_peaks = BedTool('test_output.bed')
    else:
        raise ValueError(
            'one of merge_close_peaks / keep_highest_close_peak must be true')

    print('number of peaks found: ' + str(merge_peaks.count()))

    if delete_overlap_bed is not None:
        print('delete_overlap_bed provided: ' + delete_overlap_bed)
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print('number of peaks retained: ' + str(merge_peaks.count()))

    if not generate_ID:
        print('saving sorted peak bed file with no ID')
        merge_peaks.saveas(filename)
    else:
        print('saving sorted peak bed file with ID names')
        # change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()
        # insert new column with id: 1.... # of peaks
        DF_peaks.insert(
            3, 'id',
            ['id' + str(item) for item in range(1, len(DF_peaks) + 1)])
        # save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'