def _get_interval(self, interval):
    """Return the rows of ``self.chr_data`` that intersect ``interval``.

    Partial overlaps count as intersections (per the original author's
    note). The lookup is delegated to ``intersect_intervals`` with a
    one-row query frame built from ``interval.start`` / ``interval.end``
    and keyed by ``interval.chr``.

    Returns the rows of ``self.chr_data[interval.chr]`` selected
    positionally by the ``"intersection"`` column of the lookup result.
    """
    chrom = interval.chr
    # Single-interval query, wrapped in the per-chromosome dict layout that
    # intersect_intervals is called with here.
    query = {
        chrom: pd.DataFrame({
            "start": [interval.start],
            "end": [interval.end],
        })
    }
    hits = intersect_intervals(self.chr_data,
                               query,
                               suppreseChrNumberCheck=True)
    # "intersection" is used as positional row numbers into chr_data[chrom].
    row_positions = hits[chrom]["intersection"]
    return self.chr_data[chrom].iloc[row_positions, :]
def set_sites_orientation(self, orient_fname):
    """Fill the site orientation columns from an orientation file.

    The file is parsed with ``self.read_orient_file`` and intersected with
    ``self.chr_data``. For every chromosome in the intersection result the
    ``score`` of the matching records is written into the
    ``plus_orientation`` and ``minus_orientation`` columns of
    ``self.chr_data[chrom]`` (rows are addressed positionally through the
    ``intersection`` column). Finally ``self.orient_data_real`` is set to
    ``True``.

    Args:
        orient_fname: Path passed through to ``self.read_orient_file``.

    Returns:
        None. If ``self.chr_data`` has not been set yet, an error is logged
        and the method returns early without touching anything.
    """
    # Only a missing attribute means "data not read yet"; any other failure
    # while accessing chr_data is a real error and must not be hidden.
    try:
        self.chr_data
    except AttributeError:
        logging.error("Please read data first")
        return None

    orient_chr_data = self.read_orient_file(orient_fname)
    intersections = intersect_intervals(self.chr_data, orient_chr_data)

    strand_columns = (("+", "plus_orientation"), ("-", "minus_orientation"))
    for chrom, hits in intersections.items():
        # Keep a single record per (site, strand). With the ascending sort
        # and keep="first" this is the record with the lowest score.
        # NOTE(review): confirm that lowest (not highest) score is intended.
        hits = hits.sort_values(
            by=["orientation", "intersection", "score"]
        ).drop_duplicates(subset=["intersection", "orientation"],
                          keep="first")

        sites = self.chr_data[chrom]
        for strand, column in strand_columns:
            strand_hits = hits[hits["orientation"] == strand]
            column_pos = sites.columns.get_loc(column)
            row_positions = list(strand_hits["intersection"])
            sites.iloc[row_positions, column_pos] = list(strand_hits["score"])

    self.orient_data_real = True
def get_predictors(self, contacts):
    """Mark contacts whose two ends hit the anchors of the same loop.

    Every contact starts with ``IsLoop == 0``. Per chromosome, a window of
    ``+/- self.window_size`` is built around ``contact_st`` (left) and
    ``contact_en`` (right); the windows are intersected with the left and
    right loop anchors from ``self.loopsReader``. A contact gets
    ``IsLoop = 1`` when its left and right windows intersect anchors that
    carry the same loop id.

    Args:
        contacts: DataFrame with ``chr``, ``contact_st`` and ``contact_en``
            columns. Rows need not be grouped or sorted by chromosome.

    Returns:
        DataFrame with one ``IsLoop`` column and ``len(contacts)`` rows.
    """
    predictors = pd.DataFrame({"IsLoop": [0] * len(contacts)})
    left = {}
    right = {}

    for chrom in np.unique(contacts["chr"].values):
        # Indexing note: contacts of one chromosome may be scattered over
        # the frame, while intersect_intervals works on per-chromosome
        # frames. According to the original author's note, "ids_column" in
        # its result counts positions *within* the per-chromosome subset
        # (e.g. if chr1 occupies contacts 3, 5 and 6 and the 6th one
        # intersects a loop, "ids_column" is 2, not 6).
        # np.flatnonzero(on_chrom) maps those back to positions in contacts.
        on_chrom = contacts["chr"] == chrom
        starts = contacts.loc[on_chrom, "contact_st"]
        ends = contacts.loc[on_chrom, "contact_en"]
        left[chrom] = pd.DataFrame({
            "start": starts - self.window_size,
            "end": starts + self.window_size,
        })
        right[chrom] = pd.DataFrame({
            "start": ends - self.window_size,
            "end": ends + self.window_size,
        })

        left_loops = self.loopsReader.getLeftLoopAncors(chrom)
        if len(left_loops[chrom]) == 0:
            # Nothing to intersect with on this chromosome.
            logging.getLogger(__name__).warning("No loops on chr " + chrom)
            continue
        right_loops = self.loopsReader.getRightLoopAncors(chrom)
        if len(right_loops[chrom]) == 0:
            continue

        # Translate each hit's anchor position into the id of its loop.
        left_hits = intersect_intervals(left_loops, left)[chrom]
        left_hits["Loop_id"] = left_hits.intersection.apply(
            lambda pos: left_loops[chrom].id.iloc[pos])
        right_hits = intersect_intervals(right_loops, right)[chrom]
        right_hits["Loop_id"] = right_hits.intersection.apply(
            lambda pos: right_loops[chrom].id.iloc[pos])

        # Inner join keeps contacts whose left and right ends share a loop.
        same_loop = left_hits.merge(right_hits,
                                    on=["Loop_id", "ids_column"],
                                    how="inner")
        subset_positions = same_loop["ids_column"].values
        contact_positions = np.flatnonzero(on_chrom)[subset_positions]
        predictors.iloc[contact_positions, 0] = 1

    return predictors