Esempio n. 1
0
 def _get_interval(self, interval):
     """Return the rows of ``self.chr_data[interval.chr]`` hit by ``interval``.

     ``interval`` must expose ``chr``, ``start`` and ``end`` attributes.
     The overlap test itself is delegated to ``intersect_intervals``; per
     the original author's note, partial overlaps count as hits as well.

     Returns the ``.iloc`` selection of the chromosome's table at the
     positions reported in the "intersection" column of the result.
     """
     chrom = interval.chr
     # One-row table describing the query interval, keyed by chromosome
     # like the tables in self.chr_data.
     query_by_chrom = {
         chrom: pd.DataFrame({
             "start": [interval.start],
             "end": [interval.end],
         })
     }
     hits = intersect_intervals(self.chr_data,
                                query_by_chrom,
                                suppreseChrNumberCheck=True)
     # "intersection" holds positional row indices into self.chr_data[chrom].
     row_positions = hits[chrom]["intersection"]
     return self.chr_data[chrom].iloc[row_positions, :]
Esempio n. 2
0
    def set_sites_orientation(self, orient_fname):
        """Fill the per-site orientation score columns from an orientation file.

        The file is parsed with ``self.read_orient_file`` and intersected
        with ``self.chr_data``. For every chromosome in the intersection
        result, the "score" of each hit is written into the
        "plus_orientation" column (hits whose "orientation" is '+') or the
        "minus_orientation" column (hits whose "orientation" is '-') of
        ``self.chr_data[chrom]``, at the row positions given by the
        "intersection" column. Finally ``self.orient_data_real`` is set
        to True.

        Args:
            orient_fname: path handed unchanged to ``self.read_orient_file``.

        Returns:
            None. If ``self.chr_data`` does not exist yet, an error is
            logged and nothing is modified.
        """
        try:
            # Probe only: the attribute is missing until data has been read.
            self.chr_data
        except AttributeError:
            logging.error("Please read data first")
            return None

        orient_chr_data = self.read_orient_file(orient_fname)
        hits_by_chrom = intersect_intervals(self.chr_data, orient_chr_data)

        # (orientation value in the hits table, destination column).
        strand_columns = (
            ("+", "plus_orientation"),
            ("-", "minus_orientation"),
        )

        for chrom, hits in hits_by_chrom.items():
            # Keep exactly one hit per (site, orientation) pair.
            # NOTE(review): the sort is ascending and keep="first", so the
            # LOWEST score per pair is retained - confirm this is intended.
            hits = hits.sort_values(
                by=["orientation", "intersection", "score"]
            ).drop_duplicates(
                subset=["intersection", "orientation"],
                keep="first",
            )

            sites = self.chr_data[chrom]
            for strand, column in strand_columns:
                strand_hits = hits[hits["orientation"] == strand]
                col_pos = sites.columns.get_loc(column)
                row_positions = list(strand_hits["intersection"])
                # Positional assignment: rows come from the intersection
                # result, the column from its location in the sites table.
                sites.iloc[row_positions, col_pos] = list(strand_hits["score"])

        self.orient_data_real = True
Esempio n. 3
0
    def get_predictors(self, contacts):
        """Flag contacts whose two anchors fall on the same loop.

        Every contact starts with ``IsLoop = 0``. Per chromosome, windows of
        +/- ``self.window_size`` around "contact_st" and "contact_en" are
        intersected with the left and right loop anchors respectively; a
        contact is set to 1 when both of its windows hit anchors carrying
        the same loop id.

        Args:
            contacts: DataFrame with "chr", "contact_st" and "contact_en"
                columns. Rows need not be grouped or sorted by chromosome.

        Returns:
            DataFrame with a single "IsLoop" column, one row per contact,
            in the same positional order as ``contacts``.
        """
        result = pd.DataFrame({"IsLoop": [0] * len(contacts)})
        left_windows = {}
        right_windows = {}

        def attach_loop_ids(hits, anchors):
            # Translate each positional anchor index into that anchor's loop id.
            hits["Loop_id"] = hits.intersection.apply(
                lambda pos: anchors.id.iloc[pos])
            return hits

        for chrom in np.unique(contacts["chr"].values):
            # Indexing note: contacts may mix chromosomes in any order, while
            # intersect_intervals works on per-chromosome tables. `on_chrom`
            # is a boolean mask over all contacts; "ids_column" in the
            # intersection result is (per the original author's note) the
            # position inside the masked subset, not inside `contacts`.
            # E.g. if chr1 is the 3rd, 5th and 6th contact and the 6th one
            # hits a loop, "ids_column" is 2, not 6. np.flatnonzero(on_chrom)
            # maps such local positions back to global ones below.
            on_chrom = contacts["chr"] == chrom
            chrom_contacts = contacts[on_chrom]
            starts = chrom_contacts["contact_st"]
            ends = chrom_contacts["contact_en"]

            left_windows[chrom] = pd.DataFrame({
                "start": starts - self.window_size,
                "end": starts + self.window_size,
            })
            right_windows[chrom] = pd.DataFrame({
                "start": ends - self.window_size,
                "end": ends + self.window_size,
            })

            left_loops = self.loopsReader.getLeftLoopAncors(chrom)
            if len(left_loops[chrom]) == 0:  # nothing to match on this chr
                logging.getLogger(__name__).warning("No loops on chr " + chrom)
                continue
            right_loops = self.loopsReader.getRightLoopAncors(chrom)
            if len(right_loops[chrom]) == 0:
                continue

            left_hits = attach_loop_ids(
                intersect_intervals(left_loops, left_windows)[chrom],
                left_loops[chrom])
            right_hits = attach_loop_ids(
                intersect_intervals(right_loops, right_windows)[chrom],
                right_loops[chrom])

            # A contact qualifies only if both of its ends hit the same loop.
            both_ends = left_hits.merge(right_hits,
                                        on=["Loop_id", "ids_column"],
                                        how="inner")
            local_positions = both_ends["ids_column"].values
            global_positions = np.flatnonzero(on_chrom)[local_positions]
            result.iloc[global_positions, 0] = 1
        return result