def _coverage(scdf, ocdf, kwargs):
    """Return a copy of ``scdf`` with a ``FractionOverlaps`` column appended.

    The value for each row is whatever ``NCLS.coverage`` reports for that
    interval against ``ocdf`` divided by the interval length
    (``End - Start``); NaN results are replaced via ``np.nan_to_num``.

    Returns ``None`` when ``scdf`` is empty. When ``ocdf`` is empty every
    fraction is ``0.0``. ``kwargs`` is accepted but not used here.
    """
    if scdf.empty:
        return None

    if ocdf.empty:
        # Nothing to overlap with: every interval gets fraction 0.0.
        untouched = scdf.copy()
        untouched.insert(untouched.shape[1], "FractionOverlaps", 0.0)
        return untouched

    other_tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    self_starts = scdf.Start.values
    self_ends = scdf.End.values
    covered = other_tree.coverage(self_starts, self_ends, scdf.index.values)

    # Zero-length intervals divide by zero; nan_to_num cleans up the result.
    fractions = (covered / (self_ends - self_starts)).astype("float64")
    fractions = np.nan_to_num(fractions)

    annotated = scdf.copy()
    annotated.insert(annotated.shape[1], "FractionOverlaps", fractions)
    return annotated
def _both_dfs(scdf, ocdf, how=False):
    """Return the rows of ``scdf`` and ``ocdf`` paired up by an NCLS query.

    ``ocdf`` is re-indexed to ``0..n-1`` before the tree is built, so the ids
    returned for it are positional. ``how`` selects the query:
    falsy -> ``all_overlaps_both``, ``"containment"`` ->
    ``all_containments_both``, anything else allowed (``"first"``) ->
    ``first_overlap_both``.

    Returns a ``(scdf_rows, ocdf_rows)`` tuple of re-indexed frames.
    """
    assert how in ["containment", "first", False, None]

    query = (scdf.Start.values, scdf.End.values, scdf.index.values)

    ocdf = ocdf.reset_index(drop=True)
    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        finder = tree.all_overlaps_both
    elif how == "containment":
        finder = tree.all_containments_both
    else:
        finder = tree.first_overlap_both

    self_ids, other_ids = finder(*query)
    return scdf.reindex(self_ids), ocdf.reindex(other_ids)
def __init__(self, bedFile):
    """Build one NCLS interval index per chromosome from a BED file.

    The index allows a fast check of whether a read falls within a listed
    region. The heavy lifting is done by NCLS
    (https://github.com/biocore-ntnu/ncls), which is also used by pyranges.

    Args:
        bedFile: path to a whitespace-separated file whose first three
            columns are chromosome, start and end.

    Raises:
        ValueError: if a start or end column is not an integer.
        IndexError: if a data line has fewer than three columns.

    Intervals are grouped by chromosome name, so a chromosome whose lines are
    not adjacent in the file still ends up in a single index (previously a
    later run silently replaced the earlier one). Blank lines and lines
    starting with ``#`` are skipped.
    """
    super(BedObject, self).__init__()
    self.__ncls = {}

    # chromosome -> (list of starts, list of ends), in file order
    intervals_by_chrom = {}
    with open(bedFile) as f:
        for line in f:
            fields = line.split()
            if not fields or fields[0].startswith("#"):
                continue
            chrom_starts, chrom_ends = intervals_by_chrom.setdefault(
                fields[0], ([], []))
            chrom_starts.append(int(fields[1]))
            chrom_ends.append(int(fields[2]))

    for chrom, (chrom_starts, chrom_ends) in intervals_by_chrom.items():
        # NCLS needs int64 arrays
        starts = array(chrom_starts, dtype=int64)
        # add one to the end, to have inclusive ends
        ends = array(chrom_ends, dtype=int64) + 1
        # The third argument is the id column; it only has to be numeric, so
        # the starts are reused.
        self.__ncls[chrom] = NCLS(starts, ends, starts)
def _first_df(scdf, ocdf, how=False, invert=False, n_jobs=1, **kwargs):
    """Filter ``scdf`` by whether its intervals hit anything in ``ocdf``.

    Args:
        scdf: frame with ``Start``/``End`` columns to filter.
        ocdf: frame with ``Start``/``End`` columns used to build the NCLS.
        how: ``"containment"`` uses ``has_containments``; ``False``, ``None``
            and ``"first"`` use ``has_overlaps``. ``"first"`` used to pass the
            assertion and then fail on an unbound variable; for a pure row
            filter it is treated like a plain overlap query.
        invert: when true, return the rows whose index is NOT among the hits.
        n_jobs: when greater than 1, ``scdf`` is deep-copied before use.
        **kwargs: ignored.

    Returns:
        ``scdf`` re-indexed by the ids the NCLS query returned, or, with
        ``invert``, the rows of ``scdf`` not among those ids.
    """
    assert how in "containment first".split() + [False, None]

    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    if n_jobs > 1:
        scdf = scdf.copy(deep=True)

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if how == "containment":
        _indexes = it.has_containments(starts, ends, indexes)
    else:
        _indexes = it.has_overlaps(starts, ends, indexes)

    if invert:
        return scdf.loc[~scdf.index.isin(_indexes)]
    return scdf.reindex(_indexes)
def _overlap(scdf, ocdf, kwargs):
    """Select the rows of ``scdf`` reported by an NCLS query against ``ocdf``.

    ``kwargs`` must contain ``"invert"`` and, when neither frame is empty,
    ``"how"``; ``"return_indexes"`` is optional (default ``False``).

    ``how`` selects the query: falsy -> ``all_overlaps_self``,
    ``"containment"`` -> ``has_containment``, otherwise ``has_overlaps``.
    With ``invert`` the complement of the returned ids (relative to
    ``scdf.index``) is used instead.

    Returns ``None`` if either frame is empty, the ids themselves when
    ``return_indexes`` is true, and ``scdf`` re-indexed by the ids otherwise.
    """
    want_complement = kwargs["invert"]
    ids_only = kwargs.get("return_indexes", False)

    if scdf.empty or ocdf.empty:
        return None

    mode = kwargs["how"]
    assert mode in ["containment", "first", False, None]

    query = (scdf.Start.values, scdf.End.values, scdf.index.values)
    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not mode:
        hits = tree.all_overlaps_self(*query)
    elif mode == "containment":
        hits = tree.has_containment(*query)
    else:
        hits = tree.has_overlaps(*query)

    if want_complement:
        hits = scdf.index.difference(hits)

    return hits if ids_only else scdf.reindex(hits)
def test_all_containments_both():
    """Querying a tree with its own two intervals must yield ids 0 and 1 on both sides."""
    interval_starts = np.array([5, 10], dtype=np.int64)
    interval_ends = np.array([6, 50], dtype=np.int64)
    interval_ids = np.array([0, 1], dtype=np.int64)

    tree = NCLS(interval_starts, interval_ends, interval_ids)
    contained, containing = tree.all_containments_both(
        interval_starts, interval_ends, interval_ids)

    print(tree.intervals())

    expected = [0, 1]
    assert list(contained) == expected
    assert expected == list(containing)
def test_all_containments_both():
    """Same self-containment check, with coordinates up to ~3.0e9.

    The second interval lies above the signed 32-bit range, and the arrays
    are created with the platform default ``int`` dtype.
    """
    interval_starts = np.array([1291845632, 3002335232], dtype=int)
    interval_ends = np.array([1292894207, 3002597375], dtype=int)
    interval_ids = np.array([0, 1], dtype=int)

    tree = NCLS(interval_starts, interval_ends, interval_ids)
    contained, containing = tree.all_containments_both(
        interval_starts, interval_ends, interval_ids)

    print(tree.intervals())

    expected = [0, 1]
    assert list(contained) == expected
    assert expected == list(containing)
def __init__(self, starts=None, ends=None, indices=None, reduce=False):
    """Wrap an NCLS built from parallel ``starts``/``ends``/``indices``.

    ``self.ncls`` stays ``None`` unless both ``starts`` and ``indices`` are
    given. A missing ``ends`` defaults to ``start + 1`` for every start.
    With ``reduce`` the triples are first passed through ``merge_overlaps``.
    """
    self.ncls = None
    if starts is None or indices is None:
        return

    if ends is None:
        ends = [begin + 1 for begin in starts]

    if reduce:
        merged = merge_overlaps(zip(starts, ends, indices))
        starts, ends, indices = zip(*merged)

    start_arr = np.array(starts, dtype='i8')
    end_arr = np.array(ends, dtype='i8')
    index_arr = np.array(indices, dtype='i8')
    self.ncls = NCLS(start_arr, end_arr, index_arr)
def __init__(self, node_type, node_id, nodes_list, is_robot=False):
    """Group several nodes under one id and index their timestep ranges.

    Every member's ``is_robot`` flag is overwritten with ``is_robot``. The
    overall first/last timestep is the min/max over the members, and an NCLS
    is built over each member's ``[first_timestep, last_timestep]`` pair with
    the member's position in ``nodes_list`` as id.
    """
    super(MultiNode, self).__init__(node_type, node_id, data=None, is_robot=is_robot)
    self.nodes_list = nodes_list

    for member in self.nodes_list:
        member.is_robot = is_robot

    first_steps = [member.first_timestep for member in self.nodes_list]
    last_steps = [member.last_timestep for member in self.nodes_list]
    self.first_timestep = min(first_steps)
    self._last_timestep = max(last_steps)

    self.interval_tree = NCLS(
        np.array(first_steps, dtype=np.int64),
        np.array(last_steps, dtype=np.int64),
        np.arange(len(self.nodes_list), dtype=np.int64),
    )
def _intersection(scdf, ocdf, kwargs):
    """Clip the intervals of ``scdf`` to the parts shared with ``ocdf``.

    ``kwargs["how"]`` selects the NCLS query (falsy -> all overlaps,
    ``"containment"``, ``"first"``). For each returned pair the new start is
    the larger of the two starts and the new end the smaller of the two ends.

    Returns the re-indexed, clipped ``scdf`` rows, or ``None`` when either
    input is empty or nothing is returned by the query.
    """
    mode = kwargs["how"]

    if ocdf.empty or scdf.empty:
        return None

    assert mode in ["containment", "first", False, None]

    query = (scdf.Start.values, scdf.End.values, scdf.index.values)
    coord_dtype = ocdf.Start.dtype

    other_tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not mode:
        self_ids, other_ids = other_tree.all_overlaps_both(*query)
    elif mode == "containment":
        self_ids, other_ids = other_tree.all_containments_both(*query)
    elif mode == "first":
        self_ids, other_ids = other_tree.first_overlap_both(*query)

    # Line the two frames up pair by pair.
    scdf = scdf.reindex(self_ids)
    ocdf = ocdf.reindex(other_ids)

    self_starts_later = scdf.Start.values > ocdf.Start.values
    self_ends_earlier = scdf.End.values < ocdf.End.values
    clipped_starts = pd.Series(
        np.where(self_starts_later, scdf.Start, ocdf.Start),
        index=scdf.index,
        dtype=coord_dtype)
    clipped_ends = pd.Series(
        np.where(self_ends_earlier, scdf.End, ocdf.End),
        index=scdf.index,
        dtype=coord_dtype)

    # Silence pandas' chained-assignment warning for the two writes below.
    pd.options.mode.chained_assignment = None  # default='warn'
    scdf.loc[:, "Start"] = clipped_starts
    scdf.loc[:, "End"] = clipped_ends
    pd.options.mode.chained_assignment = 'warn'

    if scdf.empty:
        return None
    return scdf
def test_ncls():
    """Check lookups near the signed 32-bit boundary.

    ``starts``, ``ends`` and ``ids`` are not defined here; they are read from
    the enclosing module scope.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    assert list(tree.find_overlap(0, 2)) == []

    expected_hits = [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]
    assert list(tree.find_overlap(0, 2_147_483_647)) == expected_hits

    query_ids, tree_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(query_ids) == [0, 3]
    assert list(tree_ids) == [0, 3]
def _number_overlapping(scdf, ocdf, kwargs):
    """Return ``scdf`` with a ``NumberOverlaps`` column added.

    The count for a row is how many times its index appears among the self
    ids returned by ``NCLS.all_overlaps_both`` against ``ocdf``.

    ``kwargs["keep_nonoverlapping"]`` (default ``True``) controls whether
    rows without any hit are kept (with count 0) or dropped.

    Returns ``None`` when ``scdf`` is empty, or when ``ocdf`` is empty and
    non-overlapping rows are not kept.
    """
    keep_misses = kwargs.get("keep_nonoverlapping", True)

    if scdf.empty:
        return None

    if ocdf.empty:
        if not keep_misses:
            return None
        zeroed = scdf.copy()
        zeroed.insert(zeroed.shape[1], "NumberOverlaps", 0)
        return zeroed

    other_tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)
    hit_self, _hit_other = other_tree.all_overlaps_both(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    # One row per distinct self id, in order of first appearance.
    hits = pd.Series(hit_self)
    counts = hits.value_counts()[hits.unique()].reset_index()
    counts.columns = ["Index", "Count"]

    if keep_misses:
        annotated = scdf.copy()
        never_hit = np.setdiff1d(scdf.index, hit_self)
        zero_rows = pd.DataFrame(
            data={"Index": never_hit, "Count": 0}, index=never_hit)
        counts = pd.concat([counts, zero_rows])
    else:
        annotated = scdf.copy().loc[hit_self]

    counts = counts.set_index("Index")
    annotated.insert(annotated.shape[1], "NumberOverlaps", counts)
    return annotated
class NestedContainmentList(object):
    """Thin wrapper around an NCLS that tolerates being empty.

    An instance whose ``ncls`` attribute is ``None`` represents "no
    intervals"; every query on it returns an empty list.
    """

    def __init__(self, starts=None, ends=None, indices=None, reduce=False):
        """Build the underlying NCLS from parallel sequences.

        Args:
            starts: interval starts, or ``None`` for an empty object.
            ends: interval ends; defaults to ``start + 1`` for every start.
            indices: integer ids, or ``None`` for an empty object.
            reduce: when true, pass the triples through ``merge_overlaps``
                before building the index.
        """
        self.ncls = None
        if starts is not None and indices is not None:
            if ends is None:
                ends = [s + 1 for s in starts]
            if reduce:
                starts, ends, indices = list(
                    zip(*merge_overlaps(zip(starts, ends, indices))))
            starts = np.array(starts, dtype='i8')
            ends = np.array(ends, dtype='i8')
            indices = np.array(indices, dtype='i8')
            self.ncls = NCLS(starts, ends, indices)

    def find_overlaps(self, start, end):
        """Return an ``Interval`` for every hit of ``[start, end)`` in the index.

        Returns an empty list for an empty object.
        """
        if self.ncls is None:
            # we allow for empty objects, in which case nothing overlaps
            # use case: non-matching seqids
            return []
        return [Interval(*hit) for hit in self.ncls.find_overlap(start, end)]

    @staticmethod
    def from_intervals(intervals, reduce=False):
        """Build an instance from an iterable of ``(start, end, index)`` triples.

        An empty iterable yields an empty object instead of raising
        ``ValueError`` while unpacking ``zip(*intervals)``.
        """
        intervals = list(intervals)
        if not intervals:
            return NestedContainmentList()
        starts, ends, indices = zip(*intervals)
        starts = np.array(starts, dtype='i8')
        ends = np.array(ends, dtype='i8')
        indices = np.array(indices, dtype='i8')
        return NestedContainmentList(starts, ends, indices, reduce=reduce)
def ncls_overlap(self, decimal_places = 5, start_idx = 0, end_idx = None):
    """Find overlapping retention-time windows with an NCLS.

    Args:
        decimal_places: how many decimal places of the retention-time axis
            to keep; times are multiplied by ``10**decimal_places`` and cast
            to int because NCLS works on integers.
        start_idx: first position of the slice of windows used as queries.
        end_idx: one past the last query position; ``None`` means "to the
            end" (``len(self.start)``).

    Returns:
        A 2-column array made by column-stacking the two id arrays returned
        by ``NCLS.all_overlaps_both``; one row per reported pair.
    """
    # if end_idx is none set as the end of the list
    # (this used to be written with ``==``, which compared and discarded)
    if end_idx is None:
        end_idx = len(self.start)

    # convert the retention times to integers so that it is compatible with ncls
    scale = 10**decimal_places
    int_start = (self.start * scale).astype(int)
    int_end = (self.end * scale).astype(int)

    # the index holds every window; only the requested slice is queried
    ncls = NCLS(int_start, int_end, self.idx)

    # column stack puts the two id lists side by side (easier iteration for
    # np.vectorize)
    return np.column_stack(
        ncls.all_overlaps_both(int_start[start_idx:end_idx],
                               int_end[start_idx:end_idx],
                               self.idx[start_idx:end_idx]))
def test_ncls():
    """Check lookups near the signed 64-bit boundary.

    ``starts``, ``ends`` and ``ids`` are not defined here; they are read from
    the enclosing module scope.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    assert list(tree.find_overlap(0, 2)) == []

    top_hits = list(
        tree.find_overlap(9_223_372_036_854_775_805,
                          9_223_372_036_854_775_806))
    print("aaa", top_hits)

    expected_hits = [(5, 6, 2147483647),
                     (9223372036854775805, 9223372036854775807, 3)]
    assert list(tree.find_overlap(0, 9_223_372_036_854_775_806)) == expected_hits

    query_ids, tree_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(query_ids) == [2147483647, 3]
    assert list(tree_ids) == [2147483647, 3]
def __init__(self, predicate, records):
    """Index ``records`` by genomic position, one NCLS per chromosome.

    Args:
        predicate: callable mapping a record to an object with ``chrom``,
            ``start`` and ``end`` attributes, or to ``None`` to skip it.
        records: iterable of records to index.

    Attributes set:
        predicate: the callable passed in.
        records: the records that were not skipped, in input order.
        tree_map: dict of chromosome -> NCLS over that chromosome's records.

    The id stored for each interval is the record's position in
    ``self.records``. Previously the position in the unfiltered input was
    stored, which no longer matched ``self.records`` after the first skipped
    record. NOTE(review): this assumes consumers resolve ids through
    ``self.records`` - confirm against the lookup code.
    """
    self.predicate = predicate
    self.records = []

    # chrom -> (starts, ends, ids)
    working_tree_map = {}
    for record in records:
        genome_pos = predicate(record)
        if genome_pos is None:
            continue

        starts, ends, ids = working_tree_map.setdefault(
            genome_pos.chrom, ([], [], []))
        starts.append(genome_pos.start)
        ends.append(genome_pos.end)
        ids.append(len(self.records))
        self.records.append(record)

    # np.int64 replaces the deprecated np.long alias, whose width was
    # platform-dependent.
    self.tree_map = {
        chrom: NCLS(np.array(starts, dtype=np.int64),
                    np.array(ends, dtype=np.int64),
                    np.array(ids, dtype=np.int64))
        for chrom, (starts, ends, ids) in working_tree_map.items()
    }
def gmap_parse_ncls(gmapFile, cutoff):
    """Index the ``cDNA_match`` lines of a GMAP gff3 file in an NCLS.

    Lines starting with ``#``, lines whose third column is not
    ``cDNA_match`` and lines whose sixth column (identity) is below
    ``cutoff`` are ignored.

    Returns ``(ncls, gmapLoc)`` where ``gmapLoc`` maps each interval id (a
    running count of kept lines) to its contig ID. Coordinates are not unique
    across contigs, so callers are expected to filter hits by contig ID.
    """
    contig_by_id = {}
    range_starts = []
    range_ends = []
    range_ids = []

    with open(gmapFile, 'r') as handle:
        for raw_line in handle:
            # Skip unnecessary lines
            if raw_line.startswith('#'):
                continue
            columns = raw_line.split('\t')
            # Other feature types are not expected in a GMAP gff3 file
            # produced with PASA's settings; skipping them is a safeguard.
            if columns[2] != 'cDNA_match':
                continue

            contig_id = columns[0]
            match_start = int(columns[3])
            match_stop = int(columns[4])
            # Only hold onto hits that will pass the cutoff check.
            if float(columns[5]) < cutoff:
                continue

            next_id = len(range_ids)
            range_starts.append(match_start)
            # +1 so the gff3 stop coordinate falls inside the indexed range.
            range_ends.append(match_stop + 1)
            range_ids.append(next_id)
            contig_by_id[next_id] = contig_id

    # Build the NCLS object
    ncls = NCLS(pd.Series(range_starts).values,
                pd.Series(range_ends).values,
                pd.Series(range_ids).values)
    return ncls, contig_by_id
def main(argv):
    """Count, for every interval of a streamed BED file, the hits in a loaded one.

    Args:
        argv: ``[program, loaded.bed, streamed.bed]``. Both files are
            tab-separated with chromosome, start and end in the first three
            columns.

    For each streamed line, ``chrom<TAB>start<TAB>end<TAB>count`` is printed
    to stdout, where ``count`` is the number of results ``find_overlap``
    yields in the loaded file's index for that chromosome (0 if the
    chromosome is absent). Timings go to stderr. Exits with status 1 when
    fewer than two file arguments are given.
    """
    if len(argv) < 3:
        print("Usage: bedcov.py <loaded.bed> <streamed.bed>")
        sys.exit(1)

    # chrom -> (starts, ends, ids); ids are line numbers in the loaded file
    loaded = {}
    start = timer()
    with open(argv[1]) as fp:
        for i, line in enumerate(fp):
            # rstrip instead of line[:-1]: a last line without a trailing
            # newline must not lose its final character.
            t = line.rstrip("\n").split("\t")
            starts, ends, ids = loaded.setdefault(t[0], ([], [], []))
            starts.append(int(t[1]))
            ends.append(int(t[2]))
            ids.append(i)
    sys.stderr.write("Read in {} sec\n".format(timer() - start))

    start = timer()
    # np.int64 replaces the deprecated np.long alias.
    bed = {
        ctg: NCLS(np.array(starts, dtype=np.int64),
                  np.array(ends, dtype=np.int64),
                  np.array(ids, dtype=np.int64))
        for ctg, (starts, ends, ids) in loaded.items()
    }
    sys.stderr.write("Index in {} sec\n".format(timer() - start))

    start = timer()
    with open(argv[2]) as fp:
        for line in fp:
            t = line.rstrip("\n").split("\t")
            if t[0] not in bed:
                print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
            else:
                # int() replaces long(), which does not exist on Python 3.
                hits = bed[t[0]].find_overlap(int(t[1]), int(t[2]))
                cnt = sum(1 for _ in hits)
                print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], cnt))
    sys.stderr.write("Query in {} sec\n".format(timer() - start))
def buildNCLSindex(sites):
    """Build an NCLS with one unit-length interval ``[site, site + 1)`` per site.

    The id of each interval is the site's position in ``sites``.
    """
    site_starts = array(sites, dtype=int64)
    site_ends = array(site_starts + 1, dtype=int64)
    positions = arange(len(site_starts))
    return NCLS(site_starts, site_ends, positions)
def overlap(self, im=True, decimal_places=5):
    """Find pairs of entries that overlap in retention time and in m/z (and IM).

    Retention times are scaled by ``10**decimal_places`` and cast to int so
    they can be indexed with NCLS. Pairs reported by the retention-time query
    (excluding an entry paired with itself) are then filtered by
    ``__vecMzImOverlap`` when ``im`` is true, or by
    ``mzTable.vec_idx_overlap`` otherwise.

    If ``retentionTable.idx`` is not simply ``0..n-1``, positional ids are
    used internally and the result is mapped back through
    ``Precursor.unindex`` before returning.
    """
    table = self.retentionTable

    # Use positional ids whenever the table's own idx is not 0..n-1.
    positional_idx = np.arange(0, len(table.start))
    use_hidden = not np.all(table.idx == positional_idx)
    hidden_idx = positional_idx if use_hidden else table.idx

    # convert the retention times to integers so that it is compatible with ncls
    scale = 10**decimal_places
    ret_int_start = (table.start * scale).astype(int)
    ret_int_end = (table.end * scale).astype(int)

    tree = NCLS(ret_int_start, ret_int_end, hidden_idx)

    # 2-column array: one row per reported pair of ids
    pairs = np.column_stack(
        tree.all_overlaps_both(ret_int_start, ret_int_end, hidden_idx))

    # an entry always overlaps itself; those pairs are not of interest
    pairs = pairs[pairs[:, 0] != pairs[:, 1]]

    if pairs.size > 0:
        # only check the other dimensions if retention times overlap at all
        if im:
            keep = self.__vecMzImOverlap(pairs[:, 0], pairs[:, 1])
        else:
            keep = self.mzTable.vec_idx_overlap(pairs[:, 0], pairs[:, 1])
        rslt = pairs[keep]
    else:
        rslt = np.array([])

    if use_hidden:
        # translate positional ids back to the table's own idx
        return Precursor.unindex(rslt, self.retentionTable.idx)
    return rslt
def as_ncls_dict(self) -> Dict[Chrom, NCLS]:
    """Return one NCLS per distinct value of the ``chrom`` column.

    Each tree is built from that group's ``start`` and ``end`` columns and
    its index, all cast to ``int64``.
    """
    return {
        chrom: NCLS(
            group.start.values.astype(np.int64),
            group.end.values.astype(np.int64),
            group.index.values.astype(np.int64),
        )
        for chrom, group in self._obj.groupby("chrom")
    }
def _subtraction(scdf, ocdf, **kwargs):
    """Rewrite the intervals of ``scdf`` using ``NCLS.set_difference_helper``.

    Returns ``scdf`` unchanged when either frame is empty. Otherwise
    ``kwargs["strandedness"]`` is required. Rows the helper does not report
    are kept as they are; reported rows whose new start is ``-1`` are
    dropped; the remaining reported rows get the new start/end values.

    Returns the updated frame sorted by index, or ``None`` if it is empty.
    """
    if ocdf.empty or scdf.empty:
        return scdf

    strand = bool(kwargs["strandedness"])

    # These entries only update the local kwargs dict.
    kwargs["chromosome"] = scdf.Chromosome.head(1).iloc[0]
    if "Strand" in ocdf and strand:
        strand = scdf.Strand.head(1).iloc[0]
        kwargs["strand"] = strand

    other_tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)
    hit_ids, cut_starts, cut_ends = other_tree.set_difference_helper(
        scdf.Start.values,
        scdf.End.values,
        scdf.index.values,
        scdf.__num__.values)

    untouched_ids = pd.Index(scdf.index).difference(hit_ids)

    # A new start of -1 marks an entry to discard.
    surviving = cut_starts != -1
    cut_starts = cut_starts[surviving]
    cut_ends = cut_ends[surviving]
    hit_ids = hit_ids[surviving]

    cut_starts = pd.Series(cut_starts, index=hit_ids).sort_index()
    cut_ends = pd.Series(cut_ends, index=hit_ids).sort_index()

    scdf = scdf.reindex(untouched_ids.union(hit_ids)).sort_index()

    if len(hit_ids):
        changed = scdf.index.isin(hit_ids)
        scdf.loc[changed, "Start"] = cut_starts.values
        scdf.loc[changed, "End"] = cut_ends.values

    if scdf.empty:
        return None
    return scdf
def filter_by_human_annotations(self, article, annotations):
    """Drop every annotation that overlaps one of the article's own annotations.

    An NCLS is built from ``get_intervals(article['annotations'])``; an
    annotation is kept only if its ``(start, end)`` span has no hit in it.

    Returns ``(kept_annotations, number_dropped)``.
    """
    human_tree = NCLS(*get_intervals(article['annotations']))

    kept = []
    num_filtered = 0
    for candidate in annotations:
        entity_start, entity_end = get_start_end(candidate)
        hits = list(human_tree.find_overlap(entity_start, entity_end))
        if not hits:
            kept.append(candidate)
            continue

        # Sanity check: the first hit's id must point at an article
        # annotation that really intersects the candidate.
        human_annotation = article['annotations'][hits[0][2]]
        human_start, human_end = get_start_end(human_annotation)
        assert intersect(human_start, human_end, entity_start, entity_end)
        num_filtered += 1

    assert len(kept) + num_filtered == len(annotations)
    return kept, num_filtered
def test_ncls():
    """``all_overlaps_both_stack`` and ``all_overlaps_both`` must agree.

    The tree holds 1e6 intervals ``[i, i + 100)`` and is queried with the
    same intervals.
    """
    import numpy as np  # local: only needed for the array comparison below

    starts = pd.Series(range(0, int(1e6)))
    ends = starts + 100
    ids = starts
    # ``indexes`` was referenced but never defined (its assignment was
    # commented out); the query reuses the ids the tree was built with.
    indexes = ids

    print(starts, ends, ids)

    ncls = NCLS(starts.values, ends.values, ids.values)

    print(starts, ends, indexes)

    it = ncls.all_overlaps_both_stack(starts.values, ends.values,
                                      indexes.values)
    it2 = ncls.all_overlaps_both(starts.values, ends.values, indexes.values)

    print(it)
    print(it2)

    # NOTE(review): assumes both calls return same-length sequences of arrays
    # (e.g. a pair of id arrays) - confirm. Comparing such containers with a
    # bare ``==`` is ambiguous for arrays, so compare element by element.
    assert len(it) == len(it2)
    assert all(np.array_equal(a, b) for a, b in zip(it, it2))
def _both_indexes(scdf, ocdf, how=False):
    """Return the paired ``(self_ids, other_ids)`` from an NCLS query.

    ``how`` selects the query: falsy -> ``all_overlaps_both``,
    ``"containment"`` -> ``all_containments_both``, anything else that passes
    the check (``"first"`` or an int) -> ``first_overlap_both``.
    """
    is_known_mode = how in ["containment", "first", False, None]
    assert is_known_mode or isinstance(how, int)

    query = (scdf.Start.values, scdf.End.values, scdf.index.values)
    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        finder = tree.all_overlaps_both
    elif how == "containment":
        finder = tree.all_containments_both
    else:
        finder = tree.first_overlap_both

    self_ids, other_ids = finder(*query)
    return self_ids, other_ids
def overlap_against_window(self, exp, im=True, decimal_places=5):
    """Pair this object's entries with the MS2 windows of ``exp`` they overlap.

    Retention/time ranges are scaled by ``10**decimal_places`` and cast to
    int so they can be indexed with NCLS. The tree is built from this
    object's retention table and queried with ``exp.ms2.timeTable``; the
    resulting 2-column id array is flipped left-right before filtering.

    With ``im`` true the pairs are filtered by ``vec_mz_im_overlap``,
    otherwise by ``mzTable.vec_idx_overlap``.
    """
    print("starting overlap against window")
    ms2 = exp.ms2
    scale = 10**decimal_places

    # convert the retention times to integers so that it is compatible with ncls
    ret_int_start = (self.retentionTable.start * scale).astype(int)
    ret_int_end = (self.retentionTable.end * scale).astype(int)
    window_start = (ms2.timeTable.start * scale).astype(int)
    window_end = (ms2.timeTable.end * scale).astype(int)

    print(ret_int_start)
    print(ret_int_end)

    tree = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)

    # one row per reported pair; columns swapped so that the first column
    # presumably holds this object's ids - TODO confirm
    pairs = np.fliplr(
        np.column_stack(
            tree.all_overlaps_both(window_start, window_end,
                                   ms2.timeTable.idx)))

    first_ids, second_ids = pairs[:, 0], pairs[:, 1]
    if im:
        # overlap has to hold in both the mz and im dimensions
        keep = self.vec_mz_im_overlap(first_ids, second_ids, idx2_data=ms2)
    else:
        keep = self.mzTable.vec_idx_overlap(first_ids, second_ids,
                                            yData=ms2.mzTable)
    return pairs[keep]
def create_ncls(seed):
    """Build an NCLS over 1e7 random length-100 intervals, seeded by ``seed``.

    Starts are drawn with ``randint(0, 1e8)``; each start doubles as the
    interval's id.
    """
    np.random.seed(seed)

    interval_count = int(1e7)
    upper_bound = int(1e8)
    interval_starts = randint(0, upper_bound, interval_count)
    interval_ends = interval_starts + 100

    tree = NCLS(interval_starts, interval_ends, interval_starts)
    print("returning")
    return tree
def _both_dfs(scdf, ocdf, how=False, **kwargs):
    """Return the rows of ``scdf`` and ``ocdf`` paired up by an NCLS query.

    ``how`` selects the query: falsy -> ``all_overlaps_both``,
    ``"containment"`` -> ``all_containments_both``, otherwise (``"first"``)
    -> ``first_overlap_both``. Extra keyword arguments are ignored.

    Returns ``(scdf.loc[self_ids], ocdf.loc[other_ids])``.
    """
    assert how in ["containment", "first", False, None]

    query = (scdf.Start.values, scdf.End.values, scdf.index.values)
    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        finder = tree.all_overlaps_both
    elif how == "containment":
        finder = tree.all_containments_both
    else:
        finder = tree.first_overlap_both

    self_ids, other_ids = finder(*query)
    return scdf.loc[self_ids], ocdf.loc[other_ids]
def create_eij_ncls_dict(standard_event_dict):
    """Index the exon-intron junctions of all events by chromosome and strand.

    Args:
        standard_event_dict: mapping of event name -> dict with the keys
            ``"strand"``, ``"chrom"``, ``"included_ei_junctions"`` and
            ``"excluded_ei_junctions"``.

    Returns:
        A 3-tuple:
        * ``ncls_by_chrom_strand[chrom][strand]`` - NCLS over the junction
          positions (shifted by -1 to make them 0-based), with start, end
          and id all equal to the shifted position;
        * ``eij_indexed_event_dict`` - ``"chrom_eij_strand"`` -> set of
          ``event + "_included"`` / ``event + "_excluded"`` labels;
        * ``eij_only_count_dict`` - the same keys, each initialised to 0.

    ``dict.items()`` is used instead of ``iteritems()`` so the function runs
    on Python 3 as well as Python 2.
    """
    eij_by_chrom_strand = {}
    eij_indexed_event_dict = {}
    eij_only_count_dict = {}

    junction_kinds = (("included_ei_junctions", "_included"),
                      ("excluded_ei_junctions", "_excluded"))

    for event, event_val in standard_event_dict.items():
        strand = event_val["strand"]
        chrom = event_val["chrom"]
        for junction_key, label_suffix in junction_kinds:
            for eij in event_val[junction_key]:
                eij_by_chrom_strand.setdefault(chrom, {}).setdefault(
                    strand, set()).add(int(eij))
                eij_index = chrom + "_" + str(eij) + "_" + strand
                eij_indexed_event_dict.setdefault(
                    eij_index, set()).add(event + label_suffix)
                eij_only_count_dict.setdefault(eij_index, 0)

    ncls_by_chrom_strand = {}
    for chrom, chrom_dict in eij_by_chrom_strand.items():
        ncls_by_chrom_strand[chrom] = {}
        for strand, positions in chrom_dict.items():
            # explicit int64 so the dtype does not depend on the platform
            starts = np.array(list(positions), dtype=np.int64) - 1  # to make 0-based
            ends = starts
            ids = starts
            ncls_by_chrom_strand[chrom][strand] = NCLS(starts, ends, ids)

    return (ncls_by_chrom_strand, eij_indexed_event_dict,
            eij_only_count_dict)
def _subtraction(scdf, ocdf, **kwargs):
    """Rewrite the intervals of ``scdf`` using ``NCLS.set_difference_helper``.

    ``kwargs["key"]`` is always required (it is parsed for the chromosome);
    ``kwargs["strandedness"]`` is required when neither frame is empty.
    ``ocdf`` is passed through ``_cluster`` before the tree is built.

    Returns ``scdf`` unchanged when either frame is empty. Otherwise rows the
    helper does not report are kept as they are, reported rows whose new
    start is ``-1`` are dropped, and the remaining reported rows get the new
    start/end values.
    """
    chromosome, strand = parse_grpby_key(kwargs["key"])

    if ocdf.empty or scdf.empty:
        return scdf

    # The strand parsed from the key is replaced by a plain boolean flag.
    strand = bool(kwargs["strandedness"])

    clustered = _cluster(ocdf, chromosome, strand)
    other_tree = NCLS(clustered.Start.values, clustered.End.values,
                      clustered.index.values)

    hit_ids, cut_starts, cut_ends = other_tree.set_difference_helper(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    untouched_ids = pd.Index(scdf.index).difference(hit_ids)

    # A new start of -1 marks an entry to discard.
    surviving = cut_starts != -1
    cut_starts = cut_starts[surviving]
    cut_ends = cut_ends[surviving]
    hit_ids = hit_ids[surviving]

    cut_starts = pd.Series(cut_starts, index=hit_ids).sort_index()
    cut_ends = pd.Series(cut_ends, index=hit_ids).sort_index()
    hit_ids = np.sort(hit_ids)

    scdf = scdf.reindex(untouched_ids.union(hit_ids))

    if len(hit_ids):
        changed = scdf.index.isin(hit_ids)
        scdf.loc[changed, "Start"] = cut_starts
        scdf.loc[changed, "End"] = cut_ends

    return scdf