def _both_indexes(scdf, ocdf, how=False): assert (how in "containment first last outer right left".split() + [False, None]) or isinstance(how, int) starts = scdf.Start.values ends = scdf.End.values indexes = scdf.index.values it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values) if not how: _self_indexes, _other_indexes = it.all_overlaps_both( starts, ends, indexes) elif how == "containment": _self_indexes, _other_indexes = it.all_containments_both( starts, ends, indexes) elif how == "first": _self_indexes, _other_indexes = it.first_overlap_both( starts, ends, indexes) elif how == "last": _self_indexes, _other_indexes = it.last_overlap_both( starts, ends, indexes) six = scdf.index oix = ocdf.index elif how in ["outer", "left", "right"]: _self_indexes, _other_indexes = it.all_overlaps_both( starts, ends, indexes) missing_in_s = scdf.index.difference(_self_indexes) missing_in_o = ocdf.index.difference(_other_indexes) filler_s = np.ones(len(missing_in_o), dtype=int) * -1 filler_o = np.ones(len(missing_in_s), dtype=int) * -1 if how == "outer": _self_indexes = np.concatenate( [_self_indexes, missing_in_s, filler_s]) _other_indexes = np.concatenate( [_other_indexes, filler_o, missing_in_o]) elif how == "left": _self_indexes = np.concatenate([_self_indexes, missing_in_s]) _other_indexes = np.concatenate([_other_indexes, filler_o]) elif how == "right": _self_indexes = np.concatenate([_self_indexes, filler_s]) _other_indexes = np.concatenate([_other_indexes, missing_in_o]) return _self_indexes, _other_indexes
def _both_dfs(scdf, ocdf, how=False): assert how in "containment first".split() + [False, None] starts = scdf.Start.values ends = scdf.End.values indexes = scdf.index.values ocdf = ocdf.reset_index(drop=True) it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values) if not how: _self_indexes, _other_indexes = it.all_overlaps_both( starts, ends, indexes) elif how == "containment": _self_indexes, _other_indexes = it.all_containments_both( starts, ends, indexes) else: _self_indexes, _other_indexes = it.first_overlap_both( starts, ends, indexes) _self_indexes = _self_indexes _other_indexes = _other_indexes scdf = scdf.reindex(_self_indexes) ocdf = ocdf.reindex(_other_indexes) return scdf, ocdf
def _intersection(scdf, ocdf, kwargs): how = kwargs["how"] if ocdf.empty or scdf.empty: return None assert how in "containment first last".split() + [False, None] starts = scdf.Start.values ends = scdf.End.values indexes = scdf.index.values in_dtype = ocdf.Start.dtype oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values) if not how or how is None: _self_indexes, _other_indexes = oncls.all_overlaps_both( starts, ends, indexes) elif how == "containment": _self_indexes, _other_indexes = oncls.all_containments_both( starts, ends, indexes) elif how == "first": _self_indexes, _other_indexes = oncls.first_overlap_both( starts, ends, indexes) elif how == "last": _self_indexes, _other_indexes = oncls.last_overlap_both( starts, ends, indexes) _self_indexes = _self_indexes _other_indexes = _other_indexes scdf, ocdf = scdf.reindex(_self_indexes), ocdf.reindex(_other_indexes) new_starts = pd.Series( np.where(scdf.Start.values > ocdf.Start.values, scdf.Start, ocdf.Start), index=scdf.index, dtype=in_dtype) new_ends = pd.Series( np.where(scdf.End.values < ocdf.End.values, scdf.End, ocdf.End), index=scdf.index, dtype=in_dtype) pd.options.mode.chained_assignment = None # default='warn' scdf.loc[:, "Start"] = new_starts scdf.loc[:, "End"] = new_ends pd.options.mode.chained_assignment = 'warn' if not scdf.empty: return scdf else: return None
def test_ncls():
    """Check NCLS lookups near the top of the signed 32-bit range.

    Reads ``starts``, ``ends`` and ``ids`` from the enclosing scope; they are
    not defined inside this function.
    """
    # ids = starts
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    # Nothing stored in [0, 2).
    assert list(tree.find_overlap(0, 2)) == []

    expected_hits = [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]
    assert list(tree.find_overlap(0, 2_147_483_647)) == expected_hits

    # Querying the tree with its own intervals: both id sequences agree.
    left_ids, right_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(left_ids) == [0, 3]
    assert list(right_ids) == [0, 3]
def _number_overlapping(scdf, ocdf, kwargs): keep_nonoverlapping = kwargs.get("keep_nonoverlapping", True) if scdf.empty: return None if ocdf.empty: if keep_nonoverlapping: df = scdf.copy() # print(df) df.insert(df.shape[1], "NumberOverlaps", 0) # print("df" * 100) # print(df) return df else: return None oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values) starts = scdf.Start.values ends = scdf.End.values indexes = scdf.index.values _self_indexes, _other_indexes = oncls.all_overlaps_both( starts, ends, indexes) s = pd.Series(_self_indexes) counts_per_read = s.value_counts()[s.unique()].reset_index() counts_per_read.columns = ["Index", "Count"] df = scdf.copy() if keep_nonoverlapping: _missing_indexes = np.setdiff1d(scdf.index, _self_indexes) missing = pd.DataFrame(data={ "Index": _missing_indexes, "Count": 0 }, index=_missing_indexes) counts_per_read = pd.concat([counts_per_read, missing]) else: df = df.loc[_self_indexes] counts_per_read = counts_per_read.set_index("Index") df.insert(df.shape[1], "NumberOverlaps", counts_per_read) return df
def ncls_overlap(self, decimal_places=5, start_idx=0, end_idx=None):
    """Find overlaps between a slice of this table's intervals and all of them.

    Parameters
    ----------
    decimal_places : int
        ``self.start`` / ``self.end`` are multiplied by ``10**decimal_places``
        and cast with ``astype(int)`` before being handed to NCLS, so this is
        the number of decimals that survive the conversion.
    start_idx, end_idx : int, optional
        Bounds of the slice of intervals used as queries.  ``end_idx=None``
        means "up to the end of ``self.start``".

    Returns
    -------
    numpy.ndarray
        ``np.column_stack`` of the two id sequences returned by
        ``NCLS.all_overlaps_both`` (one row per reported pair).
    """
    # If end_idx is None, default to the end of the list.  (This used to be
    # a no-op comparison ``==`` instead of an assignment.)
    if end_idx is None:
        end_idx = len(self.start)

    # Convert to integers so the values are compatible with NCLS.
    # NOTE(review): astype(int) truncates rather than rounds - confirm that
    # is acceptable for values such as 0.29 * 10**2.
    scale = 10**decimal_places
    int_start = (self.start * scale).astype(int)
    int_end = (self.end * scale).astype(int)

    # The tree always holds every interval; only the queries are sliced.
    ncls = NCLS(int_start, int_end, self.idx)

    # Column stack puts the two sequences side by side (easier iteration for
    # np.vectorize).
    return np.column_stack(
        ncls.all_overlaps_both(int_start[start_idx:end_idx],
                               int_end[start_idx:end_idx],
                               self.idx[start_idx:end_idx]))
def test_ncls():
    """Check NCLS lookups with coordinates near the signed 64-bit maximum.

    Reads ``starts``, ``ends`` and ``ids`` from the enclosing scope; they are
    not defined inside this function.
    """
    # ids = starts
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    # Nothing stored in [0, 2).
    assert list(tree.find_overlap(0, 2)) == []

    near_max_hits = list(
        tree.find_overlap(9_223_372_036_854_775_805,
                          9_223_372_036_854_775_806))
    print("aaa", near_max_hits)

    expected_hits = [(5, 6, 2147483647),
                     (9223372036854775805, 9223372036854775807, 3)]
    assert list(tree.find_overlap(0, 9_223_372_036_854_775_806)) == expected_hits

    # Querying the tree with its own intervals: both id sequences agree.
    left_ids, right_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(left_ids) == [2147483647, 3]
    assert list(right_ids) == [2147483647, 3]
def overlap(self, im=True, decimal_places=5):
    """Return index pairs of entries that overlap in retention time and m/z.

    Retention-time candidates come from an NCLS built over
    ``self.retentionTable``; self-pairs are dropped, and the survivors are
    filtered by ``self.__vecMzImOverlap`` (``im`` true) or
    ``self.mzTable.vec_idx_overlap`` (``im`` false).

    Parameters
    ----------
    im : bool
        Selects which of the two filters above is applied.
    decimal_places : int
        Retention times are multiplied by ``10**decimal_places`` and cast
        with ``astype(int)`` before being given to NCLS.

    Returns
    -------
    The filtered pair array (an empty ``np.array([])`` when no candidate
    pairs exist).  If ``retentionTable.idx`` is not simply 0..n-1, positional
    ids are used internally and the result is passed through
    ``Precursor.unindex`` with the real ``idx`` before returning.
    """
    ret_table = self.retentionTable

    # Work on positional ids whenever the table's own idx is not 0..n-1;
    # in that case the result has to be un-indexed before returning.
    positions = np.arange(0, len(ret_table.start))
    use_hidden = not np.all(ret_table.idx == positions)
    hidden_idx = positions if use_hidden else ret_table.idx

    # Convert the retention times to integers for NCLS.
    scale = 10**decimal_places
    ret_int_start = (ret_table.start * scale).astype(int)
    ret_int_end = (ret_table.end * scale).astype(int)

    ncls = NCLS(ret_int_start, ret_int_end, hidden_idx)

    # All pairwise retention-time overlaps as an N x 2 array.
    pairs = np.column_stack(
        ncls.all_overlaps_both(ret_int_start, ret_int_end, hidden_idx))

    # An entry always overlaps itself; those pairs are not interesting.
    pairs = pairs[pairs[:, 0] != pairs[:, 1]]

    if pairs.size == 0:
        rslt = np.array([])
    elif im:
        rslt = pairs[self.__vecMzImOverlap(pairs[:, 0], pairs[:, 1])]
    else:
        rslt = pairs[self.mzTable.vec_idx_overlap(pairs[:, 0], pairs[:, 1])]

    if use_hidden:
        return Precursor.unindex(rslt, self.retentionTable.idx)
    return rslt
def test_ncls():
    """Check that ``all_overlaps_both_stack`` agrees with ``all_overlaps_both``.

    Builds one million intervals ``[i, i + 100)`` and queries the tree with
    the same intervals through both methods.

    NOTE(review): every interval overlaps many neighbours, so the result
    arrays are very large - confirm this size is intended for routine runs.
    """
    import numpy as np

    starts = pd.Series(range(0, int(1e6)))
    ends = starts + 100
    ids = starts

    print(starts, ends, ids)

    ncls = NCLS(starts.values, ends.values, ids.values)

    # The query ids used to be read from `indexes`, a name that only exists
    # in commented-out code (NameError).  The ids defined above are used
    # instead.
    print(starts, ends, ids)
    it = ncls.all_overlaps_both_stack(starts.values, ends.values, ids.values)
    it2 = ncls.all_overlaps_both(starts.values, ends.values, ids.values)
    print(it)
    print(it2)

    # `it == it2` on tuples of arrays needs the truth value of an array and
    # raises for more than one element; compare piece by piece instead.
    # Assumes both methods return a sequence of array-likes - TODO confirm
    # for all_overlaps_both_stack.
    assert len(it) == len(it2)
    for stack_part, plain_part in zip(it, it2):
        assert np.array_equal(stack_part, plain_part)
def _both_indexes(scdf, ocdf, how=False): assert (how in "containment first".split() + [False, None]) or isinstance( how, int) starts = scdf.Start.values ends = scdf.End.values indexes = scdf.index.values it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values) if not how: _self_indexes, _other_indexes = it.all_overlaps_both( starts, ends, indexes) elif how == "containment": _self_indexes, _other_indexes = it.all_containments_both( starts, ends, indexes) else: _self_indexes, _other_indexes = it.first_overlap_both( starts, ends, indexes) return _self_indexes, _other_indexes
def overlap_against_window(self, exp, im=True, decimal_places=5):
    """Pair this object's retention-time entries with the ms2 windows of ``exp``.

    An NCLS is built over ``self.retentionTable`` and queried with
    ``exp.ms2.timeTable``.  The resulting pairs are column-flipped and then
    filtered by ``self.vec_mz_im_overlap`` (``im`` true) or
    ``self.mzTable.vec_idx_overlap`` (``im`` false).

    Parameters
    ----------
    exp : object exposing ``ms2`` with ``timeTable`` and ``mzTable``
    im : bool
        Selects which of the two filters above is applied.
    decimal_places : int
        Times are multiplied by ``10**decimal_places`` and cast with
        ``astype(int)`` before being given to NCLS.

    Returns
    -------
    numpy.ndarray
        The rows of the flipped pair array kept by the selected filter.
    """
    print("starting overlap against window")
    ms2 = exp.ms2

    # Convert all times to integers so they are compatible with NCLS.
    scale = 10**decimal_places
    ret_int_start = (self.retentionTable.start * scale).astype(int)
    ret_int_end = (self.retentionTable.end * scale).astype(int)
    ms2_time_int_start = (ms2.timeTable.start * scale).astype(int)
    ms2_time_int_end = (ms2.timeTable.end * scale).astype(int)

    print(ret_int_start)
    print(ret_int_end)

    ncls = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)

    hits = ncls.all_overlaps_both(ms2_time_int_start, ms2_time_int_end,
                                  ms2.timeTable.idx)

    # Stack into N x 2, then swap the two columns.
    # presumably this puts the retention-table id first - verify against
    # the NCLS return order.
    ret_idx = np.fliplr(np.column_stack(hits))

    first_col, second_col = ret_idx[:, 0], ret_idx[:, 1]
    if im:
        keep = self.vec_mz_im_overlap(first_col, second_col, idx2_data=ms2)
    else:
        keep = self.mzTable.vec_idx_overlap(first_col, second_col,
                                            yData=ms2.mzTable)
    return ret_idx[keep]
def _both_dfs(scdf, ocdf, how=False, **kwargs): assert how in "containment first".split() + [False, None] starts = scdf.Start.values ends = scdf.End.values indexes = scdf.index.values it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values) if not how: _self_indexes, _other_indexes = it.all_overlaps_both( starts, ends, indexes) elif how == "containment": _self_indexes, _other_indexes = it.all_containments_both( starts, ends, indexes) else: _self_indexes, _other_indexes = it.first_overlap_both( starts, ends, indexes) _self_indexes = _self_indexes _other_indexes = _other_indexes return scdf.loc[_self_indexes], ocdf.loc[_other_indexes]
def overlap_against_other(self, query, im=True, decimal_places=5):
    """Pair this object's retention-time entries with those of ``query``.

    An NCLS is built over ``self.retentionTable`` and queried with
    ``query.retentionTable``.  The resulting pairs are column-flipped and
    then filtered by ``self.__vecMzImOverlap`` (``im`` true) or
    ``self.mzTable.vec_idx_overlap`` (``im`` false).

    Parameters
    ----------
    query : object exposing ``retentionTable`` with ``start``, ``end``, ``idx``
    im : bool
        Selects which of the two filters above is applied.
    decimal_places : int
        Retention times are multiplied by ``10**decimal_places`` and cast
        with ``astype(int)`` before being given to NCLS.

    Returns
    -------
    numpy.ndarray
        The rows of the flipped pair array kept by the selected filter.
    """
    # Convert the retention times to integers so they are compatible with
    # NCLS.
    scale = 10**decimal_places
    ret_int_start = (self.retentionTable.start * scale).astype(int)
    ret_int_end = (self.retentionTable.end * scale).astype(int)
    query_ret_int_start = (query.retentionTable.start * scale).astype(int)
    query_ret_int_end = (query.retentionTable.end * scale).astype(int)

    ncls = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)

    # All pairwise retention-time overlaps as an N x 2 array.
    ret_idx = np.column_stack(
        ncls.all_overlaps_both(query_ret_int_start, query_ret_int_end,
                               query.retentionTable.idx))
    print(ret_idx)

    # Reverse the columns so the retention-time idx comes first.  This call
    # used to be `np.flipr`, which NumPy does not provide; `np.fliplr` is
    # the left/right flip.
    ret_idx = np.fliplr(ret_idx)
    print(ret_idx)

    # Self-pairs cannot be filtered out here because the two id spaces are
    # not the same.

    if im:
        return ret_idx[self.__vecMzImOverlap(ret_idx[:, 0], ret_idx[:, 1])]
    return ret_idx[self.mzTable.vec_idx_overlap(ret_idx[:, 0],
                                                ret_idx[:, 1])]
values2 = np.ones(len(starts2)) # Test AIList i = AIList() i.from_array(starts1, ends1, ids1, values1) i.construct() ai_res = i.intersect_from_array(starts2, ends2, ids2) i.intersect(starts2[50], ends2[50]) # Test NCLS n = NCLS(starts1, ends1, ids1) n_res = n.all_overlaps_both(starts2, ends2, ids2) list(n.find_overlap(starts2[50], ends2[50])) # Test pandas p = pd.IntervalIndex.from_tuples(list(zip(starts1, ends1))) p.overlaps(pd.Interval(starts2[50], ends2[50])) # Test quicksect b = quicksect.IntervalTree() for i in range(len(starts1)): b.add(starts1[i], ends1[i]) b.search(starts2[50], ends2[50])
def projection(GTF_FILE, VCF_FILE, chrom_set=None):
    """
    Projects VCF file to transcript coordinates.

    Creates the intermediate file ``chrome_gene_tr.info`` in the current
    working directory (via a ``cat | awk | tr | awk`` shell pipeline over
    the GTF file) and removes it again after it has been read.

    Parameters
    ----------
    GTF_FILE : string containing GTF file name,
                assumed to be unzipped
    VCF_FILE : string containing VCF file name,
                can be a gzipped file
    chrom_set : set of chromosomes to be sampled; when ``None`` or
                empty, every chromosome found in the GTF file is used

    Returns
    -------
    vcf_txome : pandas dataframe, or ``None`` when no VCF position falls
                inside any transcript. Documented header:
                [
                    'chrom_x', 'gene', 'txome', 'relative_pos',
                    'transcript_length', 'id', 'ref', 'alt', 'qual',
                    'filter', 'info', 'format', 'samples'
                ]
                call it truncated VCF

    Notes
    -----
    Calls ``sys.exit(1)`` if the shell pipeline returns a non-zero status.
    """
    import os
    import shlex
    import subprocess
    import sys
    from timeit import default_timer as timer

    intermediate = 'chrome_gene_tr.info'

    start = timer()
    # records = read_vcf(VCF_FILE)
    records = tiny_vcf_reader(VCF_FILE)
    df = pd.DataFrame(records, columns=VCF_FIELDS)
    end = timer()
    print('parsed vcf in', (end - start), 'seconds')

    start = timer()
    # Create a minimal table (chrom, start, end, gene, transcript) from the
    # GTF.  The file name is shell-quoted so that paths containing spaces or
    # shell metacharacters are passed to `cat` as a single literal argument.
    cmd = [
        'cat',
        shlex.quote(str(GTF_FILE)),
        '| awk -F \"\t\" \'($3 == \"transcript\") {print $1,$4,$5,$9}\'',
        '| tr -d \";\\"\"',
        '| awk \'{print $1,$2,$3,$5,$9}\' > chrome_gene_tr.info'
    ]
    print('running ...')
    print(' '.join(cmd))
    retval = subprocess.call(' '.join(cmd), shell=True)
    if retval:
        print('awk commant failed')
        sys.exit(1)

    try:
        gtf_df = pd.read_csv(intermediate,
                             sep=' ',
                             names=['chrom', 'start', 'end', 'gene', 'txome'],
                             header=None)
    finally:
        # Clean up even if parsing failed; no need to spawn `rm` for this.
        try:
            os.remove(intermediate)
        except OSError:
            print('can\'t delete the intermediate file')

    if chrom_set is None or len(chrom_set) == 0:
        chrom_set = set(gtf_df.chrom.values)

    dataframes = []
    for chrom in chrom_set:
        # Filter each table once per chromosome and reuse the subsets.
        gtf_df_subset = gtf_df.loc[gtf_df.chrom == chrom]
        df_subset = df.loc[df.chrom == chrom]
        if not len(df_subset):
            continue

        # Transcripts go into the tree; each VCF position becomes the
        # one-base query interval [pos, pos + 1).
        ncls = NCLS(np.array(gtf_df_subset.start.values),
                    np.array(gtf_df_subset.end.values),
                    gtf_df_subset.index.values)
        result = ncls.all_overlaps_both(df_subset.pos.values,
                                        df_subset.pos.values + 1,
                                        df_subset.index.values)

        map_df = pd.DataFrame(list(zip(*result)),
                              columns=['vcf_index', 'gtf_index'])
        if not len(map_df):
            continue

        # Attach the matching VCF row label to each transcript, then pull in
        # the VCF columns; transcripts without a hit drop out in the merge.
        vcf_gtf_subset = pd.merge(
            gtf_df_subset.join(map_df.set_index('gtf_index')),
            df_subset,
            left_on='vcf_index',
            left_index=False,
            right_index=True,
        )
        dataframes += [vcf_gtf_subset]
        print('chromosome ', chrom, ' done')

    if not len(dataframes):
        print('there is no intersection with the VCF...exiting')
        return None

    print('Merging the chromosomes...')
    vcf_gtf = pd.concat(dataframes)
    vcf_gtf['relative_pos'] = vcf_gtf['pos'] - vcf_gtf['start']
    vcf_gtf['transcript_length'] = vcf_gtf['end'] - vcf_gtf['start'] + 1

    end = timer()
    print('elapsed ', (end - start), " seconds")
    return (vcf_gtf)
from ncls import NCLS

import pickle
import pandas as pd
import numpy as np

# Intervals stored in the tree, deliberately given in descending start order.
# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy, so the builtin is used directly.
starts = np.array(list(reversed([3, 5, 8])), dtype=int)
ends = np.array(list(reversed([6, 7, 9])), dtype=int)
indexes = np.array(list(reversed([0, 1, 2])), dtype=int)

# Ascending-order variant of the same intervals:
# starts = np.array([3, 5, 8], dtype=int)
# ends = np.array([6, 7, 9], dtype=int)
# indexes = np.array([0, 1, 2], dtype=int)

ncls = NCLS(starts, ends, indexes)

# Query intervals [1, 10) and [6, 7) with ids 0 and 1.
starts2 = np.array([1, 6])
ends2 = np.array([10, 7])
indexes2 = np.array([0, 1])

print(ncls.all_overlaps_both(starts2, ends2, indexes2))