def combine(trackList, fn, win_size=1000, aggregate=None):
    """
    Applies a custom function to a list of tracks, such as union, intersection, etc.,
    and returns a single result track. The input streams need to be ordered
    w.r.t 'chr', 'start' and 'end'. To be applied chromosome by chromosome.

    Only fields of the first track are kept. Values for a common field are merged
    by default according to `common.strand_merge`, `common.no_merge` and
    `common.generic_merge`, respectively for strand, chromosome and all others.

    :param trackList: list of FeatureStream objects.
    :param fn: boolean function to apply, such as bbcflib.gfminer.stream.union.
        A string is also accepted and is evaluated to obtain the function
        (e.g. ``fn='intersection'``).
    :param win_size: (int) window size, in bp.
    :param aggregate: (dict) for each field name given as a key, its value is the
        function to apply to the vector containing all trackList's values for this
        field in order to merge them. E.g. ``{'score': lambda x: sum(x)/len(x)}``
        will return the average of all *trackList*'s scores in the output.
        The dictionary passed by the caller is not modified. [None]
    :rtype: FeatureStream
    """
    # Work on a private copy: a shared `{}` default (or the caller's own dict)
    # must not accumulate the 'strand'/'chr' defaults added below.
    aggregate = dict(aggregate) if aggregate else {}
    aggregate.setdefault('strand', common.strand_merge)
    aggregate.setdefault('chr', common.no_merge)
    _f = ['start', 'end']
    if all('chr' in t.fields for t in trackList):
        _f += ['chr']
    if isinstance(fn, str):
        # SECURITY NOTE(review): `eval` executes arbitrary code; only pass
        # trusted strings here (kept for backward compatibility).
        fn = eval(fn)  # can type "combine(...,fn='intersection')"
    trackList = [common.cobble(common.reorder(t, fields=_f)) for t in trackList]
    return common.fusion(FeatureStream(_combine(trackList, fn, win_size, aggregate),
                                       fields=trackList[0].fields))
def overlap(trackList, trackFeatures, strict=False, annotate=False, flatten=common.cobble):
    """
    For each stream in *trackList*, keep only items overlapping at least one element
    of *trackFeatures*. The input streams need to be ordered w.r.t 'chr', 'start'
    and 'end'. To be applied chromosome by chromosome.
    If several tracks are given in either trackList or trackFeatures, they will be
    concatenated into one.

    :param trackList: FeatureStream - the elements to be filtered.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param trackFeatures: FeatureStream - the filter.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param strict: (bool) if True, only regions from *trackList* that entirely
        contain a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) accepted for interface compatibility; it is currently
        not used by this function. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _overlap(tl, tf, stranded, strict):
        if strict:
            olap = lambda x, y: x[0] <= y[0] and y[1] <= x[1]
        else:
            olap = lambda x, y: x[0] < y[1]
        if stranded:
            tl_strand_idx = tl.fields.index('strand')
            tf_strand_idx = tf.fields.index('strand')
            same_strand = lambda x, y: x[tl_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        # Every `tl.next()` (including the very first one) is guarded, so running
        # out of items to filter simply ends the output stream.
        try:
            x = tl.next()
            for y in tf:
                if not same_strand(x, y):
                    x = tl.next()
                # skip items that end before the current feature starts
                while x[1] <= y[0]:
                    x = tl.next()
                while olap(x, y):
                    yield x
                    x = tl.next()
        except StopIteration:
            return

    # Each argument is merged on its own: a list of streams to filter must not be
    # replaced by the concatenated features.
    if isinstance(trackList, (list, tuple)):
        trackList = concatenate(trackList)
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    stranded = 'strand' in (set(trackList.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _tl = common.reorder(trackList, ['start', 'end'])
    # Reorder the flattened stream, so that `flatten` actually takes effect.
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_overlap(_tl, _tf, stranded, strict), _tl.fields)
def merge_scores(trackList, method="arithmetic"):
    """
    Build one stream holding, for each base, the combined score of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    :param trackList: list of FeatureStream objects.
    :param method: (str) type of average: one of 'arithmetic','geometric', or 'sum'
        (no average). A callable is accepted as well and is called with the list of
        scores of each region; a name missing from `_score_functions` falls back to
        `_arithmetic_mean`.
    :rtype: FeatureStream
    """
    sentinel = sys.maxint
    # `common.sentinelize` is handed an all-`sys.maxint` item; below, an item whose
    # start equals `sys.maxint` marks a track as finished.
    guarded = [FeatureStream(common.sentinelize(t, [sentinel] * len(t.fields)), t.fields)
               for t in trackList]
    tracks = [common.reorder(t, ["start", "end", "score"]) for t in guarded]
    # fields of the first track that every track provides
    fields = [name for name in tracks[0].fields
              if all([name in t.fields for t in tracks])]
    elements = [list(t.next()) for t in tracks]  # current item of each track
    track_denom = 1.0 / len(trackList)
    if hasattr(method, "__call__"):
        def mean_fn(scores, denom):
            return method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Drop tracks that are finished from the start (walk backwards so that the
    # remaining indices stay valid while deleting).
    for pos in reversed(xrange(len(tracks))):
        if elements[pos][0] == sentinel:
            del tracks[pos]
            del elements[pos]

    def _stream(tracks):
        last_col = len(fields)
        while tracks:
            start = min([e[0] for e in elements])
            later_starts = [e[0] for e in elements if e[0] > start]
            end = min(later_starts + [e[1] for e in elements])
            scores = [e[2] for e in elements if e[1] > start and e[0] < end]
            extras = ()
            if last_col > 3:
                merged = []
                for col in range(3, last_col):
                    # NOTE(review): if every overlapping item has None in this
                    # column, `r` is empty and `r[0]` raises IndexError.
                    r = [str(e[col]) for e in elements
                         if not (e[col] is None) and e[1] > start and e[0] < end]
                    if all([v == r[0] for v in r]):
                        merged.append(r[0])
                    else:
                        merged.append("|".join(r))
                extras = tuple(merged)
            yield (start, end, mean_fn(scores, track_denom)) + extras
            # Advance every track past `end`, fetching a new item where needed.
            for pos in reversed(xrange(len(tracks))):
                if elements[pos][0] < end:
                    elements[pos][0] = end
                if elements[pos][1] <= end:
                    elements[pos] = list(tracks[pos].next())
                if elements[pos][0] == sentinel:
                    del tracks[pos]
                    del elements[pos]

    return FeatureStream(_stream(tracks), fields)
def score_by_feature(trackScores, trackFeatures, method='mean'):
    """
    For every feature from *trackFeatures*, collect the scores it covers and reduce
    them with *method* (by default, scores are averaged).
    Warning: both score and feature streams must be sorted!
    (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional
    `score` field for each stream in *trackScores*::

        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______

        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    The added field is named 'score' when a single score track is given and the
    features have no 'score' field yet; otherwise 'score0', 'score1', ... are used.

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        A list of streams is merged with `concatenate`.
    :param method: (str or function): operation applied to the list of scores from
        one feature. Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        X = [common.sentinelize(t, [sys.maxint] * len(t.fields)) for t in ts]
        # one buffer of score items per score track, seeded with a dummy item
        S = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
        start_idx = tf.fields.index('start')
        end_idx = tf.fields.index('end')
        if hasattr(method, '__call__'):
            def mean_fn(scores, denom):
                return method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for y in tf:
            ystart, yend = y[start_idx], y[end_idx]
            per_track = []
            for k, source in enumerate(X):
                buffered = S[k]
                xnext = buffered[-1]
                # Load into the buffer all score items which intersect feature y
                while xnext[0] < yend:
                    xnext = source.next()
                    if xnext[1] > ystart:
                        buffered.append(xnext)
                # Drop the leading items that end before the feature starts
                first_kept = 0
                while buffered[first_kept][1] <= ystart:
                    first_kept += 1
                buffered = buffered[first_kept:]
                S[k] = buffered
                scores_y = []
                for s in buffered:
                    if yend <= s[0]:
                        continue
                    start = ystart if s[0] < ystart else s[0]
                    end = yend if yend < s[1] else s[1]
                    # one copy of the score per base shared with the feature
                    scores_y.extend([s[2]] * (end - start))
                per_track.append(mean_fn(scores_y, 1.0 / (yend - ystart)))
            yield tuple(y) + tuple(per_track)

    if not isinstance(trackScores, (list, tuple)):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    n_scores = len(trackScores)
    if n_scores > 1 or 'score' in trackFeatures.fields:
        _fields = ["score" + str(i) for i in range(n_scores)]
    else:
        _fields = ["score"]
    _ts = [common.reorder(t, ['start', 'end', 'score']) for t in trackScores]
    out_fields = trackFeatures.fields + _fields
    return FeatureStream(_stream(_ts, trackFeatures), out_fields)
def filter_scores(trackScores, trackFeatures, method='sum', strict=False,
                  annotate=False, flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and features streams must be sorted!
    (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default
    (to avoid score duplications). An alternative is
    :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*,
    only scores inside a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores*
        is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        entirely contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding
        fields) from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        # Read the field names before wrapping: only iteration is required from
        # the sentinelized stream afterwards.
        tf_fields = tf.fields
        tf = common.sentinelize(tf, [sys.maxint] * len(tf_fields))
        info_idx = [k for k, f in enumerate(tf_fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index('strand')
            tf_strand_idx = tf_fields.index('strand')
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []
        ynext = (-sys.maxint, -sys.maxint, 0.0)
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Remove every feature that ends before x starts. Scores are sorted, so
            # such features cannot be needed again. The pending `ynext` is NOT put
            # in the buffer here: it is appended by the loading loop above once it
            # is reached, and buffering it twice would duplicate output regions.
            Y = [f for f in Y if f[1] > xstart]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y):
                    continue
                if strict and (y[0] > xstart or y[1] < xend):
                    continue
                if y[0] >= xend:
                    continue  # keep for next iteration
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    if annotate:
        _info_fields = [f for f in trackFeatures.fields if f not in trackScores.fields]
    else:
        _info_fields = []
    stranded = 'strand' in (set(trackScores.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ['start', 'end'])
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
def _test(stream):
    """Return *stream* passed through `reorder` with the field names '2' and '1'."""
    requested = ['2', '1']
    return reorder(stream, fields=requested)
def test_reorder(self):
    """`reorder` must emit every item with its values in the requested field order."""
    items = [(10, 12, 0.5), (14, 15, 1.2)]
    stream = fstream(items, fields=['start', 'end', 'score'])
    reordered = reorder(stream, ['end', 'score', 'start'])
    res = list(reordered)
    expected = [(12, 0.5, 10), (15, 1.2, 14)]
    self.assertListEqual(res, expected)
def concatenate(trackList, fields=None, remove_duplicates=False, group_by=None, aggregate=None):
    """
    Returns one stream containing all features from a list of tracks, ordered by *fields*.

    :param trackList: list of FeatureStream objects. If it holds a single stream,
        that stream is returned unchanged.
    :param fields: (list of str) list of fields to keep in the output
        (at least ['start','end']). Defaults to the fields of the first track;
        only fields present in every track are kept.
    :param remove_duplicates: (bool) whether to remove items that are identical in
        several of the tracks in *trackList*. [False]
    :param group_by: (list of str) if specified, elements having all values for these
        fields in common will be merged into a single element. Other fields are merged
        according to *aggregate* if specified, or `common.generic_merge` by default.
    :param aggregate: (dict) for each field name given as a key, its value is the
        function to apply to the vector containing all different values for this field
        in order to merge them. E.g. ``{'score': lambda x: sum(x)}`` will return the
        sum of all scores in the output. [None]
    :rtype: FeatureStream
    """
    # `None` instead of a shared `{}` default, so no dict object is reused across calls.
    if aggregate is None:
        aggregate = {}

    def _find_min(feat_tuple):
        """Return the index of the 'smallest' element amongst a tuple of features
        from different tracks. Priority is given to the first field; if the first
        field items are equal amongst several elements, it looks at the second
        field, a.s.o. Values are compared through their hash."""
        nmin = 0
        xmin = feat_tuple[0]
        for n, x in enumerate(feat_tuple[1:]):
            if x[0] == sys.maxint:
                continue  # exhausted track (sentinel item)
            for k in range(len(x)):
                hx = hash(x[k])
                hmin = hash(xmin[k])
                if hx < hmin:
                    xmin = x
                    nmin = n + 1
                    break
                elif hx > hmin:
                    break
        return nmin

    def _weave(_t, N):
        """Generator yielding all features represented in a list of tracks *_t*,
        sorted w.r.t the *N* first fields."""
        current = [x.next()[:N] for x in _t]  # init
        allfields = [t.fields for t in _t]
        n = _find_min(current)
        last = current[n]
        current[n] = _t[n].next()[:N]
        if not group_by:
            yield last
        while 1:
            # Remove duplicates
            if remove_duplicates:
                while not all([current.count(x) == 1 for x in current]):
                    for k in range(len(current)):
                        if current.count(current[k]) > 1:
                            current[k] = _t[k].next()[:N]
            n = _find_min(current)
            if current[n][0] == sys.maxint:
                break  # the smallest remaining item is a sentinel: all done
            if group_by:
                idx = [allfields[n].index(f) for f in group_by]
                if all(current[n][i] == last[i] for i in idx):
                    # merge last and current
                    last = tuple(
                        current[n][i] if i in idx
                        else aggregate.get(allfields[n][i],
                                           common.generic_merge)((last[i], current[n][i]))
                        for i in range(len(allfields[n])))
                else:
                    yield last
                    last = current[n]
            else:
                yield current[n]
            current[n] = _t[n].next()[:N]
        if group_by:
            yield last

    if len(trackList) == 1:
        return trackList[0]
    if fields is None:
        fields = trackList[0].fields
    fields = [f for f in fields if all(f in t.fields for t in trackList)]
    # Output order: ['chr',] 'start', 'end', ['name',] then the remaining fields.
    _of = ['start', 'end']
    if 'chr' in fields:
        _of = ['chr'] + _of
    if 'name' in fields:
        _of += ['name']
    _of += [f for f in fields if not (f in _of)]
    tl = [common.reorder(t, _of) for t in trackList]
    tl = [FeatureStream(common.sentinelize(x, (sys.maxint,) * len(x.fields)), x.fields)
          for x in tl]
    return FeatureStream(_weave(tl, len(_of)), fields=_of)
def getNearestFeature(features, annotations, thresholdPromot=2000, thresholdInter=20000,
                      thresholdUTR=10):
    """
    For each element of *features*, searches the nearest element of *annotations* and
    returns a stream similar to *features*, with additional annotation fields, e.g.::

        ('chr5',12,14) -> ('chr5',12,14,'geneId|geneName','location_type','distance').

    If there are several genes, they are separated by '_': geneId1|geneName1_geneId2|geneName2.
    For each gene, `location_type` is one of:

    * 'Intergenic' if there are no genes within a distance `thresholdInter`,
    * 'Included' if the feature is included in the gene (or the gene in the feature),
    * 'Promot' if the feature is upstream and within `thresholdPromot` of the gene start,
    * 'Upstream' if the feature is upstream and beyond the promoter of the gene,
    * '3UTR' if the feature is downstream and within `thresholdUTR`% of the distance
      to the next downstream gene,
    * 'Downstream' otherwise.

    These annotations can be concatenated with '_' as well.
    For an included feature the reported distance is the offset between the feature
    and the gene boundary (measured from the gene end on strand -1, from the gene
    start otherwise), or 0 when the gene lies inside the feature.

    :param features: (FeatureStream) features track. If a list or tuple is given,
        only its first element is used.
    :param annotations: (FeatureStream) gene annotation track
        (e.g. as obtained with assembly.gene_track()). If a list or tuple is given,
        only its first element is used.
    :param thresholdPromot: (int) associates the promoter of each gene which promoter is
        within this distance of the feature. Above the threshold, associates only
        the closest. [2000]
    :param thresholdInter: (int) no gene beyond this distance will be considered. [20000]
    :param thresholdUTR: (int) in case the feature is surrounded by two eligible genes
        on the same strand: if distance to gene1's 3'UTR upstream is less than
        *thresholdUTR*% of the distance between gene1 and gene2, associated to 3'UTR
        of gene1, else to promoter of gene2. [10]
    :rtype: FeatureStream (..., str, str, str).

    ::

                     <--   feat   -->
        ______| thresholdPromot  ++++++  thresholdPromot |______
        -----|______|-------------------------------------|______|----------
              gene 1                                       gene 2

                                  feat
        ______   thresholdInter  ++++++   thresholdInter   ______
        -----|______|----------...--------...-------------|______|---
              gene 1                                       gene 2

                      feat
        -->          ++++++                 -->
        |______  10%           90%          |______
        -----|______|------|------------------|______|-----   (attributed to gene1)
              gene 1  thresholdUTR             gene 2
    """
    def _get_feature(_t, _a):
        F = []  # buffer of annotations near the current peak
        _a = common.sentinelize(_a, [sys.maxint] * len(_a.fields))
        for peak in _t:
            distMinBefore = distMinAfter = thresholdInter + 1
            gene = dist = typeLoc = ""
            geneBefore = geneAfter = strandBefore = strandAfter = None
            included = 0
            # keep only genes which don't start too far
            for annot in _a:
                F.append(annot)
                if annot[0] > peak[1] + thresholdInter:
                    break
            # remove genes that end too far
            fpop = -1  # always keep one gene before
            for annot in F:
                if annot[1] > peak[0] - thresholdInter:
                    break
                fpop += 1
            if fpop > 0:
                F = F[fpop:]
            for annot in F:
                # if the peak is totally included in the gene
                if (peak[0] >= annot[0]) and (annot[1] >= peak[1]):
                    includedGene = annot[2]
                    # An explicit branch: the `cond and a or b` idiom picks `b`
                    # whenever `a` is 0, i.e. when peak and gene share their end.
                    if annot[3] == -1:
                        includedDist = annot[1] - peak[1]
                    else:
                        includedDist = peak[0] - annot[0]
                    included = 1
                # if the gene is totally included in the peak
                elif (annot[0] > peak[0]) and (peak[1] > annot[1]):
                    includedGene = annot[2]
                    includedDist = 0
                    included = 1
                else:
                    # if annot is not too far 3' and no intersection
                    if 0 < (peak[0] - annot[1]) < distMinBefore:
                        distMinBefore = peak[0] - annot[1]
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if intersection (annot is before)
                    elif annot[0] < peak[0] < annot[1]:
                        distMinBefore = 0
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if annot is not too far 5' and no intersection
                    if 0 < (annot[0] - peak[1]) < distMinAfter:
                        distMinAfter = annot[0] - peak[1]
                        geneAfter = annot[2]
                        strandAfter = annot[3]
                    # if intersection (annot is after)
                    elif annot[0] < peak[1] < annot[1]:
                        distMinAfter = 0
                        geneAfter = annot[2]
                        strandAfter = annot[3]
            # detect intergenic peak
            if not included and distMinBefore > thresholdInter and distMinAfter > thresholdInter:
                yield peak + ('', 'Intergenic', thresholdInter)
                continue
            # detect peak before the first or after the last gene on the chromosome
            if geneBefore is None:
                if distMinAfter <= thresholdInter:
                    gene = geneAfter
                    dist = distMinAfter
                    typeLoc = "Upstream" if strandAfter == 1 else "Downstream"
            elif geneAfter is None:
                if distMinBefore <= thresholdInter:
                    gene = geneBefore
                    dist = distMinBefore
                    typeLoc = "Upstream" if strandBefore == -1 else "Downstream"
            # detect peak between two genes on the same strand
            elif strandBefore == strandAfter:
                if strandBefore == 1:
                    if thresholdUTR * distMinAfter > 100 * distMinBefore:
                        gene = geneBefore
                        dist = distMinBefore
                        if distMinAfter < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
                else:
                    if thresholdUTR * distMinBefore > 100 * distMinAfter:
                        gene = geneAfter
                        dist = distMinAfter
                        if distMinBefore < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
            # detect peak between two genes on different strands
            else:
                # detect peak between 2 promoters
                if strandBefore == -1:
                    typeLoc = "Upstream"
                    if distMinBefore < distMinAfter:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinAfter < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_" + geneAfter
                                dist = str(dist) + "_" + str(distMinAfter)
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinBefore < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_" + geneBefore
                                dist = str(dist) + "_" + str(distMinBefore)
                # detect peak between 2 3UTR
                else:
                    typeLoc = "Downstream"
                    # detect peak overlapping the 2 3UTR
                    if distMinBefore == distMinAfter:
                        if thresholdUTR * thresholdPromot > 100 * distMinBefore:
                            typeLoc = "3UTR"
                        typeLoc += "_" + typeLoc
                        gene = geneBefore + "_" + geneAfter
                        dist = str(distMinBefore) + "_" + str(distMinAfter)
                    elif distMinBefore < distMinAfter:
                        dist = distMinBefore
                        gene = geneBefore
                        if thresholdUTR * thresholdPromot > 100 * dist:
                            typeLoc = "3UTR"
                    else:
                        dist = distMinAfter
                        gene = geneAfter
                        if thresholdUTR * thresholdPromot > 100 * dist:
                            typeLoc = "3UTR"
            # an including/included gene is appended to whatever was found above
            if included == 1:
                gene += "_" + includedGene if gene else includedGene
                dist = str(dist)
                dist = dist + "_" + str(includedDist) if dist else str(includedDist)
                typeLoc += "_Included" if typeLoc else "Included"
            yield peak + (gene, typeLoc, dist)

    if isinstance(features, (tuple, list)):
        features = features[0]
    if isinstance(annotations, (tuple, list)):
        annotations = annotations[0]
    features = common.reorder(features, ['start', 'end'])
    annot = common.reorder(annotations, ['start', 'end', 'name', 'strand'])
    _fields = features.fields + ['gene', 'location_type', 'distance']
    return FeatureStream(_get_feature(features, annot), fields=_fields)
def score_by_feature(trackScores, trackFeatures, method="mean"):
    """
    For every feature from *trackFeatures*, collect the scores it covers and reduce
    them with *method* (by default, scores are averaged).
    Warning: both score and feature streams must be sorted!
    (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional
    `score` field for each stream in *trackScores*::

        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______

        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    The added field is named 'score' when a single score track is given and the
    features have no 'score' field yet; otherwise 'score0', 'score1', ... are used.

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        A list of streams is merged with `concatenate`.
    :param method: (str or function): operation applied to the list of scores from
        one feature. Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        X = [common.sentinelize(t, [sys.maxint] * len(t.fields)) for t in ts]
        # one buffer of score items per score track, seeded with a dummy item
        S = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
        start_idx = tf.fields.index("start")
        end_idx = tf.fields.index("end")
        if hasattr(method, "__call__"):
            def mean_fn(scores, denom):
                return method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for y in tf:
            ystart, yend = y[start_idx], y[end_idx]
            per_track = []
            for k, source in enumerate(X):
                buffered = S[k]
                xnext = buffered[-1]
                # Load into the buffer all score items which intersect feature y
                while xnext[0] < yend:
                    xnext = source.next()
                    if xnext[1] > ystart:
                        buffered.append(xnext)
                # Drop the leading items that end before the feature starts
                first_kept = 0
                while buffered[first_kept][1] <= ystart:
                    first_kept += 1
                buffered = buffered[first_kept:]
                S[k] = buffered
                scores_y = []
                for s in buffered:
                    if yend <= s[0]:
                        continue
                    start = ystart if s[0] < ystart else s[0]
                    end = yend if yend < s[1] else s[1]
                    # one copy of the score per base shared with the feature
                    scores_y.extend([s[2]] * (end - start))
                per_track.append(mean_fn(scores_y, 1.0 / (yend - ystart)))
            yield tuple(y) + tuple(per_track)

    if not isinstance(trackScores, (list, tuple)):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    n_scores = len(trackScores)
    if n_scores > 1 or "score" in trackFeatures.fields:
        _fields = ["score" + str(i) for i in range(n_scores)]
    else:
        _fields = ["score"]
    _ts = [common.reorder(t, ["start", "end", "score"]) for t in trackScores]
    out_fields = trackFeatures.fields + _fields
    return FeatureStream(_stream(_ts, trackFeatures), out_fields)
def window_smoothing(trackList, window_size, step_size=1, stop_val=sys.maxint, featurewise=False):
    """
    Given a (list of) signal track(s) *trackList*, a *window_size* L (in base pairs by
    default, or in number of features if *featurewise* is True), and a *step_size*,
    return as many signal tracks with, at each position p (multiple of *step_size*),
    the average score in the window [p-L/2, p+L/2]::

        X: __________666666666666____________
        R: ______12345666666666654321________ (not exact scores here)

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param window_size: (int) window size in bp.
    :param step_size: (int) step length (one score returned per *step_size* positions). [1]
    :param stop_val: (int) sequence length. [sys.maxint]
    :param featurewise: (bool) bp (False), or number of features (True). [False]
    :rtype: FeatureStream (a list of FeatureStream if *trackList* is a list or tuple)

    Example of windows, window_size=9, step_size=3:
    [0,1,2,3,4,5,6,7,8,9), [3,4,5,6,7,8,9,10,11,12), ...
    """
    def _stepping_mean(track, score, denom):
        # Feature-wise smoothing: average the scores of `window_size` consecutive
        # features and report it at the coordinates of the window's middle feature.
        # NOTE(review): the parameter `score` receives `win_start` from the caller
        # below and is overwritten right away; the assignment is also duplicated.
        score = 0.0
        F = []
        score = 0.0
        nmid = window_size / 2
        for x in track:
            F.append(x)
            score += x[2]
            if len(F) < window_size:
                continue
            yield (F[nmid][0], F[nmid][1], round(score * denom + 1e-7, 6)) + F[nmid][3:]
            # slide the window forward by `step_size` features
            for shift in xrange(step_size):
                score -= F.pop(0)[2]

    def _running_mean(track, win_start, denom):
        # Base-pair smoothing: a window [win_start, win_end) of `window_size` bp
        # slides along the track; `score` is updated incrementally by `delta` per bp.
        score = 0.0
        F = []  # features currently under (or ahead of) the window
        for x in track:
            F.append(x)
            fstart = F[0][0]
            fend = F[0][1]
            # NOTE(review): 4th item of the feature; the stream is reordered to
            # start/end/score, so this is presumably the first remaining field
            # (e.g. the chromosome) - confirm. Items with only 3 values raise
            # IndexError here.
            chrom = F[0][3]
            win_start = max(win_start, fstart - window_size)
            win_end = win_start + window_size
            lstart = F[-1][0]
            lend = F[-1][1]
            while win_end < lend:
                delta = 0
                # distances to the next position where the score slope changes
                steps = [fend - win_start, lend - win_end]
                if fstart > win_start:
                    steps.append(fstart - win_start)
                else:
                    delta -= F[0][2]  # first feature is leaving the window
                if lstart > win_end:
                    steps.append(lstart - win_end)
                else:
                    delta += F[-1][2]  # last feature is entering the window
                nsteps = min(steps)
                sst = -(win_start % step_size) % step_size
                # NOTE(review): `%` binds tighter than `+`, so this computes
                # win_start + (nsteps % step_size) - confirm this is intended.
                sen = -(win_start + nsteps % step_size) % step_size
                win_center = (win_start + win_end) / 2
                if abs(delta) > 1e-11:
                    # score changes along this stretch: emit one item per step
                    delta *= denom
                    score += delta * sst
                    for step in xrange(sst, nsteps, step_size):
                        if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                            yield (win_center + step, win_center + step + step_size, score, chrom)
                        score += delta * step_size
                    score -= delta * sen
                else:
                    # constant score along this stretch: emit a single item
                    if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                        yield (win_center + sst, win_center + sen + nsteps, score, chrom)
                win_start += nsteps
                win_end += nsteps
                if fend <= win_start:
                    F.pop(0)
                    if F:
                        fstart = F[0][0]
                        fend = F[0][1]
                        win_start = max(win_start, fstart - window_size)
                        win_end = win_start + window_size
                if win_end > stop_val:
                    break
        # The input is exhausted: let the window slide out of the buffered features.
        while F:
            delta = 0
            steps = [fend - win_start]
            if fstart > win_start:
                steps.append(fstart - win_start)
            else:
                delta -= F[0][2]
            nsteps = min(steps)
            sst = -(win_start % step_size) % step_size
            # NOTE(review): same operator-precedence question as above.
            sen = -(win_start + nsteps % step_size) % step_size
            win_center = (win_start + win_end) / 2
            if abs(delta) > 1e-11:
                delta *= denom
                score += delta * sst
                for step in xrange(sst, nsteps, step_size):
                    if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                        yield (win_center + step, win_center + step + step_size, score, chrom)
                    score += delta * step_size
                score -= delta * sen
            else:
                if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                    yield (win_center + sst, win_center + sen + nsteps, score, chrom)
            win_start += nsteps
            win_end += nsteps
            if fend <= win_start:
                F.pop(0)
                if F:
                    fstart = F[0][0]
                    fend = F[0][1]
                    win_start = max(win_start, fstart - window_size)
                    win_end = win_start + window_size
            if win_end > stop_val:
                break

    denom = 1.0 / window_size
    win_start = -window_size
    _f = ["start", "end", "score"]
    # pick the generator matching the unit of `window_size`
    if featurewise:
        call = _stepping_mean
    else:
        call = _running_mean
    # NOTE(review): the output streams declare only the 3 fields of `_f`, while the
    # generators above may yield items with more values - confirm.
    if isinstance(trackList, (list, tuple)):
        return [
            FeatureStream(call(common.reorder(t, _f), win_start, denom), fields=_f)
            for n, t in enumerate(trackList)
        ]
    else:
        return FeatureStream(call(common.reorder(trackList, _f), win_start, denom), fields=_f)
def filter_scores(trackScores, trackFeatures, method="sum", strict=False,
                  annotate=False, flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and features streams must be sorted!
    (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default
    (to avoid score duplications). An alternative is
    :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*,
    only scores inside a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores*
        is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        entirely contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding
        fields) from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        # Read the field names before wrapping: only iteration is required from
        # the sentinelized stream afterwards.
        tf_fields = tf.fields
        tf = common.sentinelize(tf, [sys.maxint] * len(tf_fields))
        info_idx = [k for k, f in enumerate(tf_fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index("strand")
            tf_strand_idx = tf_fields.index("strand")
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []
        ynext = (-sys.maxint, -sys.maxint, 0.0)
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Remove every feature that ends before x starts. Scores are sorted, so
            # such features cannot be needed again. The pending `ynext` is NOT put
            # in the buffer here: it is appended by the loading loop above once it
            # is reached, and buffering it twice would duplicate output regions.
            Y = [f for f in Y if f[1] > xstart]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y):
                    continue
                if strict and (y[0] > xstart or y[1] < xend):
                    continue
                if y[0] >= xend:
                    continue  # keep for next iteration
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    if annotate:
        _info_fields = [f for f in trackFeatures.fields if f not in trackScores.fields]
    else:
        _info_fields = []
    stranded = "strand" in (set(trackScores.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ["start", "end"])
    _tf = common.reorder(_tf, ["start", "end"])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
def test_reorder(self):
    """`reorder` must emit every item with its values in the requested field order."""
    items = [(10,12,0.5), (14,15,1.2)]
    stream = fstream(items, fields=['start','end','score'])
    reordered = reorder(stream, ['end','score','start'])
    res = list(reordered)
    expected = [(12,0.5,10), (15,1.2,14)]
    self.assertListEqual(res, expected)
def _test(stream):
    """Return *stream* passed through `reorder` with the field names '2' and '1'."""
    requested = ['2','1']
    return reorder(stream, fields=requested)
def window_smoothing(trackList, window_size, step_size=1, stop_val=sys.maxint,
                     featurewise=False):
    """
    Given a (list of) signal track(s) *trackList*, a *window_size* L (in base pairs
    by default, or in number of features if *featurewise* is True), and a *step_size*,
    return as many signal tracks with, at each position p (multiple of *step_size*),
    the average score in the window [p-L/2, p+L/2]::

        X: __________666666666666____________
        R: ______12345666666666654321________ (not exact scores here)

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param window_size: (int) window size in bp.
    :param step_size: (int) step length (one score returned per *step_size* positions). [1]
    :param stop_val: (int) sequence length. [sys.maxint]
    :param featurewise: (bool) bp (False), or number of features (True). [False]
    :rtype: FeatureStream

    Example of windows, window_size=9, step_size=3:

    [0,1,2,3,4,5,6,7,8,9), [3,4,5,6,7,8,9,10,11,12), ...
    """
    def _stepping_mean(track, score, denom):
        # Feature-wise mode. `score` only exists so that both generators share
        # one call signature (the caller passes `win_start` in this slot); the
        # incoming value is ignored.
        score = 0.0
        F = []  # the last `window_size` features
        # Floor division: `nmid` is used as a list index.
        nmid = window_size // 2
        for x in track:
            F.append(x)
            score += x[2]
            if len(F) < window_size:
                continue
            # Report the window average on the coordinates of the middle feature.
            yield (F[nmid][0], F[nmid][1], round(score * denom + 1e-7, 6)) + F[nmid][3:]
            # NOTE(review): pops `step_size` features from a buffer holding
            # `window_size` of them - presumably step_size <= window_size is
            # expected here; confirm with callers.
            for _ in xrange(step_size):
                score -= F.pop(0)[2]

    def _running_mean(track, win_start, denom):
        # Base-pair mode: slide a window of `window_size` bp and update the
        # running `score` incrementally as features enter and leave the window.
        score = 0.0
        F = []  # features currently intersecting (or ahead of) the window
        for x in track:
            F.append(x)
            fstart = F[0][0]
            fend = F[0][1]
            # NOTE(review): reads a 4th field although the track is reordered on
            # ('start','end','score') only - assumes the input has a further
            # field (presumably the chromosome) right after those; confirm.
            chrom = F[0][3]
            win_start = max(win_start, fstart - window_size)
            win_end = win_start + window_size
            lstart = F[-1][0]
            lend = F[-1][1]
            while win_end < lend:
                # `delta` is the score change per bp of window shift;
                # `steps` lists the distances to the next event (a feature
                # boundary reaching one of the window edges).
                delta = 0
                steps = [fend - win_start, lend - win_end]
                if fstart > win_start:
                    steps.append(fstart - win_start)
                else:
                    delta -= F[0][2]
                if lstart > win_end:
                    steps.append(lstart - win_end)
                else:
                    delta += F[-1][2]
                nsteps = min(steps)
                # Offsets needed to align on multiples of `step_size`.
                sst = -(win_start % step_size) % step_size
                sen = -(win_start + nsteps % step_size) % step_size
                win_center = (win_start + win_end) // 2
                if abs(delta) > 1e-11:
                    delta *= denom
                    score += delta * sst
                    for step in xrange(sst, nsteps, step_size):
                        if (score > 1e-11 and win_center + step >= 0
                                and win_center + step + step_size <= stop_val):
                            yield (win_center + step, win_center + step + step_size,
                                   score, chrom)
                        score += delta * step_size
                    score -= delta * sen
                else:
                    # Constant score over the whole stretch: emit one interval.
                    if (score > 1e-11 and win_center + sst >= 0
                            and win_center + sen + nsteps <= stop_val):
                        yield (win_center + sst, win_center + sen + nsteps, score, chrom)
                win_start += nsteps
                win_end += nsteps
                if fend <= win_start:
                    F.pop(0)
                    if F:
                        fstart = F[0][0]
                        fend = F[0][1]
                        win_start = max(win_start, fstart - window_size)
                        win_end = win_start + window_size
                if win_end > stop_val:
                    break
        # Input exhausted: keep sliding until every buffered feature has left
        # the window (only the trailing edge changes the score now).
        while F:
            delta = 0
            steps = [fend - win_start]
            if fstart > win_start:
                steps.append(fstart - win_start)
            else:
                delta -= F[0][2]
            nsteps = min(steps)
            sst = -(win_start % step_size) % step_size
            sen = -(win_start + nsteps % step_size) % step_size
            win_center = (win_start + win_end) // 2
            if abs(delta) > 1e-11:
                delta *= denom
                score += delta * sst
                for step in xrange(sst, nsteps, step_size):
                    if (score > 1e-11 and win_center + step >= 0
                            and win_center + step + step_size <= stop_val):
                        yield (win_center + step, win_center + step + step_size,
                               score, chrom)
                    score += delta * step_size
                score -= delta * sen
            else:
                if (score > 1e-11 and win_center + sst >= 0
                        and win_center + sen + nsteps <= stop_val):
                    yield (win_center + sst, win_center + sen + nsteps, score, chrom)
            win_start += nsteps
            win_end += nsteps
            if fend <= win_start:
                F.pop(0)
                if F:
                    fstart = F[0][0]
                    fend = F[0][1]
                    win_start = max(win_start, fstart - window_size)
                    win_end = win_start + window_size
            if win_end > stop_val:
                break

    denom = 1.0 / window_size
    win_start = -window_size
    _f = ['start', 'end', 'score']
    if featurewise:
        call = _stepping_mean
    else:
        call = _running_mean
    if isinstance(trackList, (list, tuple)):
        return [FeatureStream(call(common.reorder(t, _f), win_start, denom), fields=_f)
                for t in trackList]
    else:
        return FeatureStream(call(common.reorder(trackList, _f), win_start, denom), fields=_f)
def neighborhood(trackList, before_start=None, after_end=None,
                 after_start=None, before_end=None, on_strand=False):
    """
    Build, for every input feature, one or two regions located around its
    boundaries. Which regions are produced depends on which of the four
    integer offsets are provided:

    * *before_start* and *after_end* only::

        (start, end, ...) -> (start-before_start, end+after_end, ...)

    * *before_start* and *after_start* only::

        (start, end, ...) -> (start-before_start, start+after_start, ...)

    * *after_end* and *before_end* only::

        (start, end, ...) -> (end-before_end, end+after_end, ...)

    * all four offsets - a pair of regions per feature::

        (start, end, ...) -> (start-before_start, start+after_start, ...)
                             (end-before_end, end+after_end, ...)

    * with *on_strand* set to True, `start` and `end` are read relative to
      the orientation of the feature::

        (start, end, -1, ...) -> (start-after_end, start+before_end, -1, ...)
                                 (end-after_start, end+before_start, -1, ...)
        (start, end, +1, ...) -> (start-before_start, start+after_start, +1, ...)
                                 (end-before_end, end+after_end, +1, ...)

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param before_start: (int) number of bp before the feature start.
    :param after_end: (int) number of bp after feature end.
    :param after_start: (int) number of bp after the feature start.
    :param before_end: (int) number of bp before the feature end.
    :param on_strand: (bool) True to respect strand orientation. [False]
    :rtype: FeatureStream, or list of FeatureStream if a list was given.
    """
    def _generate_single(track, a, b, c, d):
        def _regions(feat):
            # Regions derived from one feature, for the first flag that is set.
            if not (a or b or c or d):
                return []
            lo, hi, tail = feat[0], feat[1], feat[2:]
            minus = on_strand and feat[2] < 0
            if a:
                if minus:
                    return [(lo - after_end, lo + before_end + 1) + tail,
                            (hi - after_start - 1, hi + before_start) + tail]
                return [(lo - before_start, lo + after_start + 1) + tail,
                        (hi - before_end - 1, hi + after_end) + tail]
            if b:
                if minus:
                    return [(hi - after_start - 1, hi + before_start) + tail]
                return [(lo - before_start, lo + after_start + 1) + tail]
            if c:
                if minus:
                    return [(lo - after_end, lo + before_end + 1) + tail]
                return [(hi - before_end - 1, hi + after_end) + tail]
            if minus:
                return [(lo - after_end, hi + before_start) + tail]
            return [(lo - before_start, hi + after_end) + tail]

        pending = []
        for feat in track:
            pending.extend(_regions(feat))
            pending.sort()
            # Release the regions that end before the current feature starts.
            while pending and pending[0][1] < feat[0]:
                yield pending.pop(0)
        # Flush whatever is left once the input is exhausted.
        while pending:
            yield pending.pop(0)

    _fields = ['start','end','strand'] if on_strand else ['start','end']
    given_bs = before_start is not None
    given_ae = after_end is not None
    given_as = after_start is not None
    given_be = before_end is not None
    case1 = given_bs and given_ae and given_as and given_be
    case2 = given_bs and given_as
    case3 = given_ae and given_be
    case4 = given_bs and given_ae
    if not isinstance(trackList, (list, tuple)):
        single = common.reorder(trackList, _fields)
        return FeatureStream(_generate_single(single, case1, case2, case3, case4),
                             fields=single.fields)
    reordered = [common.reorder(t, _fields) for t in trackList]
    return [FeatureStream(_generate_single(t, case1, case2, case3, case4),
                          fields=t.fields)
            for t in reordered]
def merge_scores(trackList, method='arithmetic'):
    """
    Creates a stream with per-base average of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    :param trackList: list of FeatureStream objects.
    :param method: (str) type of average: one of 'arithmetic','geometric', or 'sum'
        (no average). A callable is also accepted: it is then called with the list
        of overlapping scores. Names not found in `_score_functions` fall back to
        `_arithmetic_mean`.
    :rtype: FeatureStream
    """
    # Append an endless sentinel to each track so that `.next()` never raises;
    # a start equal to sys.maxint marks an exhausted track.
    tracks = [FeatureStream(common.sentinelize(x, [sys.maxint] * len(x.fields)), x.fields)
              for x in trackList]
    tracks = [common.reorder(t, ['start', 'end', 'score']) for t in tracks]
    # Fields shared by every track, in the order of the first one.
    fields = [f for f in tracks[0].fields if all(f in t.fields for t in tracks)]
    # Current (mutable) item of each track.
    elements = [list(x.next()) for x in tracks]
    track_denom = 1.0 / len(trackList)
    if callable(method):
        mean_fn = lambda scores, denom: method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Drop the tracks that are empty from the start (iterate backwards so that
    # popping does not shift the indices still to visit).
    for i in xrange(len(tracks) - 1, -1, -1):
        if elements[i][0] == sys.maxint:
            tracks.pop(i)
            elements.pop(i)

    def _stream(tracks):
        n_extra = len(fields) - 3
        while tracks:
            # Next homogeneous segment: from the smallest start to the next
            # boundary (a later start or any end).
            start = min([x[0] for x in elements])
            end = min([x[0] for x in elements if x[0] > start] + [x[1] for x in elements])
            overlapping = [x for x in elements if x[1] > start and x[0] < end]
            scores = [x[2] for x in overlapping]
            rest = []
            for i in range(max(n_extra, 0)):
                r = [str(x[3 + i]) for x in overlapping if x[3 + i] is not None]
                if not r:
                    # No track provides a value for this field on this segment;
                    # emit None instead of failing on `r[0]`.
                    rest.append(None)
                elif all(v == r[0] for v in r):
                    rest.append(r[0])
                else:
                    rest.append("|".join(r))
            yield (start, end, mean_fn(scores, track_denom)) + tuple(rest)
            # Advance: trim the consumed part of each item and fetch the next
            # item of the tracks whose current one is fully consumed.
            for i in xrange(len(tracks) - 1, -1, -1):
                if elements[i][0] < end:
                    elements[i][0] = end
                if elements[i][1] <= end:
                    elements[i] = list(tracks[i].next())
                if elements[i][0] == sys.maxint:
                    tracks.pop(i)
                    elements.pop(i)

    return FeatureStream(_stream(tracks), fields)
def getNearestFeature(features, annotations,
                      thresholdPromot=2000, thresholdInter=20000, thresholdUTR=10):
    """
    For each element of *features*, searches the nearest element of *annotations*
    and returns a stream similar to *features*, with additional annotation fields, e.g.::

        ('chr5',12,14) -> ('chr5',12,14,'geneId|geneName','location_type','distance').

    If there are several genes, they are separated by '_': geneId1|geneName1_geneId2|geneName2.
    For each gene, `location_type` is one of:

    * 'Intergenic' if there are no genes within a distance `thresholdInter`,
    * 'Included' if the feature is included in the gene (or the gene in the feature),
    * 'Promot' if the feature is upstream and within `thresholdPromot` of the gene start,
    * 'Upstream' if the feature is upstream and beyond the promoter of the gene,
    * '3UTR' if the feature is downstream and within `thresholdUTR`% of the distance
      to the next downstream gene,
    * 'Downstream' otherwise.

    These annotations can be concatenated with '_' as well.
    For a gene containing the feature, the reported distance is the offset between
    the feature and the gene start (taken from the gene end on the reverse strand);
    it is 0 when the gene lies inside the feature. For intergenic features the
    distance field is `thresholdInter`.

    :param features: (FeatureStream) features track.
    :param annotations: (FeatureStream) gene annotation track
        (e.g. as obtained with assembly.gene_track()).
    :param thresholdPromot: (int) associates the promoter of each gene which promoter is within
        this distance of the feature. Above the threshold, associates only the closest. [2000]
    :param thresholdInter: (int) no gene beyond this distance will be considered. [20000]
    :param thresholdUTR: (int) in case the feature is surrounded by two eligible genes on the
        same strand: if distance to gene1's 3'UTR upstream is less than *thresholdUTR*% of the
        distance between gene1 and gene2, associated to 3'UTR of gene1, else to promoter
        of gene2. [10]
    :rtype: FeatureStream (..., str, str, str).

    ::

                 <--                   feat                    -->
             ______| thresholdPromot  ++++++   thresholdPromot |______
        -----|______|-------------------------------------------|______|----------
              gene 1                                             gene 2

                                       feat
             ______  thresholdInter   ++++++   thresholdInter   ______
        -----|______|----------...------------------...---------|______|---
              gene 1                                             gene 2

                          feat
             -->        ++++++             -->
             |______  10%             90%  |______
        -----|______|------|---------------|______|-----  (attributed to gene1)
              gene 1   thresholdUTR         gene 2
    """
    def _get_feature(_t, _a):
        F = []  # sliding buffer of annotations around the current peak
        _a = common.sentinelize(_a, [sys.maxint] * len(_a.fields))
        for peak in _t:
            distMinBefore = distMinAfter = thresholdInter + 1
            gene = dist = typeLoc = ""
            geneBefore = geneAfter = strandBefore = strandAfter = None
            included = 0
            # keep only genes which don't start too far
            for annot in _a:
                F.append(annot)
                if annot[0] > peak[1] + thresholdInter:
                    break
            # remove genes that end too far
            fpop = -1  # always keep one gene before
            for annot in F:
                if annot[1] > peak[0] - thresholdInter:
                    break
                fpop += 1
            if fpop > 0:
                F = F[fpop:]
            for annot in F:
                # if the peak is totally included in the gene
                if (peak[0] >= annot[0]) and (annot[1] >= peak[1]):
                    includedGene = annot[2]
                    # A real conditional expression: the former `and/or` idiom
                    # picked the forward-strand value whenever the reverse-strand
                    # distance happened to be 0.
                    if annot[3] == -1:
                        includedDist = annot[1] - peak[1]
                    else:
                        includedDist = peak[0] - annot[0]
                    included = 1
                # if the gene is totally included in the peak
                elif (annot[0] > peak[0]) and (peak[1] > annot[1]):
                    includedGene = annot[2]
                    includedDist = 0
                    included = 1
                else:
                    # if annot is not too far 3' and no intersection
                    if 0 < (peak[0] - annot[1]) < distMinBefore:
                        distMinBefore = peak[0] - annot[1]
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if intersection (annot is before)
                    elif annot[0] < peak[0] < annot[1]:
                        distMinBefore = 0
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if annot is not too far 5' and no intersection
                    if 0 < (annot[0] - peak[1]) < distMinAfter:
                        distMinAfter = annot[0] - peak[1]
                        geneAfter = annot[2]
                        strandAfter = annot[3]
                    # if intersection (annot is after)
                    elif annot[0] < peak[1] < annot[1]:
                        distMinAfter = 0
                        geneAfter = annot[2]
                        strandAfter = annot[3]
            # detect intergenic peak
            if (not included and distMinBefore > thresholdInter
                    and distMinAfter > thresholdInter):
                yield peak + ('', 'Intergenic', thresholdInter)
                continue
            # detect peak before the first or after the last gene on the chromosome
            if geneBefore is None:
                if distMinAfter <= thresholdInter:
                    gene = geneAfter
                    dist = distMinAfter
                    typeLoc = "Upstream" if strandAfter == 1 else "Downstream"
            elif geneAfter is None:
                if distMinBefore <= thresholdInter:
                    gene = geneBefore
                    dist = distMinBefore
                    typeLoc = "Upstream" if strandBefore == -1 else "Downstream"
            # detect peak between two genes on the same strand
            elif strandBefore == strandAfter:
                if strandBefore == 1:
                    if thresholdUTR * distMinAfter > 100 * distMinBefore:
                        gene = geneBefore
                        dist = distMinBefore
                        if distMinAfter < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
                else:
                    if thresholdUTR * distMinBefore > 100 * distMinAfter:
                        gene = geneAfter
                        dist = distMinAfter
                        if distMinBefore < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
            # detect peak between two genes on different strands
            else:
                # detect peak between 2 promoters
                if strandBefore == -1:
                    typeLoc = "Upstream"
                    if distMinBefore < distMinAfter:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinAfter < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_" + geneAfter
                                dist = str(dist) + "_" + str(distMinAfter)
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinBefore < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_" + geneBefore
                                dist = str(dist) + "_" + str(distMinBefore)
                # detect peak between 2 3UTR
                else:
                    typeLoc = "Downstream"
                    # detect peak overlapping the 2 3UTR
                    if distMinBefore == distMinAfter:
                        if thresholdUTR * thresholdPromot > 100 * distMinBefore:
                            typeLoc = "3UTR"
                        typeLoc += "_" + typeLoc
                        gene = geneBefore + "_" + geneAfter
                        dist = str(distMinBefore) + "_" + str(distMinAfter)
                    elif distMinBefore < distMinAfter:
                        dist = distMinBefore
                        gene = geneBefore
                        if thresholdUTR * thresholdPromot > 100 * dist:
                            typeLoc = "3UTR"
                    else:
                        dist = distMinAfter
                        gene = geneAfter
                        if thresholdUTR * thresholdPromot > 100 * dist:
                            typeLoc = "3UTR"
            # Append the containing/contained gene (the last one seen) to
            # whatever neighbouring annotation was found above.
            if included == 1:
                gene += "_" + includedGene if gene else includedGene
                dist = str(dist)
                dist = dist + "_" + str(includedDist) if dist else str(includedDist)
                typeLoc += "_Included" if typeLoc else "Included"
            yield peak + (gene, typeLoc, dist)

    # Only the first stream is used when a list is given.
    if isinstance(features, (tuple, list)):
        features = features[0]
    if isinstance(annotations, (tuple, list)):
        annotations = annotations[0]
    features = common.reorder(features, ['start', 'end'])
    annot = common.reorder(annotations, ['start', 'end', 'name', 'strand'])
    _fields = features.fields + ['gene', 'location_type', 'distance']
    return FeatureStream(_get_feature(features, annot), fields=_fields)