Ejemplo n.º 1
0
def combine(trackList, fn, win_size=1000, aggregate={}):
    """
    Applies a custom function to a list of tracks, such as union, intersection,
    etc., and returns a single result track. The input streams need to be ordered
    w.r.t 'chr', 'start' and 'end'. To be applied chromosome by chromosome.

    Only fields of the first track are kept. Values for a common field are
    merged by default according to `common.strand_merge`,`common.no_merge` and `common.generic_merge`,
    respectively for strand, chromosome and all others.

    :param trackList: list of FeatureStream objects.
    :param fn: boolean function to apply, such as bbcflib.gfminer.stream.union.
        A string is also accepted and is evaluated to obtain the function.
    :param win_size: (int) window size, in bp.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all trackList's values for this field in order
        to merge them. E.g. ``{'score': lambda x: sum(x)/len(x)}`` will return the average of
        all *trackList*'s scores in the output. The dictionary passed in is not modified.
    :rtype: FeatureStream
    """
    # Work on a private copy: `setdefault` below would otherwise write into the
    # shared default dict (persisting across calls) or into the caller's dict.
    aggregate = dict(aggregate) if aggregate else {}
    aggregate.setdefault('strand',common.strand_merge)
    aggregate.setdefault('chr',common.no_merge)
    # Fields every track is reordered on; 'chr' is added only if all tracks have it.
    _f = ['start','end']
    if all('chr' in t.fields for t in trackList):
        _f += ['chr']
    # NOTE(review): eval() executes arbitrary code; only pass trusted strings here.
    if isinstance(fn,str): fn = eval(fn) # can type "combine(...,fn='intersection')"
    trackList = [common.cobble(common.reorder(t,fields=_f)) for t in trackList]
    return common.fusion(FeatureStream(_combine(trackList,fn,win_size,aggregate),
                                       fields=trackList[0].fields))
Ejemplo n.º 2
0
def overlap(trackList,trackFeatures,strict=False,annotate=False,flatten=common.cobble):
    """
    For each stream in *trackList*, keep only items overlapping at least one element
    of *trackFeatures*.  The input streams need to be ordered w.r.t 'chr', 'start' and 'end'.
    To be applied chromosome by chromosome. If several tracks are given in either trackList
    or trackFeatures, they will be concatenated into one.

    :param trackList: FeatureStream - the elements to be filtered.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param trackFeatures: FeatureStream - the filter.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param strict: (bool) if True, only regions from *trackList* that
        entirely contain a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) intended to add supplementary annotation (and the corresponding
        fields) from *trackFeatures* to the result. NOTE: currently not used by the
        implementation. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _overlap(tl,tf,stranded,strict):
        """Yield the items of *tl* that overlap (or, if *strict*, contain) an item of *tf*."""
        # Items are expected as (start, end, ...) after reordering.
        if strict: olap = lambda x,y: x[0] <= y[0] and y[1] <= x[1]
        else: olap = lambda x,y: x[0] < y[1]
        if stranded:
            tl_strand_idx = tl.fields.index('strand')
            tf_strand_idx = tf.fields.index('strand')
            same_strand = lambda x,y:x[tl_strand_idx]==y[tf_strand_idx]
        else: same_strand = lambda x,y:True
        x = tl.next()
        for y in tf:
            try:
                if not same_strand(x,y): x = tl.next()
                # Skip items that end before the current feature starts.
                while x[1] <= y[0]: x = tl.next()
                while olap(x,y):
                    yield x
                    x = tl.next()
            except StopIteration: break

    # Merge each argument on its own: the original tested *trackFeatures* twice and
    # overwrote *trackList* with the concatenated features.
    if isinstance(trackList,(list,tuple)): trackList = concatenate(trackList)
    if isinstance(trackFeatures,(list,tuple)): trackFeatures = concatenate(trackFeatures)
    stranded = 'strand' in (set(trackList.fields) & set(trackFeatures.fields))
    if flatten is None: _tf = trackFeatures
    else: _tf = flatten(trackFeatures,stranded=stranded)
    _tl = common.reorder(trackList,['start','end'])
    # Reorder the flattened stream (not the raw one), so that *flatten* takes effect.
    _tf = common.reorder(_tf,['start','end'])
    return FeatureStream(_overlap(_tl,_tf,stranded,strict), _tl.fields)
Ejemplo n.º 3
0
def merge_scores(trackList, method="arithmetic"):
    """
    Creates a stream with per-base average of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    The averaging denominator is the number of tracks in *trackList*, whether or
    not every track covers a given position (see the diagram above).

    :param trackList: list of FeatureStream objects.
    :param method: (str) type of average: one of 'arithmetic','geometric', or 'sum' (no average).
        A callable is also accepted; it is then called with the list of scores only.
        An unknown string falls back to the arithmetic mean.
    :rtype: FeatureStream
    """
    # Append a sentinel row of sys.maxint to every track, so that reading past the
    # last real item returns the sentinel.
    tracks = [FeatureStream(common.sentinelize(x, [sys.maxint] * len(x.fields)), x.fields) for x in trackList]
    # Put 'start', 'end', 'score' first so they can be read at indices 0, 1, 2.
    tracks = [common.reorder(t, ["start", "end", "score"]) for t in tracks]
    fields = [f for f in tracks[0].fields if all([f in t.fields for t in tracks])]  # common fields
    # Current item of each track, as a mutable list (its start is edited below).
    elements = [list(x.next()) for x in tracks]
    # Computed from the full input list, before empty tracks are dropped.
    track_denom = 1.0 / len(trackList)

    if hasattr(method, "__call__"):
        # A custom function ignores the denominator.
        mean_fn = lambda scores, denom: method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Drop tracks that are empty (first item is already the sentinel).
    # Iterate backwards so that popping does not shift the remaining indices.
    for i in xrange(len(tracks) - 1, -1, -1):
        if elements[i][0] == sys.maxint:
            tracks.pop(i)
            elements.pop(i)

    def _stream(tracks):
        # `elements` is shared with the enclosing scope and mutated in place.
        while tracks:
            # Next output interval: from the smallest current start to the nearest
            # following boundary (another item's start, or any item's end).
            start = min([x[0] for x in elements])
            end = min([x[0] for x in elements if x[0] > start] + [x[1] for x in elements])
            # Scores of all current items overlapping [start, end).
            scores = [x[2] for x in elements if x[1] > start and x[0] < end]
            if len(fields) > 3:
                rest = []
                for i in range(len(fields[3:])):
                    # Non-None values of the extra field among overlapping items, as strings.
                    r = [str(x[3 + i]) for x in elements if not (x[3 + i] is None) and x[1] > start and x[0] < end]
                    # NOTE(review): if r is empty, all([]) is True and r[0] raises
                    # IndexError - confirm whether that case can occur.
                    if all([x == r[0] for x in r]):
                        rest.append(r[0])
                    else:
                        # Differing values are joined with '|'.
                        rest.append("|".join(r))
                yield (start, end, mean_fn(scores, track_denom)) + tuple(rest)
            else:
                yield (start, end, mean_fn(scores, track_denom))
            # Advance every track past the interval just emitted.
            for i in xrange(len(tracks) - 1, -1, -1):
                if elements[i][0] < end:
                    # Trim the part of the item that has been consumed.
                    elements[i][0] = end
                if elements[i][1] <= end:
                    # Item fully consumed: load the next one.
                    elements[i] = list(tracks[i].next())
                if elements[i][0] == sys.maxint:
                    # Sentinel reached: this track is exhausted.
                    tracks.pop(i)
                    elements.pop(i)

    return FeatureStream(_stream(tracks), fields)
Ejemplo n.º 4
0
def score_by_feature(trackScores, trackFeatures, method='mean'):
    """
    Compute, for each feature of *trackFeatures*, one summary score per score track.

    All scores falling inside a feature are collected (one value per base pair
    covered) and reduced with *method* (averaged by default).
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional `score` field
    for each stream in *trackScores*::
        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______


        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
    :param method: (str or function): operation applied to the list of scores from one feature.
        Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        # Score streams terminated by a sys.maxint sentinel row.
        sources = [common.sentinelize(track, [sys.maxint] * len(track.fields)) for track in ts]
        # Per-track buffer of score items that may still overlap upcoming features;
        # seeded with a dummy item located before everything.
        buffers = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
        start_idx = tf.fields.index('start')
        end_idx = tf.fields.index('end')
        if hasattr(method, '__call__'):
            # Custom reducers only receive the scores.
            mean_fn = lambda scores, denom: method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for feature in tf:
            ystart = feature[start_idx]
            yend = feature[end_idx]
            scores = ()
            for i, source in enumerate(sources):
                pending = buffers[i]
                latest = pending[-1]
                # Pull score items until one starts at or after the feature end,
                # keeping those that end after the feature start.
                while latest[0] < yend:
                    latest = source.next()
                    if latest[1] > ystart:
                        pending.append(latest)
                # Discard leading items that end before the feature starts.
                first_kept = 0
                while pending[first_kept][1] <= ystart:
                    first_kept += 1
                pending = pending[first_kept:]
                buffers[i] = pending
                per_base = []
                for item in pending:
                    if yend <= item[0]:
                        continue
                    # Clip the score item to the feature boundaries.
                    lo = ystart if item[0] < ystart else item[0]
                    hi = yend if yend < item[1] else item[1]
                    per_base.extend([item[2]] * (hi - lo))
                scores += (mean_fn(per_base, 1.0 / (yend - ystart)), )
            yield tuple(feature) + scores

    if not isinstance(trackScores, (list, tuple)):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    # Number the new score fields when there are several, or when 'score' is taken.
    numbered = len(trackScores) > 1 or 'score' in trackFeatures.fields
    _fields = ["score" + str(i) for i in range(len(trackScores))] if numbered else ["score"]
    _ts = [common.reorder(t, ['start', 'end', 'score']) for t in trackScores]
    return FeatureStream(_stream(_ts, trackFeatures),
                         trackFeatures.fields + _fields)
Ejemplo n.º 5
0
def filter_scores(trackScores,
                  trackFeatures,
                  method='sum',
                  strict=False,
                  annotate=False,
                  flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and features streams must be sorted! (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default (to avoid
    score duplications). An alternative is :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*, only scores inside
    a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        entirely contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        # Read the field names before rebinding `tf` to the sentinelized stream.
        tf_fields = tf.fields
        tf = common.sentinelize(tf, [sys.maxint] * len(tf_fields))
        # Positions of the feature fields that the score track does not have.
        info_idx = [k for k, f in enumerate(tf_fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index('strand')
            tf_strand_idx = tf_fields.index('strand')
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []  # features that may still intersect the current or later scores
        ynext = (-sys.maxint, -sys.maxint, 0.0)  # dummy item located before everything
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Remove features that end at or before x starts. Scores are sorted by
            # start, so such features cannot intersect any later score either.
            # (`ynext` must not be put into Y here: the loading loop above appends
            # it when its turn comes, and doing both yields duplicates.)
            Y = [y for y in Y if y[1] > xstart]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y): continue
                if strict and (y[0] > xstart or y[1] < xend): continue
                if y[0] >= xend: continue  # keep for next iteration
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    _info_fields = [
        f for f in trackFeatures.fields if f not in trackScores.fields
    ] if annotate else []
    stranded = 'strand' in (set(trackScores.fields)
                            & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ['start', 'end'])
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
Ejemplo n.º 6
0
 def _test(stream):
     """Reorder *stream* on the fields '2' and '1' and return the result."""
     # NOTE(review): presumably a helper exercising reorder() with unusual
     # field names - confirm against the enclosing test.
     wanted_fields = ['2', '1']
     return reorder(stream, fields=wanted_fields)
Ejemplo n.º 7
0
 def test_reorder(self):
     """reorder() must permute each item to follow the requested field order."""
     rows = [(10, 12, 0.5), (14, 15, 1.2)]
     source = fstream(rows, fields=['start', 'end', 'score'])
     reordered = reorder(source, ['end', 'score', 'start'])
     # Each (start, end, score) row becomes (end, score, start).
     self.assertListEqual(list(reordered), [(12, 0.5, 10), (15, 1.2, 14)])
Ejemplo n.º 8
0
def concatenate(trackList, fields=None, remove_duplicates=False, group_by=None, aggregate={}):
    """
    Returns one stream containing all features from a list of tracks, ordered by *fields*.

    If *trackList* contains a single track, that track is returned as is
    (the other arguments are then not applied).

    :param trackList: list of FeatureStream objects.
    :param fields: (list of str) list of fields to keep in the output (at least ['start','end']).
        Defaults to the fields of the first track. Only fields present in all tracks are kept.
    :param remove_duplicates: (bool) whether to remove items that are identical in several
        of the tracks in *trackList*. [False]
    :param group_by: (list of str) if specified, elements having all values for these fields in
        common will be merged into a single element. Other fields are merged according to *aggregate*
        if specified, or `common.generic_merge` by default.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all different values for this field in order to merge them.
        E.g. ``{'score': lambda x: sum(x)}`` will return the sum of all scores in the output.
    :rtype: FeatureStream
    """
    def _find_min(feat_tuple):
        """Return the index of the 'smallest' element amongst a tuple of features from
        different tracks. Priority is given to the first field; if the first field items
        are equal amongst several elements, it looks at the second field, a.s.o."""
        nmin = 0
        xmin = feat_tuple[0]
        for n,x in enumerate(feat_tuple[1:]):
            # Skip exhausted tracks (their current item is the sentinel).
            if x[0] == sys.maxint: continue
            for k in range(len(x)):
                # NOTE(review): values are compared through hash(), not directly;
                # for string fields (e.g. 'chr', 'name') hash order is not
                # lexical order - confirm this is intended.
                if cmp(hash(x[k]),hash(xmin[k]))<0:
                    xmin = x
                    nmin = n+1
                    break
                elif cmp(hash(x[k]),hash(xmin[k]))>0:
                    break
        return nmin

    def _weave(_t,N):
        """Generator yielding all features represented in a list of tracks *_t*,
        sorted w.r.t the *N* first fields."""
        # `current` holds the head item of every track, truncated to N fields.
        current = [x.next()[:N] for x in _t] # init
        allfields = [t.fields for t in _t]
        n = _find_min(current)
        last = current[n]
        current[n] = _t[n].next()[:N]
        # Without grouping, items are emitted as soon as they are selected;
        # with grouping, `last` is held back until a different group shows up.
        if not group_by: yield last
        while 1:
            # Remove duplicates
            if remove_duplicates:
                # Advance every track whose head item also heads another track,
                # until all head items are distinct.
                # NOTE(review): if two tracks are exhausted at once, both heads
                # are the sentinel; whether this loop then terminates depends on
                # what common.sentinelize yields after the sentinel - confirm.
                while not all([current.count(x)==1 for x in current]):
                    for k in range(len(current)):
                        if current.count(current[k]) > 1:
                            current[k] = _t[k].next()[:N]
            n = _find_min(current)
            # The smallest head being the sentinel means all tracks are exhausted.
            if current[n][0] == sys.maxint: break
            if group_by:
                idx = [allfields[n].index(f) for f in group_by]
                if all(current[n][i] == last[i] for i in idx):
                    # Same group: keep the group_by values, merge every other field.
                    last = tuple(current[n][i] if i in idx \
                            else aggregate.get(allfields[n][i],common.generic_merge)((last[i],current[n][i])) \
                            for i in range(len(allfields[n]))) # merge last and current
                else:
                    yield last
                    last = current[n]
            else:
                yield current[n]
            current[n] = _t[n].next()[:N]
        # Flush the group still being accumulated.
        if group_by: yield last

    if len(trackList) == 1: return trackList[0]
    if fields is None:
        fields = trackList[0].fields
    # Keep only the fields shared by all tracks.
    fields = [f for f in fields if all(f in t.fields for t in trackList)]
    # Output field order: ['chr',] 'start', 'end', ['name',] then the remaining fields.
    _of = ['start','end']
    if 'chr' in fields: _of = ['chr']+_of
    if 'name' in fields: _of += ['name']
    _of += [f for f in fields if not(f in _of)]
    tl = [common.reorder(t,_of) for t in trackList]
    # Terminate every track with a sys.maxint sentinel row.
    tl = [FeatureStream(common.sentinelize(x,(sys.maxint,)*len(x.fields)),x.fields) for x in tl]
    return FeatureStream(_weave(tl,len(_of)),fields=_of)
Ejemplo n.º 9
0
def getNearestFeature(features,
                      annotations,
                      thresholdPromot=2000,
                      thresholdInter=20000,
                      thresholdUTR=10):
    """
    For each element of *features*, searches the nearest element of *annotations* and returns
    a stream similar to *features*, with additional annotation fields, e.g.::

        ('chr5',12,14) -> ('chr5',12,14,'geneId|geneName','location_type','distance').

    If there are several genes, they are separated by '_': geneId1|geneName1_geneId2|geneName2.
    For each gene, `location_type` is one of:

    * 'Intergenic' if there are no genes within a distance `thresholdInter`,
    * 'Included' if the feature is included in the gene (or the gene in the feature),
    * 'Promot' if the feature is upstream and within `thresholdPromot` of the gene start,
    * 'Upstream' if the feature is upstream and beyond the promoter of the gene,
    * '3UTR' if the feature is downstream and within `thresholdUTR`% of the distance to the next downstream gene,
    * 'Downstream' otherwise.

    These annotations can be concatenated with '_' as well.
    The distance reported for a gene containing the feature is measured between the
    gene end and the feature end if the gene strand is -1, and between the feature
    start and the gene start otherwise; it is 0 if the gene lies inside the feature.
    For 'Intergenic' features the reported distance is `thresholdInter`.

    :param features: (FeatureStream) features track.
        If a list/tuple is given, only its first element is used.
    :param annotations: (FeatureStream) gene annotation track
        (e.g. as obtained with assembly.gene_track()).
        If a list/tuple is given, only its first element is used.
    :param thresholdPromot: (int) associates the promoter of each gene which promoter is within
        this distance of the feature. Above the threshold, associates only the closest. [2000]
    :param thresholdInter: (int) no gene beyond this distance will be considered. [20000]
    :param thresholdUTR: (int) in case the feature is surrounded by two eligible genes on the
        same strand: if distance to gene1's 3'UTR upstream is less than *thresholdUTR*% of the distance
        between gene1 and gene2, associated to 3'UTR of gene1, else to promoter of gene2. [10]
    :rtype: FeatureStream (..., str, str, str).

    ::

                 <--                   feat                    -->
             ______| thresholdPromot  ++++++   thresholdPromot |______
       -----|______|-------------------------------------------|______|----------
             gene 1                                             gene 2

                                         feat
             ______  thresholdInter     ++++++        thresholdInter   ______
       -----|______|----------...------------------...----------------|______|---
             gene 1                                                    gene 2

                          feat
            -->          ++++++               -->
            |______  10%             90%     |______
       -----|______|------|------------------|______|-----  (attributed to gene1)
             gene 1      thresholdUTR         gene 2
    """
    def _get_feature(_t, _a):
        # Items of _t are (start, end, ...); items of _a are (start, end, name, strand).
        F = []  # sliding buffer of annotations close to the current peak
        _a = common.sentinelize(_a, [sys.maxint] * len(_a.fields))
        for peak in _t:
            distMinBefore = distMinAfter = thresholdInter + 1
            gene = dist = typeLoc = ""
            geneBefore = geneAfter = strandBefore = strandAfter = None
            included = 0
            # keep only genes which don't start too far
            for annot in _a:
                F.append(annot)
                if annot[0] > peak[1] + thresholdInter: break
            # remove genes that end too far
            fpop = -1  # always keep one gene before
            for annot in F:
                if annot[1] > peak[0] - thresholdInter: break
                fpop += 1
            if fpop > 0: F = F[fpop:]
            for annot in F:
                # if the peak is totally included in the gene
                if (peak[0] >= annot[0]) and (annot[1] >= peak[1]):
                    includedGene = annot[2]
                    # A conditional expression is required here: the former
                    # `cond and a or b` form fell through to `b` whenever
                    # `a` was 0 (peak ending exactly at the gene end).
                    if annot[3] == -1:
                        includedDist = annot[1] - peak[1]
                    else:
                        includedDist = peak[0] - annot[0]
                    included = 1
                # if the gene is totally included in the peak
                elif (annot[0] > peak[0]) and (peak[1] > annot[1]):
                    includedGene = annot[2]
                    includedDist = 0
                    included = 1
                else:
                    # if annot is not too far 3' and no intersection
                    if 0 < (peak[0] - annot[1]) < distMinBefore:
                        distMinBefore = peak[0] - annot[1]
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if intersection (annot is before)
                    elif annot[0] < peak[0] < annot[1]:
                        distMinBefore = 0
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if annot is not too far 5' and no intersection
                    if 0 < (annot[0] - peak[1]) < distMinAfter:
                        distMinAfter = annot[0] - peak[1]
                        geneAfter = annot[2]
                        strandAfter = annot[3]
                    # if intersection (annot is after)
                    elif annot[0] < peak[1] < annot[1]:
                        distMinAfter = 0
                        geneAfter = annot[2]
                        strandAfter = annot[3]
            # detect intergenic peak
            if not included and distMinBefore > thresholdInter and distMinAfter > thresholdInter:
                yield peak + ('', 'Intergenic', thresholdInter)
                continue
            # detect peak before the first or after the last gene on the chromosome
            if geneBefore is None:
                if distMinAfter <= thresholdInter:
                    gene = geneAfter
                    dist = distMinAfter
                    typeLoc = "Upstream" if strandAfter == 1 else "Downstream"
            elif geneAfter is None:
                if distMinBefore <= thresholdInter:
                    gene = geneBefore
                    dist = distMinBefore
                    typeLoc = "Upstream" if strandBefore == -1 else "Downstream"
            # detect peak between two genes on the same strand
            elif strandBefore == strandAfter:
                if strandBefore == 1:
                    if thresholdUTR * distMinAfter > 100 * distMinBefore:
                        gene = geneBefore
                        dist = distMinBefore
                        if distMinAfter < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
                else:
                    if thresholdUTR * distMinBefore > 100 * distMinAfter:
                        gene = geneAfter
                        dist = distMinAfter
                        if distMinBefore < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
            # detect peak between two genes on different strands
            else:
                # detect peak between 2 promoters
                if strandBefore == -1:
                    typeLoc = "Upstream"
                    if distMinBefore < distMinAfter:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinAfter < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_" + geneAfter
                                dist = str(dist) + "_" + str(distMinAfter)
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinBefore < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_" + geneBefore
                                dist = str(dist) + "_" + str(distMinBefore)
                # detect peak between 2 3UTR
                else:
                    typeLoc = "Downstream"
                    # detect peak overlapping the 2 3UTR
                    if distMinBefore == distMinAfter:
                        if thresholdUTR * thresholdPromot > 100 * distMinBefore:
                            typeLoc = "3UTR"
                        # Same location type for both genes, e.g. "3UTR_3UTR".
                        typeLoc += "_" + typeLoc
                        gene = geneBefore + "_" + geneAfter
                        dist = str(distMinBefore) + "_" + str(distMinAfter)
                    elif distMinBefore < distMinAfter:
                        dist = distMinBefore
                        gene = geneBefore
                        if thresholdUTR * thresholdPromot > 100 * dist:
                            typeLoc = "3UTR"
                    else:
                        dist = distMinAfter
                        gene = geneAfter
                        if thresholdUTR * thresholdPromot > 100 * dist:
                            typeLoc = "3UTR"
            # Append the including/included gene (the last one found) to the result.
            if included == 1:
                gene += "_" + includedGene if gene else includedGene
                dist = str(dist)
                dist = dist + "_" + str(includedDist) if dist else str(
                    includedDist)
                typeLoc += "_Included" if typeLoc else "Included"
            yield peak + (gene, typeLoc, dist)

    if isinstance(features, (tuple, list)): features = features[0]
    if isinstance(annotations, (tuple, list)): annotations = annotations[0]
    features = common.reorder(features, ['start', 'end'])
    annot = common.reorder(annotations, ['start', 'end', 'name', 'strand'])
    _fields = features.fields + ['gene', 'location_type', 'distance']
    return FeatureStream(_get_feature(features, annot), fields=_fields)
Ejemplo n.º 10
0
def score_by_feature(trackScores, trackFeatures, method="mean"):
    """
    Attach to every feature of *trackFeatures* one aggregated score per score track.

    The scores covered by a feature are gathered base by base and passed to
    *method* (by default, they are averaged).
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional `score` field
    for each stream in *trackScores*::
        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______


        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
    :param method: (str or function): operation applied to the list of scores from one feature.
        Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """

    def _stream(ts, tf):
        # Each score stream gets a trailing sys.maxint sentinel row.
        readers = [common.sentinelize(trk, [sys.maxint] * len(trk.fields)) for trk in ts]
        # One look-back window per score track, seeded with a dummy item placed
        # before any real coordinate.
        windows = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
        start_idx = tf.fields.index("start")
        end_idx = tf.fields.index("end")
        if hasattr(method, "__call__"):
            # User-supplied reducers are called with the scores only.
            mean_fn = lambda scores, denom: method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for y in tf:
            ystart, yend = y[start_idx], y[end_idx]
            scores = ()
            for i, reader in enumerate(readers):
                window = windows[i]
                head = window[-1]
                # Read ahead until an item starts at or beyond the feature end;
                # keep the items ending after the feature start.
                while head[0] < yend:
                    head = reader.next()
                    if head[1] > ystart:
                        window.append(head)
                # Skip the leading items lying entirely before the feature.
                skip = 0
                while window[skip][1] <= ystart:
                    skip += 1
                window = window[skip:]
                windows[i] = window
                scores_y = []
                for s in window:
                    if yend <= s[0]:
                        continue
                    # Overlap of the score item with the feature.
                    clipped_start = max(s[0], ystart)
                    clipped_end = min(s[1], yend)
                    scores_y.extend([s[2]] * (clipped_end - clipped_start))
                scores += (mean_fn(scores_y, 1.0 / (yend - ystart)),)
            yield tuple(y) + scores

    if not isinstance(trackScores, (list, tuple)):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    # A single score track keeps the plain name unless 'score' already exists.
    if len(trackScores) == 1 and "score" not in trackFeatures.fields:
        _fields = ["score"]
    else:
        _fields = ["score" + str(i) for i in range(len(trackScores))]
    _ts = [common.reorder(t, ["start", "end", "score"]) for t in trackScores]
    return FeatureStream(_stream(_ts, trackFeatures), trackFeatures.fields + _fields)
Ejemplo n.º 11
0
def window_smoothing(trackList, window_size, step_size=1, stop_val=sys.maxint, featurewise=False):
    """
    Given a (list of) signal track(s) *trackList*, a *window_size* L (in base pairs by default,
    or in number of features if *featurewise* is True),  and a *step_size*,
    return as many signal tracks with, at each position p (multiple of *step_size*),
    the average score in the window [p-L/2, p+L/2]::

        X: __________666666666666____________
        R: ______12345666666666654321________ (not exact scores here)

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param window_size: (int) window size in bp (or in number of features if *featurewise*).
    :param step_size: (int) step length (one score returned per *step_size* positions). [1]
    :param stop_val: (int) sequence length. [sys.maxint]
    :param featurewise: (bool) bp (False), or number of features (True). [False]
    :rtype: FeatureStream, or a list of FeatureStream if *trackList* is a list or tuple.

    Example of windows, window_size=9, step_size=3:

    [0,1,2,3,4,5,6,7,8,9), [3,4,5,6,7,8,9,10,11,12), ...
    """

    def _stepping_mean(track, score, denom):
        # Feature-wise smoothing: average the scores of `window_size` consecutive features.
        # The second positional argument receives `win_start` from the caller below, but it
        # is overwritten right away, so its incoming value is never used.
        score = 0.0
        F = []  # buffer holding the features of the current window
        score = 0.0  # NOTE(review): duplicate of the assignment two lines above
        # Index of the middle feature of the window (integer division under Python 2).
        nmid = window_size / 2
        for x in track:
            F.append(x)
            score += x[2]
            if len(F) < window_size:
                continue
            # Emit the coordinates and extra fields of the middle feature, with the
            # window average rounded to 6 decimals.
            yield (F[nmid][0], F[nmid][1], round(score * denom + 1e-7, 6)) + F[nmid][3:]
            # Slide the window: drop the `step_size` oldest features from the running sum.
            # NOTE(review): F holds exactly `window_size` items here, so a `step_size`
            # larger than `window_size` makes F.pop(0) raise IndexError.
            for shift in xrange(step_size):
                score -= F.pop(0)[2]

    def _running_mean(track, win_start, denom):
        # Base-pair-wise smoothing: slide a window of `window_size` bp along the track and
        # yield (start, end, score, chrom) items.
        # NOTE(review): items have 4 elements while the wrapping FeatureStream below is
        # declared with the 3 fields of `_f` -- confirm this is what callers expect.
        score = 0.0  # current window average (already scaled by `denom`)
        F = []  # features buffered so far; F[0] is the oldest, F[-1] the newest
        for x in track:
            F.append(x)
            fstart = F[0][0]
            fend = F[0][1]
            # 4th element of the oldest buffered item; presumably the chromosome name,
            # which requires the reordered track to carry a 4th field -- TODO confirm.
            chrom = F[0][3]
            # Never start the window more than `window_size` before the oldest feature.
            win_start = max(win_start, fstart - window_size)
            win_end = win_start + window_size
            lstart = F[-1][0]
            lend = F[-1][1]
            # Advance the window until its right edge reaches the end of the newest feature.
            while win_end < lend:
                # `delta` is the change of the window sum for a 1 bp shift; `steps` collects
                # the distances to the next feature boundary met by either window edge.
                delta = 0
                steps = [fend - win_start, lend - win_end]
                if fstart > win_start:
                    steps.append(fstart - win_start)
                else:
                    # Left edge is inside the oldest feature: its score leaves the window.
                    delta -= F[0][2]
                if lstart > win_end:
                    steps.append(lstart - win_end)
                else:
                    # Right edge is inside the newest feature: its score enters the window.
                    delta += F[-1][2]
                # Distance over which `delta` stays constant.
                nsteps = min(steps)
                # Offsets to the next multiple of `step_size`, from `win_start` (sst) and
                # from the position reached after `nsteps` (sen).
                sst = -(win_start % step_size) % step_size
                sen = -(win_start + nsteps % step_size) % step_size
                win_center = (win_start + win_end) / 2
                if abs(delta) > 1e-11:
                    # Score changes linearly: emit one item per `step_size` positions.
                    delta *= denom
                    score += delta * sst
                    for step in xrange(sst, nsteps, step_size):
                        # Only positive scores lying inside [0, stop_val] are reported.
                        if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                            yield (win_center + step, win_center + step + step_size, score, chrom)
                        score += delta * step_size
                    # Take back the part advanced beyond `win_start + nsteps` (apparently
                    # the overshoot introduced by aligning on `step_size`).
                    score -= delta * sen
                else:
                    # Score is constant over the whole stretch: emit a single item.
                    if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                        yield (win_center + sst, win_center + sen + nsteps, score, chrom)
                win_start += nsteps
                win_end += nsteps
                # The oldest feature is now entirely behind the window: discard it.
                if fend <= win_start:
                    F.pop(0)
                    if F:
                        fstart = F[0][0]
                        fend = F[0][1]
                        win_start = max(win_start, fstart - window_size)
                        win_end = win_start + window_size
                        # This only leaves the inner `while`; the `for` loop goes on.
                        if win_end > stop_val:
                            break
        # The track is exhausted: keep sliding until every buffered feature has left the
        # window. Only the left edge can cross a boundary here, so scores can only decrease.
        while F:
            delta = 0
            steps = [fend - win_start]
            if fstart > win_start:
                steps.append(fstart - win_start)
            else:
                delta -= F[0][2]
            nsteps = min(steps)
            sst = -(win_start % step_size) % step_size
            sen = -(win_start + nsteps % step_size) % step_size
            win_center = (win_start + win_end) / 2
            if abs(delta) > 1e-11:
                delta *= denom
                score += delta * sst
                for step in xrange(sst, nsteps, step_size):
                    if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                        yield (win_center + step, win_center + step + step_size, score, chrom)
                    score += delta * step_size
                score -= delta * sen
            else:
                if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                    yield (win_center + sst, win_center + sen + nsteps, score, chrom)
            win_start += nsteps
            win_end += nsteps
            if fend <= win_start:
                F.pop(0)
                if F:
                    fstart = F[0][0]
                    fend = F[0][1]
                    win_start = max(win_start, fstart - window_size)
                    win_end = win_start + window_size
                    if win_end > stop_val:
                        break

    # Scores are multiplied by `denom` to turn window sums into window averages.
    denom = 1.0 / window_size
    # Initial window start, one full window before position 0.
    win_start = -window_size
    _f = ["start", "end", "score"]
    # Pick the smoothing generator according to the unit of `window_size`.
    if featurewise:
        call = _stepping_mean
    else:
        call = _running_mean
    if isinstance(trackList, (list, tuple)):
        # One smoothed stream per input track (the enumerate index `n` is not used).
        return [
            FeatureStream(call(common.reorder(t, _f), win_start, denom), fields=_f) for n, t in enumerate(trackList)
        ]
    else:
        return FeatureStream(call(common.reorder(trackList, _f), win_start, denom), fields=_f)
Ejemplo n.º 12
0
def filter_scores(trackScores, trackFeatures, method="sum", strict=False, annotate=False, flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and features streams must be sorted! (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default (to avoid
    score duplications). An alternative is :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*, only scores inside
    a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        strictly contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream with the fields of *trackScores* ('start' and 'end' first), followed,
        if *annotate* is True, by the fields found only in *trackFeatures*.
    """

    def _stream(ts, tf):
        # The sentinel item guarantees that tf.next() never runs dry inside the load loop.
        tf = common.sentinelize(tf, [sys.maxint] * len(tf.fields))
        # Positions of the feature fields that are absent from the score track.
        info_idx = [k for k, f in enumerate(tf.fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index("strand")
            tf_strand_idx = tf.fields.index("strand")
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []  # features that may still intersect the current or a later score item
        ynext = (-sys.maxint, -sys.maxint, 0.0)  # read-ahead feature, not yet in Y
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Drop every feature ending at or before the start of x: scores are sorted, so
            # such a feature cannot intersect any later item either. The read-ahead `ynext`
            # is deliberately NOT put into Y here: the load loop above appends it once x
            # reaches it, and inserting it early would make it appear (and be yielded) twice.
            Y = [y for y in Y if y[1] > xstart]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y):
                    continue
                if strict and (y[0] > xstart or y[1] < xend):
                    continue
                if y[0] >= xend:
                    continue  # keep for next iteration
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    # Collapse lists of streams into a single feature stream and a single score stream.
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    _info_fields = [f for f in trackFeatures.fields if f not in trackScores.fields] if annotate else []
    # Strand is only taken into account when both inputs provide it.
    stranded = "strand" in (set(trackScores.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    # _stream() reads start/end at positions 0 and 1 of every item.
    _ts = common.reorder(trackScores, ["start", "end"])
    _tf = common.reorder(_tf, ["start", "end"])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
Ejemplo n.º 13
0
 def test_reorder(self):
     # reorder() must permute every item so that it follows the requested field order.
     wanted_order = ['end','score','start']
     source = fstream([(10,12,0.5), (14,15,1.2)], fields=['start','end','score'])
     reordered = list(reorder(source, wanted_order))
     self.assertListEqual(reordered, [(12,0.5,10), (15,1.2,14)])
Ejemplo n.º 14
0
 def _test(stream):
     # Request the fields '2' then '1' from reorder() and hand back whatever it returns.
     requested = ['2','1']
     return reorder(stream, fields=requested)
Ejemplo n.º 15
0
def window_smoothing(trackList,
                     window_size,
                     step_size=1,
                     stop_val=sys.maxint,
                     featurewise=False):
    """
    Given a (list of) signal track(s) *trackList*, a *window_size* L (in base pairs by default,
    or in number of features if *featurewise* is True),  and a *step_size*,
    return as many signal tracks with, at each position p (multiple of *step_size*),
    the average score in the window [p-L/2, p+L/2]::

        X: __________666666666666____________
        R: ______12345666666666654321________ (not exact scores here)

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param window_size: (int) window size in bp (or in number of features if *featurewise*).
    :param step_size: (int) step length (one score returned per *step_size* positions). [1]
    :param stop_val: (int) sequence length. [sys.maxint]
    :param featurewise: (bool) bp (False), or number of features (True). [False]
    :rtype: FeatureStream, or a list of FeatureStream if *trackList* is a list or tuple.

    Example of windows, window_size=9, step_size=3:

    [0,1,2,3,4,5,6,7,8,9), [3,4,5,6,7,8,9,10,11,12), ...
    """
    def _stepping_mean(track, score, denom):
        # Feature-wise smoothing: average the scores of `window_size` consecutive features.
        # The second positional argument receives `win_start` from the caller below, but it
        # is overwritten right away, so its incoming value is never used.
        score = 0.0
        F = []  # buffer holding the features of the current window
        score = 0.0  # NOTE(review): duplicate of the assignment two lines above
        # Index of the middle feature of the window (integer division under Python 2).
        nmid = window_size / 2
        for x in track:
            F.append(x)
            score += x[2]
            if len(F) < window_size: continue
            # Emit the coordinates and extra fields of the middle feature, with the
            # window average rounded to 6 decimals.
            yield (F[nmid][0], F[nmid][1], round(score * denom + 1e-7,
                                                 6)) + F[nmid][3:]
            # Slide the window: drop the `step_size` oldest features from the running sum.
            # NOTE(review): F holds exactly `window_size` items here, so a `step_size`
            # larger than `window_size` makes F.pop(0) raise IndexError.
            for shift in xrange(step_size):
                score -= F.pop(0)[2]

    def _running_mean(track, win_start, denom):
        # Base-pair-wise smoothing: slide a window of `window_size` bp along the track and
        # yield (start, end, score, chrom) items.
        # NOTE(review): items have 4 elements while the wrapping FeatureStream below is
        # declared with the 3 fields of `_f` -- confirm this is what callers expect.
        score = 0.0  # current window average (already scaled by `denom`)
        F = []  # features buffered so far; F[0] is the oldest, F[-1] the newest
        for x in track:
            F.append(x)
            fstart = F[0][0]
            fend = F[0][1]
            # 4th element of the oldest buffered item; presumably the chromosome name,
            # which requires the reordered track to carry a 4th field -- TODO confirm.
            chrom = F[0][3]
            # Never start the window more than `window_size` before the oldest feature.
            win_start = max(win_start, fstart - window_size)
            win_end = win_start + window_size
            lstart = F[-1][0]
            lend = F[-1][1]
            # Advance the window until its right edge reaches the end of the newest feature.
            while win_end < lend:
                # `delta` is the change of the window sum for a 1 bp shift; `steps` collects
                # the distances to the next feature boundary met by either window edge.
                delta = 0
                steps = [fend - win_start, lend - win_end]
                # Left edge inside the oldest feature: its score leaves the window.
                if fstart > win_start: steps.append(fstart - win_start)
                else: delta -= F[0][2]
                # Right edge inside the newest feature: its score enters the window.
                if lstart > win_end: steps.append(lstart - win_end)
                else: delta += F[-1][2]
                # Distance over which `delta` stays constant.
                nsteps = min(steps)
                # Offsets to the next multiple of `step_size`, from `win_start` (sst) and
                # from the position reached after `nsteps` (sen).
                sst = -(win_start % step_size) % step_size
                sen = -(win_start + nsteps % step_size) % step_size
                win_center = (win_start + win_end) / 2
                if abs(delta) > 1e-11:
                    # Score changes linearly: emit one item per `step_size` positions.
                    delta *= denom
                    score += delta * sst
                    for step in xrange(sst, nsteps, step_size):
                        # Only positive scores lying inside [0, stop_val] are reported.
                        if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                            yield (win_center + step,
                                   win_center + step + step_size, score, chrom)
                        score += delta * step_size
                    # Take back the part advanced beyond `win_start + nsteps` (apparently
                    # the overshoot introduced by aligning on `step_size`).
                    score -= delta * sen
                else:
                    # Score is constant over the whole stretch: emit a single item.
                    if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                        yield (win_center + sst, win_center + sen + nsteps,
                               score, chrom)
                win_start += nsteps
                win_end += nsteps
                # The oldest feature is now entirely behind the window: discard it.
                if fend <= win_start:
                    F.pop(0)
                    if F:
                        fstart = F[0][0]
                        fend = F[0][1]
                        win_start = max(win_start, fstart - window_size)
                        win_end = win_start + window_size
                        # This only leaves the inner `while`; the `for` loop goes on.
                        if win_end > stop_val: break
        # The track is exhausted: keep sliding until every buffered feature has left the
        # window. Only the left edge can cross a boundary here, so scores can only decrease.
        while F:
            delta = 0
            steps = [fend - win_start]
            if fstart > win_start: steps.append(fstart - win_start)
            else: delta -= F[0][2]
            nsteps = min(steps)
            sst = -(win_start % step_size) % step_size
            sen = -(win_start + nsteps % step_size) % step_size
            win_center = (win_start + win_end) / 2
            if abs(delta) > 1e-11:
                delta *= denom
                score += delta * sst
                for step in xrange(sst, nsteps, step_size):
                    if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                        yield (win_center + step,
                               win_center + step + step_size, score, chrom)
                    score += delta * step_size
                score -= delta * sen
            else:
                if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                    yield (win_center + sst, win_center + sen + nsteps, score,
                           chrom)
            win_start += nsteps
            win_end += nsteps
            if fend <= win_start:
                F.pop(0)
                if F:
                    fstart = F[0][0]
                    fend = F[0][1]
                    win_start = max(win_start, fstart - window_size)
                    win_end = win_start + window_size
                    if win_end > stop_val: break

    # Scores are multiplied by `denom` to turn window sums into window averages.
    denom = 1.0 / window_size
    # Initial window start, one full window before position 0.
    win_start = -window_size
    _f = ['start', 'end', 'score']
    # Pick the smoothing generator according to the unit of `window_size`.
    if featurewise:
        call = _stepping_mean
    else:
        call = _running_mean
    if isinstance(trackList, (list, tuple)):
        # One smoothed stream per input track (the enumerate index `n` is not used).
        return [
            FeatureStream(call(common.reorder(t, _f), win_start, denom),
                          fields=_f) for n, t in enumerate(trackList)
        ]
    else:
        return FeatureStream(call(common.reorder(trackList, _f), win_start,
                                  denom),
                             fields=_f)
Ejemplo n.º 16
0
def neighborhood(trackList, before_start=None, after_end=None,
                 after_start=None, before_end=None, on_strand=False):
    """
    Replace every input feature by one or two windows defined relative to its
    boundaries. Which windows are produced depends on which of the four distances
    *before_start*, *after_end*, *after_start* and *before_end* are provided:

    * All four are given -- a pair of windows, one around each boundary::

         (start, end, ...) -> (start-before_start, start+after_start, ...)
                              (end-before_end, end+after_end, ...)

    * Otherwise, *before_start* and *after_start* are given -- window around the start::

         (start, end, ...) -> (start-before_start, start+after_start, ...)

    * Otherwise, *after_end* and *before_end* are given -- window around the end::

         (start, end, ...) -> (end-before_end, end+after_end, ...)

    * Otherwise, *before_start* and *after_end* are given -- extended feature::

         (start, end, ...) -> (start-before_start, end+after_end, ...)

    * With *on_strand* set to True, `start` and `end` are interpreted relative to the
      orientation of the feature, i.e. the roles are swapped on the reverse strand::

         (start, end, -1, ...) -> (start-after_end, start+before_end, -1, ...)
                                  (end-after_start, end+before_start, -1, ...)
         (start, end, +1, ...) -> (start-before_start, start+after_start, +1, ...)
                                  (end-before_end, end+after_end, +1, ...)

    If none of these combinations is available, no feature is produced.

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param before_start: (int) number of bp before the feature start.
    :param after_end: (int) number of bp after feature end.
    :param after_start: (int) number of bp after the feature start.
    :param before_end: (int) number of bp before the feature end.
    :param on_strand: (bool) True to respect strand orientation. [False]
    :rtype: FeatureStream, or a list of FeatureStream if *trackList* is a list or tuple.
    """
    def _generate_single(track,a,b,c,d):
        def _windows(x):
            # (start,end) pairs derived from feature x, for the first active case among
            # a (both boundaries), b (start only), c (end only), d (extended feature).
            if a:
                if on_strand and x[2]<0:
                    return [(x[0]-after_end,     x[0]+before_end+1),
                            (x[1]-after_start-1, x[1]+before_start)]
                return [(x[0]-before_start, x[0]+after_start+1),
                        (x[1]-before_end-1, x[1]+after_end)]
            if b:
                if on_strand and x[2]<0:
                    return [(x[1]-after_start-1, x[1]+before_start)]
                return [(x[0]-before_start, x[0]+after_start+1)]
            if c:
                if on_strand and x[2]<0:
                    return [(x[0]-after_end, x[0]+before_end+1)]
                return [(x[1]-before_end-1, x[1]+after_end)]
            if d:
                if on_strand and x[2]<0:
                    return [(x[0]-after_end, x[1]+before_start)]
                return [(x[0]-before_start, x[1]+after_end)]
            return []

        pending = []
        for x in track:
            for bounds in _windows(x):
                pending.append(bounds + x[2:])
            # Keep the buffer sorted and release the windows ending before the
            # start of the feature just read.
            pending.sort()
            while pending and pending[0][1] < x[0]:
                yield pending.pop(0)
        # Input exhausted: flush whatever is left, in sorted order.
        while pending:
            yield pending.pop(0)

    def _wrap(reordered):
        # Build the output stream for one track that was already reordered.
        return FeatureStream(_generate_single(reordered,case1,case2,case3,case4),
                             fields=reordered.fields)

    _fields = ['start','end','strand'] if on_strand else ['start','end']
    has_before_start = before_start is not None
    has_after_end = after_end is not None
    has_after_start = after_start is not None
    has_before_end = before_end is not None
    # A case is enabled only when every distance it relies on has been provided.
    case1 = has_before_start and has_after_end and has_after_start and has_before_end
    case2 = has_before_start and has_after_start
    case3 = has_after_end and has_before_end
    case4 = has_before_start and has_after_end
    if not isinstance(trackList,(list,tuple)):
        return _wrap(common.reorder(trackList,_fields))
    tl = [common.reorder(t,_fields) for t in trackList]
    return [_wrap(t) for t in tl]
Ejemplo n.º 17
0
def merge_scores(trackList, method='arithmetic'):
    """
    Creates a stream with per-base average of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    Only the fields of the first track that are present in every track are kept.
    For the fields beyond 'start', 'end' and 'score', the values of the overlapping items
    are converted to strings (None values are skipped): if they all agree the common value
    is kept, otherwise they are joined with '|'. If every overlapping item holds None for
    a field, the merged value is the empty string.

    :param trackList: list of FeatureStream objects.
    :param method: (str) type of average: one of 'arithmetic','geometric', or 'sum' (no average);
        a name missing from `_score_functions` falls back to the arithmetic mean.
        A callable is also accepted: it is then called with the list of overlapping scores.
    :rtype: FeatureStream
    """
    # A sentinel item (sys.maxint in every field) marks the end of each track.
    tracks = [
        FeatureStream(common.sentinelize(x, [sys.maxint] * len(x.fields)),
                      x.fields) for x in trackList
    ]
    tracks = [common.reorder(t, ['start', 'end', 'score']) for t in tracks]
    fields = [
        f for f in tracks[0].fields if all([f in t.fields for t in tracks])
    ]  # common fields
    # Current item of every track, as a list because its start is updated in place.
    elements = [list(x.next()) for x in tracks]
    # Averages are taken over the total number of tracks, even where some have no signal.
    track_denom = 1.0 / len(trackList)

    if hasattr(method, '__call__'):
        mean_fn = lambda scores, denom: method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Discard the tracks that are empty from the start (iterate backwards to pop safely).
    for i in xrange(len(tracks) - 1, -1, -1):
        if elements[i][0] == sys.maxint:
            tracks.pop(i)
            elements.pop(i)

    def _stream(tracks):
        # NOTE(review): extra fields are read by position (x[3 + i]), which presumably
        # assumes every reordered track lists them in the same order -- confirm.
        n_extra = len(fields) - 3
        while tracks:
            # Next homogeneous segment: from the smallest current start up to the closest
            # following boundary (another start, or any end).
            start = min([x[0] for x in elements])
            end = min([x[0] for x in elements if x[0] > start] +
                      [x[1] for x in elements])
            scores = [x[2] for x in elements if x[1] > start and x[0] < end]
            if n_extra > 0:
                rest = []
                for i in range(n_extra):
                    r = [
                        str(x[3 + i]) for x in elements if
                        not (x[3 + i] is None) and x[1] > start and x[0] < end
                    ]
                    # `r` is empty when the field is None in every overlapping item:
                    # guard the r[0] access and let the join produce ''.
                    if r and all([v == r[0] for v in r]):
                        rest.append(r[0])
                    else:
                        rest.append("|".join(r))
                yield (start, end, mean_fn(scores, track_denom)) + tuple(rest)
            else:
                yield (start, end, mean_fn(scores, track_denom))
            # Advance: trim the items that started before `end`, replace those that are
            # fully consumed, and drop the tracks that reached their sentinel.
            for i in xrange(len(tracks) - 1, -1, -1):
                if elements[i][0] < end:
                    elements[i][0] = end
                if elements[i][1] <= end:
                    elements[i] = list(tracks[i].next())
                if elements[i][0] == sys.maxint:
                    tracks.pop(i)
                    elements.pop(i)

    return FeatureStream(_stream(tracks), fields)
Ejemplo n.º 18
0
def getNearestFeature(features, annotations, thresholdPromot=2000, thresholdInter=20000, thresholdUTR=10):
    """
    For each element of *features*, searches the nearest element of *annotations* and returns
    a stream similar to *features*, with additional annotation fields, e.g.::

        ('chr5',12,14) -> ('chr5',12,14,'geneId|geneName','location_type','distance').

    If there are several genes, they are separated by '_': geneId1|geneName1_geneId2|geneName2.
    For each gene, `location_type` is one of:

    * 'Intergenic' if there are no genes within a distance `thresholdInter`,
    * 'Included' if the feature is included in the gene (or the gene in the feature),
    * 'Promot' if the feature is upstream and within `thresholdPromot` of the gene start,
    * 'Upstream' if the feature is upstream and beyond the promoter of the gene,
    * '3UTR' if the feature is downstream and within `thresholdUTR`% of the distance to the next downstream gene,
    * 'Downstream' otherwise.

    These annotations can be concatenated with '_' as well, and so can the distances.
    For an 'Intergenic' feature the gene is '' and the distance is `thresholdInter`.
    For a gene including the feature, the distance is the offset between the feature and the
    gene start with respect to the gene strand (feature start - gene start, or
    gene end - feature end on the reverse strand); it is 0 if the gene is inside the feature.

    If a list or tuple is given for *features* or *annotations*, only its first element is used.

    :param features: (FeatureStream) features track.
    :param annotations: (FeatureStream) gene annotation track
        (e.g. as obtained with assembly.gene_track()).
    :param thresholdPromot: (int) associates the promoter of each gene which promoter is within
        this distance of the feature. Above the threshold, associates only the closest. [2000]
    :param thresholdInter: (int) no gene beyond this distance will be considered. [20000]
    :param thresholdUTR: (int) in case the feature is surrounded by two eligible genes on the
        same strand: if distance to gene1's 3'UTR upstream is less than *thresholdUTR*% of the distance
        between gene1 and gene2, associated to 3'UTR of gene1, else to promoter of gene2. [10]
    :rtype: FeatureStream (..., str, str, str).

    ::

                 <--                   feat                    -->
             ______| thresholdPromot  ++++++   thresholdPromot |______
       -----|______|-------------------------------------------|______|----------
             gene 1                                             gene 2

                                         feat
             ______  thresholdInter     ++++++        thresholdInter   ______
       -----|______|----------...------------------...----------------|______|---
             gene 1                                                    gene 2

                          feat
            -->          ++++++               -->
            |______  10%             90%     |______
       -----|______|------|------------------|______|-----  (attributed to gene1)
             gene 1      thresholdUTR         gene 2
    """
    def _get_feature(_t,_a):
        # Items of _t are (start, end, ...); items of _a are (start, end, name, strand, ...).
        F = []  # annotations read so far that may still be close to the current peak
        _a = common.sentinelize(_a, [sys.maxint]*len(_a.fields))
        for peak in _t:
            distMinBefore = distMinAfter = thresholdInter+1
            gene = dist = typeLoc = ""
            geneBefore = geneAfter = strandBefore = strandAfter = None
            included = 0
            # keep only genes which don't start too far
            for annot in _a:
                F.append(annot)
                if annot[0] > peak[1]+thresholdInter: break
            # remove genes that end too far
            fpop = -1 # always keep one gene before
            for annot in F:
                if annot[1] > peak[0]-thresholdInter: break
                fpop += 1
            if fpop>0: F = F[fpop:]
            for annot in F:
                # if the peak is totally included in the gene
                if (peak[0]>=annot[0]) and (annot[1]>=peak[1]):
                    includedGene = annot[2]
                    # A conditional expression is required here: with `cond and a or b`,
                    # a distance of 0 on the reverse strand is falsy and would be replaced
                    # by the forward-strand distance.
                    includedDist = annot[1]-peak[1] if annot[3] == -1 else peak[0]-annot[0]
                    included = 1
                # if the gene is totally included in the peak
                elif (annot[0]>peak[0]) and (peak[1]>annot[1]):
                    includedGene = annot[2]
                    includedDist = 0
                    included = 1
                else:
                    # if annot is not too far 3' and no intersection
                    if 0 < (peak[0]-annot[1]) < distMinBefore:
                        distMinBefore = peak[0]-annot[1]
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if intersection (annot is before)
                    elif annot[0] < peak[0] < annot[1]:
                        distMinBefore = 0
                        geneBefore = annot[2]
                        strandBefore = annot[3]
                    # if annot is not too far 5' and no intersection
                    if 0 < (annot[0]-peak[1]) < distMinAfter:
                        distMinAfter = annot[0]-peak[1]
                        geneAfter = annot[2]
                        strandAfter = annot[3]
                    # if intersection (annot is after)
                    elif annot[0] < peak[1] < annot[1]:
                        distMinAfter = 0
                        geneAfter = annot[2]
                        strandAfter = annot[3]
            # detect intergenic peak
            if not(included) and distMinBefore > thresholdInter and distMinAfter > thresholdInter:
                yield peak+('','Intergenic',thresholdInter)
                continue
            # detect peak before the first or after the last gene on the chromosome
            if geneBefore is None:
                if distMinAfter <= thresholdInter:
                    gene = geneAfter
                    dist = distMinAfter
                    typeLoc = "Upstream" if strandAfter == 1 else "Downstream"
            elif geneAfter is None:
                if distMinBefore <= thresholdInter:
                    gene = geneBefore
                    dist = distMinBefore
                    typeLoc = "Upstream" if strandBefore == -1 else "Downstream"
            # detect peak between two genes on the same strand
            elif strandBefore == strandAfter:
                if strandBefore == 1:
                    if thresholdUTR*distMinAfter > 100*distMinBefore:
                        gene = geneBefore
                        dist = distMinBefore
                        if distMinAfter < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
                else:
                    if thresholdUTR*distMinBefore > 100*distMinAfter:
                        gene = geneAfter
                        dist = distMinAfter
                        if distMinBefore < thresholdPromot:
                            typeLoc = "3UTR"
                        else:
                            typeLoc = "Downstream"
                    else:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                        else:
                            typeLoc = "Upstream"
            # detect peak between two genes on different strands
            else:
                # detect peak between 2 promoters
                if strandBefore == -1:
                    typeLoc = "Upstream"
                    if distMinBefore < distMinAfter:
                        gene = geneBefore
                        dist = distMinBefore
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinAfter < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_"+geneAfter
                                dist = str(dist)+"_"+str(distMinAfter)
                    else:
                        gene = geneAfter
                        dist = distMinAfter
                        if dist < thresholdPromot:
                            typeLoc = "Promot"
                            if distMinBefore < thresholdPromot:
                                typeLoc += "_Promot"
                                gene += "_"+geneBefore
                                dist = str(dist)+"_"+str(distMinBefore)
                # detect peak between 2 3UTR
                else:
                    typeLoc = "Downstream"
                    # detect peak overlapping the 2 3UTR
                    if distMinBefore == distMinAfter:
                        if thresholdUTR*thresholdPromot > 100*distMinBefore:
                            typeLoc = "3UTR"
                        # one location type per gene, e.g. "3UTR_3UTR"
                        typeLoc += "_"+typeLoc
                        gene = geneBefore+"_"+geneAfter
                        dist = str(distMinBefore)+"_"+str(distMinAfter)
                    elif distMinBefore < distMinAfter:
                        dist = distMinBefore
                        gene = geneBefore
                        if thresholdUTR*thresholdPromot > 100*dist:
                            typeLoc = "3UTR"
                    else:
                        dist = distMinAfter
                        gene = geneAfter
                        if thresholdUTR*thresholdPromot > 100*dist:
                            typeLoc = "3UTR"
            # append the including (or included) gene to whatever was found around the peak
            if included == 1:
                gene += "_"+includedGene if gene else includedGene
                dist = str(dist)
                dist = dist+"_"+str(includedDist) if dist else str(includedDist)
                typeLoc += "_Included" if typeLoc else "Included"
            yield peak+(gene,typeLoc,dist)
    if isinstance(features,(tuple,list)): features = features[0]
    if isinstance(annotations,(tuple,list)): annotations = annotations[0]
    # _get_feature() addresses the items by position, hence the fixed leading fields.
    features = common.reorder(features,['start','end'])
    annot = common.reorder(annotations,['start','end','name','strand'])
    _fields = features.fields+['gene','location_type','distance']
    return FeatureStream(_get_feature(features,annot),fields=_fields)