Exemple #1
0
def shuffled(stream, chrlen=sys.maxint, repeat_number=1, sorted=True):
    """Return a stream of randomly located features of the same length and annotation
    as those of the original stream.

    Each random start is drawn uniformly in ``[0, chrlen - feature_length]`` so that
    the relocated feature never extends beyond *chrlen*.

    :param stream: FeatureStream object.
    :param chrlen: (int) chromosome length. [sys.maxint]
    :param repeat_number: (int) *repeat_number* random features are yielded per input feature. [1]
    :param sorted: (bool) whether or not to sort the output stream. [True]
    :rtype: FeatureStream
    """
    import random
    _f = ['start', 'end']
    features = reorder(stream, _f)

    def _shuffled(_s):
        for feat in _s:
            feat_len = feat[1] - feat[0]
            # The valid range of start positions depends on this feature's own
            # length, so positions are drawn per feature instead of being taken
            # from a cache that was filled using another feature's length.
            max_start = chrlen - feat_len
            for _ in xrange(repeat_number):
                start = random.randint(0, max_start)
                yield (start, start + feat_len) + feat[2:]

    if sorted:
        return sorted_stream(FeatureStream(_shuffled(features),
                                           features.fields),
                             fields=_f)
    else:
        return FeatureStream(_shuffled(features), features.fields)
Exemple #2
0
def map_chromosomes(stream, chromosomes, keep=False):
    """
    Translate the chromosome identifiers in *stream* into chromosome names of the type 'chr5'.

    A stream without a 'chr' field is returned unchanged.

    :param stream: FeatureStream object.
    :param chromosomes: a dictionary of chromosomes, such as `genrep.Assembly.chromosomes`.
        Keys are indexed as ``k[0]``, ``k[1]``, ``k[2]`` (presumably id, accession and
        version -- confirm against the caller); values are dicts with a 'name' and
        an optional comma-separated 'synonyms' entry.
    :param keep: (bool) keep all features (True) or only those which chromosome identifier
        is recognized (False). [False]
    :rtype: FeatureStream
    """
    if 'chr' not in stream.fields:
        return stream
    ic = stream.fields.index('chr')
    chrom_map = {}
    for k, c in chromosomes.iteritems():
        cname = c['name']
        chrom_map[cname] = cname  # {'chrIV': 'chrIV'}
        if cname.startswith('chr') and len(cname) > 3:
            chrom_map[cname[3:]] = cname  # {'IV': 'chrIV'}
        chrom_map[k[0]] = cname  # {2780: 'chrIV'}
        chrom_map[str(k[1]) + "." +
                  str(k[2])] = cname  # {'NC_001136.9': 'chrIV'}
        chrom_map[str(k[0]) + "_" + str(k[1]) + "." +
                  str(k[2])] = cname  # {'2780_NC_001136.9': 'chrIV'}
        if c.get('synonyms'):
            for s in c['synonyms'].split(','):
                chrom_map[s] = cname  # {synonym: 'chrIV'}
    if keep:
        # Unrecognized identifiers are passed through unchanged: the fallback
        # is the feature's own 'chr' value, not its first column.
        return FeatureStream(
            (x[:ic] + (chrom_map.get(x[ic], x[ic]), ) + x[ic + 1:]
             for x in stream), stream.fields)
    else:
        return FeatureStream((x[:ic] + (chrom_map[x[ic]], ) + x[ic + 1:]
                              for x in stream if x[ic] in chrom_map),
                             stream.fields)
Exemple #3
0
def score_threshold(stream,
                    threshold=0.0,
                    lower=False,
                    strict=False,
                    fields='score'):
    """
    Keep only the features whose value(s) in *fields* lie on the requested side
    of *threshold*.

    :param stream: FeatureStream, or list of FeatureStream objects.
    :param threshold: (float) bound against which the field values are compared.
    :param lower: (bool) if True keep values below the bound, otherwise above it.
    :param strict: (bool) if True the bound itself is excluded.
    :param fields: (str or list of str) names of the fields that must all pass the test.
    :rtype: FeatureStream, or list of FeatureStream objects
    """
    if isinstance(fields, (list, tuple)):
        names = fields
    else:
        names = [fields]

    def _passing(source):
        compare = operator.gt if strict else operator.ge
        # Multiplying both sides by -1 turns a "lower than" test into "greater than".
        sign = -1 if lower else 1
        columns = [source.fields.index(name) for name in names]
        for feature in source:
            verdicts = [compare(sign * feature[col], sign * threshold)
                        for col in columns]
            if all(verdicts):
                yield feature

    if not isinstance(stream, (list, tuple)):
        return FeatureStream(_passing(stream), fields=stream.fields)
    filtered = []
    for single in stream:
        filtered.append(FeatureStream(_passing(single), fields=single.fields))
    return filtered
Exemple #4
0
def sorted_stream(stream,
                  chrnames=[],
                  fields=['chr', 'start', 'end'],
                  reverse=False):
    """Sorts a stream according to *fields* values. Will load the entire stream in memory.
    The order of names in *chrnames* is used to sort the 'chr' field if available.

    Fields listed in *fields* but absent from the stream are ignored. Chromosome
    names not found in *chrnames* are compared by their raw value. Ties are broken
    by the original position of the feature in the stream.

    :param stream: FeatureStream object.
    :param chrnames: list of chromosome names.
    :param fields: list of field names. [['chr','start','end']]
    :param reverse: reverse order. [False]
    :rtype: FeatureStream
    """
    sort_cols = [stream.fields.index(f) for f in fields if f in stream.fields]
    # Locate 'chr' in the stream itself: its position in *fields* is not a valid
    # index into sort_cols once fields missing from the stream have been dropped.
    chr_col = None
    if 'chr' in fields and 'chr' in stream.fields:
        chr_col = stream.fields.index('chr')
    # Rank lookup table; setdefault keeps the first occurrence, like list.index.
    chr_rank = {}
    for rank, name in enumerate(chrnames):
        chr_rank.setdefault(name, rank)
    feature_list = list(stream)
    sort_list = []
    for n, f in enumerate(feature_list):
        key = tuple(chr_rank.get(f[i], f[i]) if i == chr_col else f[i]
                    for i in sort_cols)
        # The trailing index both breaks ties and lets us recover the feature.
        sort_list.append(key + (n, ))
    sort_list.sort(reverse=reverse)
    return FeatureStream((feature_list[t[-1]] for t in sort_list),
                         stream.fields)
Exemple #5
0
def combine(trackList, fn, win_size=1000, aggregate={}):
    """
    Applies a custom function to a list of tracks, such as union, intersection,
    etc., and return a single result track. The input streams need to be ordered
    w.r.t 'chr', 'start' and 'end'. To be applied chromosome by chromosome.

    Only fields of the first track are kept. Values for a common field are
    merged by default according to `common.strand_merge`,`common.no_merge` and `common.generic_merge`,
    respectively for strand, chromosome and all others.

    :param trackList: list of FeatureStream objects.
    :param fn: boolean function to apply, such as bbcflib.gfminer.stream.union,
        or the name of such a function as a string.
    :param win_size: (int) window size, in bp.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all trackList's values for this field in order
        to merge them. E.g. ``{'score': lambda x: sum(x)/len(x)}`` will return the average of
        all *trackList*'s scores in the output. The dictionary is not modified.
    :rtype: FeatureStream
    """
    # Work on a copy: adding the defaults must neither alter the caller's dict
    # nor leak into the shared default argument between calls.
    aggregate = dict(aggregate)
    aggregate.setdefault('strand',common.strand_merge)
    aggregate.setdefault('chr',common.no_merge)
    _f = ['start','end']
    if all('chr' in t.fields for t in trackList):
        _f += ['chr']
    if isinstance(fn,str):
        # Allows e.g. combine(...,fn='intersection').
        # NOTE(security): eval executes arbitrary code; never pass a string
        # coming from untrusted input here.
        fn = eval(fn)
    trackList = [common.cobble(common.reorder(t,fields=_f)) for t in trackList]
    return common.fusion(FeatureStream(_combine(trackList,fn,win_size,aggregate),
                                       fields=trackList[0].fields))
Exemple #6
0
def apply(stream, fields, functions):
    """
    Transform the values of selected fields, leaving every other field as is.

    :param stream: FeatureStream object.
    :param fields: (str or list of str) names of the fields to transform.
    :param functions: a function, or a list of functions, matched one-to-one
        with *fields*.
    :rtype: FeatureStream
    """
    if isinstance(fields, str):
        fields = [fields]
    if hasattr(functions, '__call__'):
        functions = [functions]
    assert len(fields) == len(functions), \
        "The number of fields does not equal the number of functions."

    def _transformed(source):
        width = len(source.fields)
        columns = [source.fields.index(name) for name in fields]
        by_column = dict(zip(columns, functions))
        identity = lambda value: value
        # One converter per column; untouched columns pass through unchanged.
        converters = [by_column.get(col, identity) for col in range(width)]
        for row in source:
            yield tuple([converters[col](row[col]) for col in range(width)])

    return FeatureStream(_transformed(stream), fields=stream.fields)
Exemple #7
0
    def __call__(self, **kw):
        """Combine the input tracks together with an extra whole-chromosome track.

        A temporary track made of one feature per chromosome (spanning
        ``0..length``) is written in the same format as the first input track
        and prepended to ``kw['TrackMulti']['tracks']`` before calling
        `_combine` with ``self._func``.

        :param kw: plugin parameters; ``kw['TrackMulti']['tracks']`` is the list
            of input track paths.
        :return: the value of ``self.display_time()``.
        """
        # Create a track with the whole chromosome
        chrmeta = _get_chrmeta(**kw)
        sig0 = track(kw['TrackMulti']['tracks'][0])
        fmt = sig0.format
        is_chr = 'chr' in sig0.fields
        _f0 = ('chr', 'start', 'end') if is_chr else ('start', 'end')
        _f1 = [f for f in sig0.fields if f not in _f0]
        # The rows below are laid out coordinates-first, so the field list must
        # follow the same order in both cases (with or without 'chr').
        fields = list(_f0) + _f1
        padding = ('0', ) * len(_f1)
        if is_chr:
            whole_chr = [(chrom, 0, chrmeta[chrom]['length']) + padding
                         for chrom in chrmeta]
        else:
            whole_chr = [(0, chrmeta[chrom]['length']) + padding
                         for chrom in chrmeta]
        whole_chr = FeatureStream(whole_chr, fields=fields)
        temp = self.temporary_path() + '.' + fmt
        with track(temp, fields=fields) as wc:
            wc.write(whole_chr)

        kw['TrackMulti']['tracks'] = [temp] + kw['TrackMulti']['tracks']
        output = self.temporary_path(fname='combined.')
        output = _combine(self._func, output, **kw)
        self.new_file(output, 'combined')
        return self.display_time()
Exemple #8
0
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """Plot, for each group, the average signal profile around every motif.

    For every group id in *bedlist*, the motif occurrences are read from the
    bed file, grouped by (motif name, motif length), segmented into bins with
    flanking regions, scored against each signal track, and averaged over the
    number of occurrences. One line plot per motif is appended to a PDF, and
    the averaged matrix is also written to a tab-delimited text file.

    :param ex: not used in this function body.
    :param bedlist: dict {group id: path to a bed file of motif occurrences}.
        The 4th column is split on ':'; presumably "<motif name>:<sequence>" -- confirm.
    :param signals: dict {group id: list of track objects} providing `.name` and `.read(chrom)`.
    :param chrnames: iterable of chromosome names to process.
    :param groups: not used in this function body.
    :param logfile: writable file-like object for progress messages.
    :return: dict {group id: {'pdf': pdf file name, 'mat': [(motif name, data file name), ...]}}
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        #        signals = [track(sig) for sig in siglist[gid]]
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed, format='bed')
        # data: {(motif name, motif length): matrix of summed scores, bins x signals}
        data = {}
        # numregs: {(motif name, motif length): number of regions accumulated}
        numregs = {}
        for chrom in chrnames:
            # Group the (start, end) of motif occurrences on this chromosome by motif key.
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                key = (r2[0], len(r2[1]))
                if key in fread: fread[key].append(r[1:3])
                else: fread[key] = [r[1:3]]
            for motif, regs in fread.iteritems():
                if motif not in data:
                    # One row per motif position plus _plot_flank[1] rows on each side.
                    data[motif] = zeros(shape=(motif[1] + 2 * _plot_flank[1],
                                               len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(
                    segment_features(FeatureStream(regs,
                                                   fields=['start', 'end']),
                                     nbins=motif[1],
                                     upstream=_plot_flank,
                                     downstream=_plot_flank))
                # t[2] is used as the row (bin) index and t[3:] as the per-signal scores.
                for t in score_by_feature(
                    [s.read(chrom) for s in signals[gid]], tFeat):
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        # `new` is True only for the first plot, `last` reaches 0 on the final one;
        # both are forwarded to lineplot below.
        new = True
        last = len(data)
        for motif, dat in data.iteritems():
            last -= 1
            mname, nbins = motif
            # Turn the summed scores into an average per region (in place).
            dat /= float(numregs[motif])
            # X axis labels: negative offsets for the upstream flank, 1..nbins
            # (as strings) for the motif itself, then the downstream flank.
            X = range(-_plot_flank[1], _plot_flank[1] + nbins)
            for k in range(nbins):
                X[k + _plot_flank[1]] = str(k + 1)
            ####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4, 2],
                     output=files[gid]['pdf'],
                     new=new,
                     last=(last == 0),
                     legend=snames,
                     main=mname)
            new = False
            # Dump the averaged profile: header row of X labels, then one row per signal.
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x)
                                                for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
Exemple #9
0
def sentinelize(stream, sentinel=sys.maxint):
    """Return a stream yielding every item of *stream* followed by one final
    *sentinel*, so that consumers can detect the end without hitting StopIteration."""
    def _with_terminator(items):
        for element in items:
            yield element
        # Emitted exactly once, after the source is exhausted.
        yield sentinel

    terminated = _with_terminator(stream)
    return FeatureStream(terminated, fields=stream.fields)
Exemple #10
0
 def _filter_deconv(stream, pval):
     """Keep the features whose trailing ``;FERR=<value>`` annotation is at most *pval*.

     Each retained feature is replaced by a 300-wide window centered on the
     middle of the original one: ``(x[0], center-150, center+150) + x[3:]``.
     Features whose 4th column does not end with a numeric ``;FERR=`` entry
     are skipped.

     :param stream: FeatureStream; columns 1 and 2 are used as coordinates and
         column 3 as the annotation string.
     :param pval: (float) maximum accepted FERR value.
     :rtype: FeatureStream
     """
     ferr = re.compile(r';FERR=([\d\.]+)$')

     def _kept(features):
         for x in features:
             hit = ferr.search(x[3])
             # No usable FERR annotation: drop the feature instead of failing
             # on a None match object.
             if hit is None:
                 continue
             if float(hit.group(1)) <= pval:
                 center = (x[2] + x[1]) / 2
                 yield (x[0], ) + (center - 150, center + 150) + x[3:]

     return FeatureStream(_kept(stream), fields=stream.fields)
Exemple #11
0
def unroll(stream, regions, fields=['score']):
    """Creates a stream of *end*-*start* items with appropriate *fields* values at every base position.
    For example, ``unroll([(10,12,0.5,'a'), (14,15,1.2,'b')], regions=(9,16))`` returns::

        FeatureStream([(0,),(0.5,'a'),(0.5,'a'),(0,),(0,),(1.2,'b'),(0,)])
                        9      10        11      12   13     14      15

    Positions not covered by any feature yield ``(0, None, ...)`` (one entry per
    name in *fields*). An empty list of regions yields an empty stream.

    :param stream: FeatureStream object.
    :param regions: either a pair (start,end) or an ordered list of such pairs or a FeatureStream
        interpreted as bounds of the region(s) to return. Regions with more than two
        elements are read as (chr,start,end).
    :param fields: list of field names **in addition to 'start','end'**. [['score']]
    :rtype: FeatureStream
    :raises ValueError: if *regions* is neither a list/tuple nor a FeatureStream.
    """
    # Always work with a list so that it can be concatenated below, even when
    # a tuple of names was given.
    if isinstance(fields, (list, tuple)): fields = list(fields)
    else: fields = [fields]
    with_chrom = False
    if isinstance(regions, (list, tuple)):
        if regions:  # nothing to inspect in an empty list of regions
            if not isinstance(regions[0], (list, tuple)): regions = [regions]
            if len(regions[0]) > 2: with_chrom = True
        regions = iter(regions)
    elif isinstance(regions, FeatureStream):
        _f = ['start', 'end']
        if 'chr' in regions.fields:
            _f = ['chr'] + _f
            with_chrom = True
        regions = reorder(regions, _f)
    else:
        raise ValueError("regions: Expected tuple or FeatureStream, got %s." %
                         type(regions))
    if with_chrom:
        s = reorder(stream, ['start', 'end', 'chr'] + fields)
        nf = 3
    else:
        s = reorder(stream, ['start', 'end'] + fields)
        nf = 2
    # Filler emitted at positions without any feature.
    item0 = (0, ) + (None, ) * (len(fields) - 1)

    def _unr(s):
        for reg in regions:
            if with_chrom:
                chrom, pos, end = reg[:3]
            else:
                chrom = None
                pos, end = reg[:2]
            # NOTE(review): *s* is consumed across regions, so the feature read
            # when a region ends is not seen again by the next region.
            for x in s:
                if chrom and not (x[2] == chrom): continue
                if x[1] <= pos: continue  # feature ends before the current position
                while pos < min(x[0], end):  # gap before the feature
                    yield item0
                    pos += 1
                while pos < min(x[1], end):  # positions covered by the feature
                    yield x[nf:]
                    pos += 1
                if pos >= end: break
            while pos < end:  # pad the rest of the region
                yield item0
                pos += 1

    return FeatureStream(_unr(s), fields=s.fields[nf:])
Exemple #12
0
        def _shift(stream, shift):
            """Return a stream in which the 'start' and 'end' of every feature
            are moved by *shift*; all other fields are left unchanged."""
            start_col = stream.fields.index('start')
            end_col = stream.fields.index('end')
            # Work with the two coordinate columns in left-to-right order.
            first, second = min(start_col, end_col), max(start_col, end_col)

            def _moved(rows):
                for row in rows:
                    head = row[:first]
                    middle = row[first + 1:second]
                    tail = row[second + 1:]
                    yield (head + (row[first] + shift,) + middle +
                           (row[second] + shift,) + tail)

            return FeatureStream(_moved(stream), fields=stream.fields)
Exemple #13
0
def add_name_field(stream):
    """
    Append a 'name' field of the form "chr:start-end" to every record of the stream.
    """
    chr_col, start_col, end_col = [stream.fields.index(f)
                                   for f in ('chr', 'start', 'end')]
    out_fields = stream.fields + ['name']

    def _named(records):
        for rec in records:
            label = "%s:%i-%i" % (rec[chr_col], rec[start_col], rec[end_col])
            yield rec + (label, )

    return FeatureStream(_named(stream), fields=out_fields)
Exemple #14
0
    def __call__(self, **kw):
        """Write a track annotating genes, exons or transcripts of an assembly.

        If ``kw['ids_list']`` is given, it is the path of a file with one
        identifier per line; each known identifier is annotated and unknown
        ones get a placeholder row. Otherwise every feature of the requested
        type is written.

        :param kw: plugin parameters: 'assembly', 'format', 'feature_type'
            (one of 'genes', 'exons', 'transcripts') and optionally 'ids_list'.
        :return: the value of ``self.display_time()``.
        :raises ValueError: if 'feature_type' is not one of the supported values.
        """
        assembly = genrep.Assembly(kw.get('assembly'))
        format = kw['format']
        feature_type = kw['feature_type']
        if feature_type == 'genes':
            mapping = assembly.get_gene_mapping()
            get_info = self.genes_annot
        elif feature_type == 'exons':
            mapping = assembly.get_exon_mapping()
            get_info = self.exons_annot
        elif feature_type == 'transcripts':
            mapping = assembly.get_transcript_mapping()
            get_info = self.trans_annot
        else:
            # Fail early with a clear message rather than later on an
            # undefined mapping / annotation function.
            raise ValueError(
                "Unknown feature_type %r: expected 'genes', 'exons' or "
                "'transcripts'." % (feature_type, ))

        def _annotate(ids_list):
            with open(ids_list) as ids_file:
                for line in ids_file:
                    ident = line.strip()
                    info = mapping.get(ident)
                    if info:
                        yield get_info(ident, info)
                    else:
                        # Placeholder row for identifiers absent from the mapping.
                        yield ('NA', '0', '0', ident, 0.0, '0')

        ids_list = kw.get('ids_list')
        fields = ['chr', 'start', 'end', 'name', 'score', 'strand']
        if ids_list:
            assert os.path.exists(
                str(ids_list)), "File not found: '%s'" % ids_list
            fulltrack = FeatureStream(_annotate(ids_list), fields=fields)
            fname = os.path.splitext(os.path.basename(ids_list))[0]
        else:
            fulltrack = FeatureStream((get_info(g, mapping[g]) for g in mapping),
                                      fields=fields)
            fname = feature_type
        output = self.temporary_path(fname=fname + '.' + format)
        out = track(output, chrmeta=assembly)
        try:
            out.write(fulltrack)
        finally:
            # Make sure the output is released even if writing fails.
            out.close()
        self.new_file(output, 'fulltrack')
        return self.display_time()
Exemple #15
0
    def _join_macs(stream, xlsl, _f):
        """Extend each feature of *stream* with columns looked up in *xlsl*.

        The 4th column of a feature holds one or several '|'-separated peak
        tags; one output row is produced per tag. The tag is reduced to a peak
        label by dropping a fixed-length prefix (13 characters of the first
        ';'-separated part for tags starting with "ID=", 10 characters
        otherwise). With a single table in *xlsl* the label is the 1-based row
        number; with several tables it is "row:table".
        """
        def _peak_label(tag):
            if tag[:3] == "ID=":
                return tag.split(";")[0][13:]
            return tag[10:]

        def _macs_row(features):
            for feat in features:
                for tag in feat[3].split("|"):
                    label = _peak_label(tag)
                    if len(xlsl) == 1:
                        row = int(label)
                        yield feat + xlsl[0][row - 1][1:]
                    else:
                        parts = label.split(":")
                        table = xlsl[int(parts[1])]
                        yield feat + table[int(parts[0]) - 1][1:]

        return FeatureStream(_macs_row(stream), fields=_f)
Exemple #16
0
def overlap(trackList,trackFeatures,strict=False,annotate=False,flatten=common.cobble):
    """
    For each stream in *trackList*, keep only items overlapping at least one element
    of *trackFeatures*.  The input streams need to be ordered w.r.t 'chr', 'start' and 'end'.
    To be applied chromosome by chromosome. If several tracks are given in either trackList
    or trackFeatures, they will be concatenated into one.

    :param trackList: FeatureStream - the elements to be filtered.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param trackFeatures: FeatureStream - the filter.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param strict: (bool) if True, only score regions from *trackList* that
        entirely contain a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
        NOTE: this argument is currently not used by the implementation.
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _overlap(tl,tf,stranded,strict):
        if strict: olap = lambda x,y: x[0] <= y[0] and y[1] <= x[1]
        else: olap = lambda x,y: x[0] < y[1]
        if stranded:
            tl_strand_idx = tl.fields.index('strand')
            tf_strand_idx = tf.fields.index('strand')
            same_strand = lambda x,y:x[tl_strand_idx]==y[tf_strand_idx]
        else: same_strand = lambda x,y:True
        try:
            x = tl.next()
        except StopIteration:
            return  # nothing to filter
        for y in tf:
            try:
                if not same_strand(x,y): x = tl.next()
                # Skip items ending before the current filter feature starts.
                while x[1] <= y[0]: x = tl.next()
                while olap(x,y):
                    yield x
                    x = tl.next()
            except StopIteration: break

    # Each argument is merged on its own: trackList must not be replaced by
    # the concatenation of trackFeatures.
    if isinstance(trackList,(list,tuple)): trackList = concatenate(trackList)
    if isinstance(trackFeatures,(list,tuple)): trackFeatures = concatenate(trackFeatures)
    stranded = 'strand' in (set(trackList.fields) & set(trackFeatures.fields))
    if flatten is None: _tf = trackFeatures
    else: _tf = flatten(trackFeatures,stranded=stranded)
    _tl = common.reorder(trackList,['start','end'])
    # Reorder the flattened filter, so that *flatten* actually takes effect.
    _tf = common.reorder(_tf,['start','end'])
    return FeatureStream(_overlap(_tl,_tf,stranded,strict), _tl.fields)
Exemple #17
0
def fusion(stream, aggregate={}, stranded=False):
    """Merge overlapping features of *stream* into single features, combining the
    values of every other field *f* with *aggregate[f]*.
    *stream* has to be sorted w.r.t. 'chr' (if any), 'start' and 'end'.

    Example::

        [('chr1',10,15,'A',1),('chr1',13,18,'B',-1),('chr1',18,25,'C',-1)]

        yields

        ('chr1', 10, 18, 'A|B', 0)
        ('chr1', 18, 25, 'C', -1)

    :param stream: FeatureStream object.
    :param aggregate: (dict) {field name: merge function}; overrides the
        module-level defaults for the given fields.
    :param stranded: (bool) if True, only features of the same strand are fused. [False]
    :rtype: FeatureStream
    """
    mergers = dict(aggreg_functions)
    mergers.update(aggregate)

    def _fuse(s, stranded):
        try:
            current = list(s.next())
        except StopIteration:
            return
        chr_col = s.fields.index('chr') if 'chr' in s.fields else None
        strand_col = s.fields.index('strand') if stranded else None
        for feat in s:
            other_chr = chr_col is not None and (current[chr_col] != feat[chr_col])
            other_strand = strand_col is not None and (current[strand_col] != feat[strand_col])
            mergeable = feat[0] < current[1] and not (other_chr or other_strand)
            if not mergeable:
                # Close the feature being built and start a new one.
                yield tuple(current)
                current = list(feat)
                continue
            current[1] = max(current[1], feat[1])
            merged_values = []
            for pos, name in enumerate(s.fields[2:], 2):
                merge = mergers.get(name, generic_merge)
                merged_values.append(merge((current[pos], feat[pos])))
            current[2:] = merged_values
        yield tuple(current)

    stream = reorder(stream, ['start', 'end'])
    return FeatureStream(_fuse(stream, stranded), fields=stream.fields)
Exemple #18
0
def normalize(trackList, method='total', field='score'):
    """Return the stream(s) of *trackList* with their *field* values normalized by *method*.
    The streams are assumed to describe the same features: the n-th element of one
    stream corresponds to the n-th element of every other.

    [!] All streams are fully loaded in memory.

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param method: normalization method, forwarded to `common.normalize`:
        * ``'total'`` divides every score vector by its sum (total number of reads) x 10^7 .
        * ``'deseq'`` applies DESeq's normalization ("size factors") - considering every track
            as belonging to a different group.
        * ``'quantile'`` applies quantile normalization.
    :param field: (str) name of the field containing the scores (must be the same for all streams).
    :rtype: FeatureStream if a single stream was given, else list of FeatureStream objects.
    """
    if not isinstance(trackList, (list, tuple)):
        trackList = [trackList]
    contents = [list(t) for t in trackList]
    nlines = len(contents[0])
    assert all(len(rows) == nlines for rows in contents), \
        "All streams must have the same number of elements."
    score_cols = [t.fields.index(field) for t in trackList]
    # One matrix row per track, one column per feature.
    scores = zeros((len(trackList), nlines))
    for row, rows in enumerate(contents):
        col = score_cols[row]
        scores[row] = asarray([x[col] for x in rows])
    scores = common.normalize(asarray(scores), method)
    # Rebuild every track with its score replaced by the normalized value.
    res = []
    for row, rows in enumerate(contents):
        col = score_cols[row]
        rescored = [x[:col] + (scores[row][k], ) + x[col + 1:]
                    for k, x in enumerate(rows)]
        res.append(FeatureStream(rescored, fields=trackList[row].fields))
    if len(trackList) != 1:
        return res
    return res[0]
Exemple #19
0
def concat_fields(stream,
                  infields,
                  outfield='name',
                  separator='|',
                  as_tuple=False):
    """
    Merge several fields of a stream into a single one. Ex.:

    ('chr1', 12, 'aa', 'bb') -> ('chr1', 12, 'aa|bb')     # as_tuple=False
    ('chr1', 12, 'aa', 'bb') -> ('chr1', 12, ('aa','bb')) # as_tuple=True

    The merged fields are removed from the output. If *outfield* is one of the
    remaining fields its value is overwritten, otherwise it is appended last.

    :param stream: FeatureStream object.
    :param infields: (list of str) list of fields to concatenate.
    :param outfield: (str) name of the field receiving the concatenation. ['name']
    :param separator: (str) char to add between entries from concatenated fields. ['|']
    :param as_tuple: (bool) store the entries as a tuple instead of joining them
        into a single string. [False]
    :rtype: FeatureStream object.
    """
    kept = [f for f in stream.fields if f not in infields]
    kept_cols = [stream.fields.index(f) for f in kept]
    out_fields = list(kept)
    append_new = outfield not in kept
    if append_new:
        out_fields.append(outfield)
    out_pos = out_fields.index(outfield)
    merged_cols = [stream.fields.index(f) for f in infields]

    def _concat(rows):
        for x in rows:
            row = [x[i] for i in kept_cols]
            if as_tuple:
                merged = tuple(x[i] for i in merged_cols)
            else:
                merged = separator.join([str(x[i]) for i in merged_cols])
            if append_new:
                row.append(merged)
            else:
                row[out_pos] = merged
            yield tuple(row)

    return FeatureStream(_concat(stream), out_fields)
Exemple #20
0
def selection(trackList,selection):
    """
    For each stream in *trackList*, keep only the items accepted by *selection*.
    A selection is a dictionary whose keys are field names and whose values
    describe the accepted entries for that field. Example::

        sel = {'chr':['chrI','chrII'], 'start':(1,10000), 'end':(5000,15000), 'count':range(30), ...}
        selection(stream, selection=sel)

    Within one dictionary all conditions must hold (AND). Alternatives (OR) are
    expressed by giving a list of such dictionaries:

        sel = [{'chr':'chrI', 'start':(1,10000)}, {'chr':'chrI', 'end':(1000000,1500000)}]

    A tuple value is a numeric range (lower bound included, upper bound excluded),
    a list is a set of accepted values, anything else must match exactly.

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param selection: (dict, or list of dict) the filter described above.
    :rtype: FeatureStream if a single stream was given, else list of FeatureStream objects.
    """
    def _accepts(item,criteria):
        for col,wanted in criteria.iteritems():
            value = item[col]
            if isinstance(wanted,tuple):
                number = float(value)
                if number < wanted[0] or number >= wanted[1]:
                    return False
            elif isinstance(wanted,list):
                if value not in wanted:
                    return False
            elif value != wanted:
                return False
        return True

    def _filter(stream,alternatives):
        # Translate field names into column indices once per stream.
        indexed = []
        for sel in alternatives:
            indexed.append(dict((stream.fields.index(name),wanted)
                                for name,wanted in sel.iteritems()))
        for item in stream:
            if any(_accepts(item,criteria) for criteria in indexed):
                yield item

    if isinstance(trackList,FeatureStream): trackList = [trackList]
    if isinstance(selection,dict): selection = [selection]
    res = []
    for t in trackList:
        res.append(FeatureStream(_filter(t,selection), fields=t.fields))
    if len(res) == 1:
        return res[0]
    return res
Exemple #21
0
def parse_meme_xml( ex, meme_file, chrmeta ):
    """Parse a MEME xml file: write one matrix file per motif and convert the
    scanned sites into an sql track.

    Sequence names are expected to look like ``<name>|<chr>:<start>-<end>``; the
    position of each scanned site is interpreted relative to that start.

    :param ex: execution object, passed to `touch`.
    :param meme_file: path to the MEME xml output.
    :param chrmeta: chromosome information passed to the output track.
    :return: dict ``{'sql': track file name, 'matrices': {motif id: matrix file name}}``.
    """
    from xml.etree import ElementTree as ET
    touch(ex,meme_file)
    tree = ET.parse(meme_file)
    ncol = {}         # {motif id: number of columns (motif width)}
    allmatrices = {}  # {motif id: matrix file name}
    for motif in tree.find('motifs').findall('motif'):
        mid = motif.attrib['id']
        ncol[mid] = 0
        allmatrices[mid] = unique_filename_in()
        with open(allmatrices[mid],'w') as mat_out:
            for parray in motif.find('probabilities')[0].findall('alphabet_array'):
                ncol[mid] += 1
                m = {'letter_A':0,'letter_C':0,'letter_G':0,'letter_T':0}
                for col in parray:
                    m[col.attrib['letter_id']] = float(col.text)
                mat_out.write("1\t%f\t%f\t%f\t%f\n" %(m['letter_A'],m['letter_C'],m['letter_G'],m['letter_T']))
    def _xmltree(_t):
        seq_name = {}
        seq_chr = None
        seq_start = None
        # Element.iter() replaces the deprecated getiterator().
        for it in _t.iter():
            if it.tag == 'sequence':
                seq_name[it.attrib['id']] = it.attrib['name']
            if it.tag == 'scanned_sites':
                name = seq_name[it.attrib['sequence_id']]
                name,seq_chr,start,end = re.search(r'(.*)\|(.+):(\d+)-(\d+)',name).groups()
                # Kept apart from the per-site coordinates so that every site
                # of this sequence is offset from the same origin.
                seq_start = int(start)
            if it.tag == 'scanned_site':
                site_start = seq_start+int(it.attrib['position'])-1
                site_end = site_start+ncol[it.attrib['motif_id']]
                strnd = 1 if it.attrib['strand'] == 'plus' else -1
                score = it.attrib['pvalue']
                yield (seq_chr,str(site_start),str(site_end),it.attrib['motif_id'],score,strnd)
    outsql = unique_filename_in()+".sql"
    outtrack = track(outsql, chrmeta=chrmeta, info={'datatype':'qualitative'},
                     fields=['start','end','name','score','strand'])
    try:
        outtrack.write(FeatureStream(_xmltree(tree),fields=['chr']+outtrack.fields))
    finally:
        # Release the track even if parsing or writing fails.
        outtrack.close()
    return {'sql':outsql,'matrices':allmatrices}
Exemple #22
0
def duplicate(stream, infield, outfields):
    """
    Duplicate one of *stream*'s fields. If outfields has more than one element,
    the field is copied as many times.

    The copies are appended at the end of every item, in the order of *outfields*.

    :param stream: FeatureStream object.
    :param infield: (str) name of the field to be duplicated.
    :param outfields: (str, or list of str) the new field(s) to be created.
    :rtype: FeatureStream
    """
    def _duplicate(stream, infield, outfields):
        in_idx = stream.fields.index(infield)
        for x in stream:
            yield x + (x[in_idx], ) * len(outfields)

    assert infield in stream.fields, "Field %s not found." % infield
    assert isinstance(infield,
                      str), "Expected string, %s found." % type(infield)
    # '%s' (not a bare '%') so that the message itself can be formatted.
    assert isinstance(
        outfields,
        (str, list)), "Expected string or list, %s found." % type(outfields)
    if isinstance(outfields, str): outfields = [outfields]
    return FeatureStream(_duplicate(stream, infield, outfields),
                         fields=stream.fields + outfields)
Exemple #23
0
def select(stream, fields=None, selection={}):
    """
    Keeps only specified *fields* from a stream, and/or only elements matching *selection*.

    An element is kept only if it satisfies every entry of *selection*, and it is
    then returned exactly once.

    :param stream: FeatureStream.
    :param fields: (list of str) list of fields to keep in the output.
        All fields are kept if None or empty.
    :param selection: (dict {*field*:*val*}) keep only lines s.t. *field* has a value
        equal to *val*, or is an element of *val*. E.g. `select(f,None,{'chr':['chr1','chr2']})`.
        *val* can also be a function returning True or False when applied to an element of the field;
        if True, the element is kept.
    :rtype: FeatureStream
    :raises ValueError: if one of *fields* is not a field of the stream.
    """
    def _matches(value, val):
        if isinstance(val, (list, tuple)):
            return value in val
        if hasattr(val, '__call__'):
            return bool(val(value))
        return value == val

    def _select(stream, idxs):
        sel = dict([(stream.fields.index(f), val)
                    for f, val in selection.iteritems()])
        for x in stream:
            # With an empty selection all() is True and every element passes.
            if all(_matches(x[k], val) for k, val in sel.iteritems()):
                yield tuple([x[i] for i in idxs])

    # Validate before touching stream.fields.
    assert hasattr(
        stream, 'fields'
    ) and stream.fields, "Object %s has no attribute 'fields'." % stream
    if not fields: fields = stream.fields
    unknown = [f for f in fields if f not in stream.fields]
    if unknown:
        raise ValueError("Can only select amongst fields %s." %
                         (stream.fields, ))
    idxs = [stream.fields.index(f) for f in fields]
    return FeatureStream(_select(stream, idxs), fields=fields)
Exemple #24
0
def reorder(stream, fields, last=False):
    """Reorders *stream.fields* so that *fields* come first.

    Objects without a `fields` attribute (or with `fields` set to None) are
    returned untouched. The input stream itself is also returned when the
    requested fields are already where they are asked to be: at the beginning
    by default, at the end if *last* is True.

    :param stream: FeatureStream object.
    :param fields: list of field names.
    :param last: (bool) if True, reorders fields so that *fields* come last.
    :rtype: FeatureStream
    :raises ValueError: if one of *fields* is not in *stream.fields*.
    """
    if not (hasattr(stream, 'fields')) or stream.fields is None:
        return stream
    current = list(stream.fields)
    wanted = list(fields)
    if not all(f in current for f in wanted):
        raise ValueError("Need %s fields in stream." % (", ".join(fields)))
    # The "nothing to do" shortcut has to look at the end of the field list
    # when last=True; testing the prefix in that case would return the stream
    # unchanged although the fields are at the wrong end.
    if last:
        in_place = current[len(current) - len(wanted):] == wanted
    else:
        in_place = current[:len(wanted)] == wanted
    if in_place:
        return stream
    chosen = [current.index(f) for f in wanted]
    others = [n for n, f in enumerate(current) if f not in wanted]
    _inds = others + chosen if last else chosen + others
    _flds = [current[n] for n in _inds]
    return FeatureStream((tuple(x[n] for n in _inds) for x in stream),
                         fields=_flds)
Exemple #25
0
def merge_scores(trackList, method='arithmetic'):
    """
    Creates a stream with per-base average of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    The first item of every track is read as soon as this function is called;
    the remaining items are consumed lazily while the result is iterated.

    :param trackList: list of FeatureStream objects.
    :param method: (str) type of average: one of 'arithmetic','geometric', or 'sum' (no average).
        A name that is not found in `_score_functions` falls back to `_arithmetic_mean`.
        A callable is also accepted; it is called with the list of overlapping scores only.
    :rtype: FeatureStream
    """
    # Terminate every track with a sentinel item (one sys.maxint per field) and
    # re-wrap it so that the `fields` attribute is still available.
    tracks = [
        FeatureStream(common.sentinelize(x, [sys.maxint] * len(x.fields)),
                      x.fields) for x in trackList
    ]
    # Items are accessed by position below: 0 = start, 1 = end, 2 = score.
    tracks = [common.reorder(t, ['start', 'end', 'score']) for t in tracks]
    fields = [
        f for f in tracks[0].fields if all([f in t.fields for t in tracks])
    ]  # common fields
    # Current item of each track, as a list because its start is edited in place.
    elements = [list(x.next()) for x in tracks]
    # The denominator counts every input track, including those dropped below.
    track_denom = 1.0 / len(trackList)

    if hasattr(method, '__call__'):
        # Custom callables do not receive the denominator.
        mean_fn = lambda scores, denom: method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Drop tracks whose first item is already the sentinel. Iterating backwards
    # keeps the remaining indices valid while popping.
    for i in xrange(len(tracks) - 1, -1, -1):
        if elements[i][0] == sys.maxint:
            tracks.pop(i)
            elements.pop(i)

    def _stream(tracks):
        while tracks:
            # Next output interval: it begins at the smallest current start and
            # stops at the first following boundary (another start or any end).
            start = min([x[0] for x in elements])
            end = min([x[0] for x in elements if x[0] > start] +
                      [x[1] for x in elements])
            # Scores of the current items that overlap [start, end).
            scores = [x[2] for x in elements if x[1] > start and x[0] < end]
            if len(fields) > 3:
                rest = []
                for i in range(len(fields[3:])):
                    # Non-None values found at position 3+i of the overlapping
                    # items, as strings.
                    # NOTE(review): the position is taken in each track's own
                    # item, which assumes that the extra common fields sit at
                    # the same index in every track -- confirm.
                    r = [
                        str(x[3 + i]) for x in elements if
                        not (x[3 + i] is None) and x[1] > start and x[0] < end
                    ]
                    # One value if they all agree, otherwise a "|"-joined list.
                    # NOTE(review): all([]) is True, so `r[0]` raises IndexError
                    # when every overlapping item holds None in this column.
                    if all([x == r[0] for x in r]):
                        rest.append(r[0])
                    else:
                        rest.append("|".join(r))
                yield (start, end, mean_fn(scores, track_denom)) + tuple(rest)
            else:
                yield (start, end, mean_fn(scores, track_denom))
            # Advance past `end`, backwards so that pop(i) is safe.
            for i in xrange(len(tracks) - 1, -1, -1):
                # Clip the part of the item that has just been emitted.
                if elements[i][0] < end:
                    elements[i][0] = end
                # Item fully consumed: fetch the next one from its track.
                if elements[i][1] <= end:
                    elements[i] = list(tracks[i].next())
                # Sentinel reached: the track is exhausted.
                if elements[i][0] == sys.maxint:
                    tracks.pop(i)
                    elements.pop(i)

    return FeatureStream(_stream(tracks), fields)
Exemple #26
0
def window_smoothing(trackList,
                     window_size,
                     step_size=1,
                     stop_val=sys.maxint,
                     featurewise=False):
    """
    Given a (list of) signal track(s) *trackList*, a *window_size* L (in base pairs by default,
    or in number of features if *featurewise* is True),  and a *step_size*,
    return as many signal tracks with, at each position p (multiple of *step_size*),
    the average score in the window [p-L/2, p+L/2]::

        X: __________666666666666____________
        R: ______12345666666666654321________ (not exact scores here)

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param window_size: (int) window size in bp.
    :param step_size: (int) step length (one score returned per *step_size* positions). [1]
    :param stop_val: (int) sequence length. [sys.maxint]
    :param featurewise: (bool) bp (False), or number of features (True). [False]
    :rtype: FeatureStream, or a list of FeatureStream objects if *trackList* is a list or tuple.

    Example of windows, window_size=9, step_size=3:

    [0,1,2,3,4,5,6,7,8,9), [3,4,5,6,7,8,9,10,11,12), ...
    """
    def _stepping_mean(track, score, denom):
        # Feature-wise smoothing: the window holds `window_size` consecutive
        # features and advances by `step_size` features at a time.
        # NOTE: the `score` argument is ignored. The caller passes `win_start`
        # in that position and it is overwritten right away.
        score = 0.0
        F = []
        score = 0.0
        # NOTE(review): `nmid` is used as a list index below, so this relies on
        # `/` being an integer division (Python 2 with an int `window_size`).
        nmid = window_size / 2
        for x in track:
            F.append(x)
            score += x[2]
            # Wait until the window is full.
            if len(F) < window_size: continue
            # Report the mean on the coordinates of the middle feature and
            # carry over whatever columns that feature has after the score.
            yield (F[nmid][0], F[nmid][1], round(score * denom + 1e-7,
                                                 6)) + F[nmid][3:]
            # Slide the window: forget the `step_size` oldest features.
            for shift in xrange(step_size):
                score -= F.pop(0)[2]

    def _running_mean(track, win_start, denom):
        # Base-pair smoothing. `F` buffers the features being processed (F[0]
        # is the oldest, F[-1] the newest) and `score` is the running mean of
        # the window [win_start, win_end).
        score = 0.0
        F = []
        for x in track:
            F.append(x)
            fstart = F[0][0]
            fend = F[0][1]
            # NOTE(review): needs a 4th column in every item, and that value is
            # yielded as a 4th element although the resulting stream is declared
            # with the three names of `_f` only -- confirm this is intended.
            chrom = F[0][3]
            # The window never starts more than `window_size` before the oldest
            # buffered feature.
            win_start = max(win_start, fstart - window_size)
            win_end = win_start + window_size
            lstart = F[-1][0]
            lend = F[-1][1]
            # Slide until the window's right edge reaches the end of the newest
            # feature.
            while win_end < lend:
                # `delta` is the score change per base moved; `steps` collects
                # the distances to the boundaries at which it may change.
                delta = 0
                steps = [fend - win_start, lend - win_end]
                if fstart > win_start: steps.append(fstart - win_start)
                else: delta -= F[0][2]
                if lstart > win_end: steps.append(lstart - win_end)
                else: delta += F[-1][2]
                nsteps = min(steps)
                # Offset from `win_start` to the next multiple of `step_size`.
                sst = -(win_start % step_size) % step_size
                # NOTE: `%` binds tighter than `+`, so this reads
                # -(win_start + (nsteps % step_size)) % step_size; for a positive
                # `step_size` that is the same residue as
                # -(win_start + nsteps) % step_size.
                sen = -(win_start + nsteps % step_size) % step_size
                # NOTE(review): `/` is expected to be an integer division here
                # (Python 2 with int coordinates).
                win_center = (win_start + win_end) / 2
                if abs(delta) > 1e-11:
                    # The score varies linearly: emit one item per step.
                    delta *= denom
                    score += delta * sst
                    for step in xrange(sst, nsteps, step_size):
                        # Only positive scores lying within [0, stop_val] are
                        # reported.
                        if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                            yield (win_center + step,
                                   win_center + step + step_size, score, chrom)
                        score += delta * step_size
                    score -= delta * sen
                else:
                    # The score is constant over the move: emit one item.
                    if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                        yield (win_center + sst, win_center + sen + nsteps,
                               score, chrom)
                win_start += nsteps
                win_end += nsteps
                # The window has moved past the oldest feature: drop it and
                # re-anchor the window on the next one.
                if fend <= win_start:
                    F.pop(0)
                    if F:
                        fstart = F[0][0]
                        fend = F[0][1]
                        win_start = max(win_start, fstart - window_size)
                        win_end = win_start + window_size
                        # NOTE: this only leaves the inner `while`; the `for`
                        # loop goes on with the next item of `track`.
                        if win_end > stop_val: break
        # The input is exhausted: let the window slide out of the features that
        # are still buffered. Same steps as above, without the terms related to
        # the newest feature. `fstart`, `fend`, `win_start`, `win_end` and
        # `chrom` keep the values they had when the loop above ended.
        while F:
            delta = 0
            steps = [fend - win_start]
            if fstart > win_start: steps.append(fstart - win_start)
            else: delta -= F[0][2]
            nsteps = min(steps)
            sst = -(win_start % step_size) % step_size
            sen = -(win_start + nsteps % step_size) % step_size
            win_center = (win_start + win_end) / 2
            if abs(delta) > 1e-11:
                delta *= denom
                score += delta * sst
                for step in xrange(sst, nsteps, step_size):
                    if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                        yield (win_center + step,
                               win_center + step + step_size, score, chrom)
                    score += delta * step_size
                score -= delta * sen
            else:
                if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                    yield (win_center + sst, win_center + sen + nsteps, score,
                           chrom)
            win_start += nsteps
            win_end += nsteps
            if fend <= win_start:
                F.pop(0)
                if F:
                    fstart = F[0][0]
                    fend = F[0][1]
                    win_start = max(win_start, fstart - window_size)
                    win_end = win_start + window_size
                    # Here the `break` ends the draining loop itself.
                    if win_end > stop_val: break

    # Each score is weighted by 1/window_size; the first window starts one
    # window length before position 0.
    denom = 1.0 / window_size
    win_start = -window_size
    _f = ['start', 'end', 'score']
    if featurewise:
        call = _stepping_mean
    else:
        call = _running_mean
    # One smoothed stream per input stream, in the same container shape.
    if isinstance(trackList, (list, tuple)):
        return [
            FeatureStream(call(common.reorder(t, _f), win_start, denom),
                          fields=_f) for n, t in enumerate(trackList)
        ]
    else:
        return FeatureStream(call(common.reorder(trackList, _f), win_start,
                                  denom),
                             fields=_f)
Exemple #27
0
def score_by_feature(trackScores, trackFeatures, method='mean'):
    """
    Summarize, for each feature of *trackFeatures*, the scores it covers.

    Every base of a feature that is covered by a score item contributes that
    item's score once; *method* is then applied to the resulting list (by
    default the scores are averaged).
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional `score` field
    for each stream in *trackScores*::
        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______


        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    The new field is called `score` when there is a single score track and
    *trackFeatures* has no `score` field yet; otherwise the new fields are
    named `score0`, `score1`, ...

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        A list or tuple of streams is first merged with `concatenate`.
    :param method: (str or function): operation applied to the list of scores from one feature.
        Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        # One sentinel-terminated reader and one buffer of pending score items
        # per score track. Each buffer starts with a dummy item lying before
        # any real position.
        readers = [
            common.sentinelize(t, [sys.maxint] * len(t.fields)) for t in ts
        ]
        buffers = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
        start_idx = tf.fields.index('start')
        end_idx = tf.fields.index('end')
        if hasattr(method, '__call__'):

            def summarize(values, denom):
                return method(values)
        else:
            summarize = _score_functions.get(method, _arithmetic_mean)
        for feature in tf:
            ystart = feature[start_idx]
            yend = feature[end_idx]
            summaries = []
            for i, reader in enumerate(readers):
                pending = buffers[i]
                # Read score items until one starts at or after the feature's
                # end; keep those reaching beyond the feature's start.
                newest = pending[-1]
                while newest[0] < yend:
                    newest = reader.next()
                    if newest[1] > ystart:
                        pending.append(newest)
                # Forget the leading items that end before the feature starts.
                first_kept = 0
                while pending[first_kept][1] <= ystart:
                    first_kept += 1
                pending = pending[first_kept:]
                buffers[i] = pending
                # Repeat each score once per base shared with the feature.
                covered = []
                for item in pending:
                    if yend <= item[0]:
                        continue
                    lo = ystart if item[0] < ystart else item[0]
                    hi = yend if yend < item[1] else item[1]
                    covered.extend([item[2]] * (hi - lo))
                summaries.append(summarize(covered, 1.0 / (yend - ystart)))
            yield tuple(feature) + tuple(summaries)

    if not isinstance(trackScores, (list, tuple)):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    single_new_field = (len(trackScores) <= 1
                        and 'score' not in trackFeatures.fields)
    if single_new_field:
        _fields = ["score"]
    else:
        _fields = ["score" + str(i) for i in range(len(trackScores))]
    _ts = [common.reorder(t, ['start', 'end', 'score']) for t in trackScores]
    return FeatureStream(_stream(_ts, trackFeatures),
                         trackFeatures.fields + _fields)
Exemple #28
0
def filter_scores(trackScores,
                  trackFeatures,
                  method='sum',
                  strict=False,
                  annotate=False,
                  flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and features streams must be sorted! (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default (to avoid
    score duplications). An alternative is :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*, only scores inside
    a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        strictly contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        # Everything that depends on the field names is read *before* the
        # feature stream is wrapped: elsewhere in this module the result of
        # `common.sentinelize` is re-wrapped in a FeatureStream to get a
        # `fields` attribute back, so it cannot be relied on to have one.
        tf_fields = tf.fields
        ts_fields = ts.fields
        info_idx = [k for k, f in enumerate(tf_fields) if f not in ts_fields]
        if stranded:
            ts_strand_idx = ts_fields.index('strand')
            tf_strand_idx = tf_fields.index('strand')
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        features = common.sentinelize(tf, [sys.maxint] * len(tf_fields))
        Y = []  # features that may still intersect the current/next scores
        ynext = (-sys.maxint, -sys.maxint, 0.0)  # read ahead, not yet in Y
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = features.next()
            # Remove features that are behind x. Scores are sorted by start,
            # so such a feature cannot intersect any later score either.
            # Every stale feature is dropped, not only the leading ones, and
            # `ynext` is never injected here: it enters Y through the loading
            # loop above only, so no feature can be listed (and yielded) twice.
            Y = [y for y in Y if y[1] > xstart]
            # Yield intersections
            for y in Y:
                if y[0] >= xend: continue  # keep for next iteration
                if not same_strand(x, y): continue
                if strict and (y[0] > xstart or y[1] < xend): continue
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    _info_fields = [
        f for f in trackFeatures.fields if f not in trackScores.fields
    ] if annotate else []
    # `stranded` is read by `_stream` when the result is first iterated.
    stranded = 'strand' in (set(trackScores.fields)
                            & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ['start', 'end'])
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
Exemple #29
0
 def _add_label(s, x):
     """Return a stream like *s* with *x* appended to every item, under a new `track_name` field."""
     labelled_fields = s.fields + ['track_name']
     suffix = (x, )
     rows = (row + suffix for row in s)
     return FeatureStream(rows, fields=labelled_fields)
Exemple #30
0
def concatenate(trackList, fields=None, remove_duplicates=False, group_by=None, aggregate=None):
    """
    Returns one stream containing all features from a list of tracks, ordered by *fields*.

    If *trackList* holds a single track, that track is returned as is.

    :param trackList: list of FeatureStream objects.
    :param fields: (list of str) list of fields to keep in the output (at least ['start','end']).
        Only the fields common to all tracks are kept. [fields of the first track]
    :param remove_duplicates: (bool) whether to remove items that are identical in several
        of the tracks in *trackList*. [False]
    :param group_by: (list of str) if specified, elements having all values for these fields in
        common will be merged into a single element. Other fields are merged according to *aggregate*
        if specified, or `common.generic_merge` by default.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all different values for this field in order to merge them.
        E.g. ``{'score': lambda x: sum(x)}`` will return the sum of all scores in the output.
    :rtype: FeatureStream
    """
    merge_rules = {} if aggregate is None else aggregate

    def _find_min(feat_tuple):
        """Return the index of the 'smallest' element amongst a tuple of features from
        different tracks. Priority is given to the first field; if the first field items
        are equal amongst several elements, it looks at the second field, a.s.o."""
        # NOTE(review): values are ordered by their hash(), as before. This is
        # the numeric order for (non-negative) integers, but an arbitrary one
        # for strings such as chromosome or feature names.
        nmin = 0
        xmin = feat_tuple[0]
        for n in range(1, len(feat_tuple)):
            x = feat_tuple[n]
            if x[0] == sys.maxint: continue  # exhausted track
            for a, b in zip(x, xmin):
                ha, hb = hash(a), hash(b)
                if ha < hb:
                    xmin = x
                    nmin = n
                    break
                elif ha > hb:
                    break
        return nmin

    def _weave(_t, out_fields):
        """Generator yielding all features represented in a list of tracks *_t*,
        sorted w.r.t the fields *out_fields*, which come first in every track."""
        N = len(out_fields)
        current = [x.next()[:N] for x in _t]  # init
        idx = set(out_fields.index(f) for f in group_by) if group_by else set()
        last = None  # group being built, when group_by is used
        while 1:
            # Remove duplicates. Sentinels of exhausted tracks are all equal
            # but are not duplicates: an exhausted track must not be advanced.
            if remove_duplicates:
                found = True
                while found:
                    found = False
                    for k in range(len(current)):
                        if current[k][0] != sys.maxint and current.count(current[k]) > 1:
                            current[k] = _t[k].next()[:N]
                            found = True
            n = _find_min(current)
            # The smallest item is a sentinel: every track is exhausted. This
            # is tested before anything is emitted, so that empty tracks
            # produce an empty stream rather than a sentinel item.
            if current[n][0] == sys.maxint: break
            item = current[n]
            current[n] = _t[n].next()[:N]
            if not group_by:
                yield item
            elif last is None:
                last = item
            elif all(item[i] == last[i] for i in idx):
                # merge last and current; items only have N values, whatever
                # the number of fields of the track they come from.
                last = tuple(item[i] if i in idx
                             else merge_rules.get(out_fields[i], common.generic_merge)((last[i], item[i]))
                             for i in range(N))
            else:
                yield last
                last = item
        if last is not None: yield last

    if len(trackList) == 1: return trackList[0]
    if fields is None:
        fields = trackList[0].fields
    fields = [f for f in fields if all(f in t.fields for t in trackList)]
    # Sort keys first: [chr,] start, end [, name], then the other common fields.
    _of = ['start','end']
    if 'chr' in fields: _of = ['chr']+_of
    if 'name' in fields: _of += ['name']
    _of += [f for f in fields if not(f in _of)]
    tl = [common.reorder(t,_of) for t in trackList]
    tl = [FeatureStream(common.sentinelize(x,(sys.maxint,)*len(x.fields)),x.fields) for x in tl]
    return FeatureStream(_weave(tl,_of),fields=_of)