Python segment_features Examples, bbcflib.gfminer.stream.segment_features Python Examples

Example #1

0

Show file

File: test_gfminer.py Project: MolbioUnige/bbcflib

    def test_segment_features(self):
        """Check the bins yielded by segment_features, without and with strand."""
        flanks = dict(upstream=(2, 1), downstream=(3, 1))

        def binned(rows, fields, nbins):
            # Build the input stream and collect every segment produced.
            source = fstream(rows, fields=fields)
            return list(segment_features(source, nbins=nbins, **flanks))

        # Unstranded, named features on two chromosomes.
        res = binned(
            [('X', 10, 16, 'A'), ('X', 18, 30, 'B'), ('I', 10, 16, 'C')],
            ['chr', 'start', 'end', 'name'],
            3,
        )
        expected = [
            ('X', 8, 10, 'A', 0),
            ('X', 10, 12, 'A', 1),
            ('X', 12, 14, 'A', 2),
            ('X', 14, 16, 'A', 3),
            ('X', 16, 18, 'B', 0),
            ('X', 16, 19, 'A', 4),
            ('X', 18, 22, 'B', 1),
            ('X', 22, 26, 'B', 2),
            ('X', 26, 30, 'B', 3),
            ('X', 30, 33, 'B', 4),
            ('I', 8, 10, 'C', 0),
            ('I', 10, 12, 'C', 1),
            ('I', 12, 14, 'C', 2),
            ('I', 14, 16, 'C', 3),
            ('I', 16, 19, 'C', 4),
        ]
        self.assertListEqual(res, expected)

        # With negative strand: the expected bin indices of the -1 feature
        # decrease as coordinates increase.
        res = binned(
            [(10, 16, -1), (24, 36, 1)],
            ['start', 'end', 'strand'],
            2,
        )
        expected = [
            (7, 10, -1, 3),
            (10, 13, -1, 2),
            (13, 16, -1, 1),
            (16, 18, -1, 0),
            (22, 24, 1, 0),
            (24, 30, 1, 1),
            (30, 36, 1, 2),
            (36, 39, 1, 3),
        ]
        self.assertListEqual(res, expected)

Example #2

0

Show file

File: test_gfminer.py Project: JoseEspinosa/bbcflib

    def test_segment_features(self):
        """Compare segment_features output against hand-written bins."""
        up, down = (2, 1), (3, 1)

        # Case 1: features carrying a chromosome and a name, no strand.
        features = [('X', 10, 16, 'A'),
                    ('X', 18, 30, 'B'),
                    ('I', 10, 16, 'C')]
        stream = fstream(features, fields=['chr', 'start', 'end', 'name'])
        segments = segment_features(stream, nbins=3, upstream=up, downstream=down)
        res = list(segments)
        expected = []
        # Bins for feature A on X, up to its last inner bin.
        expected += [('X', 8, 10, 'A', 0), ('X', 10, 12, 'A', 1),
                     ('X', 12, 14, 'A', 2), ('X', 14, 16, 'A', 3)]
        # The upstream bin of B is expected before the downstream bin of A.
        expected += [('X', 16, 18, 'B', 0), ('X', 16, 19, 'A', 4)]
        expected += [('X', 18, 22, 'B', 1), ('X', 22, 26, 'B', 2),
                     ('X', 26, 30, 'B', 3), ('X', 30, 33, 'B', 4)]
        # Bins for feature C on I.
        expected += [('I', 8, 10, 'C', 0), ('I', 10, 12, 'C', 1),
                     ('I', 12, 14, 'C', 2), ('I', 14, 16, 'C', 3),
                     ('I', 16, 19, 'C', 4)]
        self.assertListEqual(res, expected)

        # Case 2: with negative strand
        features = [(10, 16, -1), (24, 36, 1)]
        stream = fstream(features, fields=['start', 'end', 'strand'])
        segments = segment_features(stream, nbins=2, upstream=up, downstream=down)
        res = list(segments)
        minus_bins = [(7, 10, -1, 3), (10, 13, -1, 2),
                      (13, 16, -1, 1), (16, 18, -1, 0)]
        plus_bins = [(22, 24, 1, 0), (24, 30, 1, 1),
                     (30, 36, 1, 2), (36, 39, 1, 3)]
        expected = minus_bins + plus_bins
        self.assertListEqual(res, expected)

Example #3

0

Show file

File: dnaseseq.py Project: MolbioUnige/bbcflib

def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """
    Plot the average signal profile over motif occurrences, one PDF per group.

    For every group id in *bedlist*, the motif regions are read per chromosome,
    grouped by (motif name, motif length), segmented into bins with
    `segment_features` (flanks given by the module-level `_plot_flank`) and
    scored against every signal of that group with `score_by_feature`.
    The per-bin scores are accumulated, divided by the number of regions of
    the motif, plotted with `lineplot` and also written to a tab-delimited
    text file.

    :param ex: not used by this function.
    :param bedlist: (dict) group id -> bed file of motif regions. The 4th field
        of each record is split on ':'; the first part is used as the motif
        name and the length of the second part as the number of bins
        (presumably the motif sequence - to be confirmed against the caller).
    :param signals: (dict) group id -> list of objects exposing `.name` and
        `.read(chrom)`.
    :param chrnames: iterable of chromosome names passed to the `.read` calls.
    :param groups: not used by this function.
    :param logfile: object with `write` and `flush` methods.
    :rtype: dict - group id -> {'pdf': file name, 'mat': [(motif name, data file), ...]}
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    nflank = _plot_flank[1]
    # `.items()` instead of `.iteritems()`: works on both Python 2 and 3.
    for gid, motifbed in bedlist.items():
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed, format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            # Group the (start, end) of the regions by (motif name, motif length).
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                key = (r2[0], len(r2[1]))
                fread.setdefault(key, []).append(r[1:3])
            for motif, regs in fread.items():
                if motif not in data:
                    # One row per bin (motif bins + both flanks), one column per signal.
                    data[motif] = zeros(shape=(motif[1] + 2 * nflank,
                                               len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(
                    segment_features(FeatureStream(regs,
                                                   fields=['start', 'end']),
                                     nbins=motif[1],
                                     upstream=_plot_flank,
                                     downstream=_plot_flank))
                for t in score_by_feature(
                    [s.read(chrom) for s in signals[gid]], tFeat):
                    # t = (start, end, bin, score_1, ..., score_n)
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)
        for motif, dat in data.items():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            # A real list is required: the labels of the motif bins are
            # overwritten in place below (a Python 3 `range` is immutable).
            X = list(range(-nflank, nflank + nbins))
            for k in range(nbins):
                X[k + nflank] = str(k + 1)
            # TODO: could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4, 2],
                     output=files[gid]['pdf'],
                     new=new,
                     last=(last == 0),
                     legend=snames,
                     main=mname)
            new = False
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                # Header line: empty corner cell followed by the bin labels.
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x)
                                                for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files

Example #4

0

Show file

File: dnaseseq.py Project: JoseEspinosa/bbcflib

def plot_footprint_profile( ex, bedlist, signals, chrnames, groups, logfile ):
    """
    Draw, for each group, the mean signal profile around its motif regions.

    Regions of each bed file in *bedlist* are read chromosome by chromosome and
    grouped by (motif name, motif length). Each group of regions is segmented
    with `segment_features` (one bin per motif position plus the flanks defined
    by the module-level `_plot_flank`), scored against all signals of the group
    with `score_by_feature`, and the per-bin scores are accumulated. The sums
    are divided by the number of regions, plotted with `lineplot` into one PDF
    per group, and dumped to a tab-delimited file per motif.

    :param ex: not used by this function.
    :param bedlist: (dict) group id -> bed file of motif regions; the 4th field
        is split on ':' into a motif name and a second part whose length gives
        the number of bins (presumably the motif sequence - to be confirmed).
    :param signals: (dict) group id -> list of objects with a `.name` attribute
        and a `.read(chrom)` method.
    :param chrnames: iterable of chromosome names.
    :param groups: not used by this function.
    :param logfile: object with `write` and `flush` methods.
    :rtype: dict - group id -> {'pdf': file name, 'mat': [(motif name, data file), ...]}
    """
    files = dict((gid,{'pdf':"",'mat':[]}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    flank = _plot_flank[1]
    # items() rather than iteritems(): available on both Python 2 and 3.
    for gid, motifbed in bedlist.items():
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed,format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            # (motif name, motif length) -> list of (start, end)
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                fread.setdefault((r2[0],len(r2[1])), []).append(r[1:3])
            for motif, regs in fread.items():
                if motif not in data:
                    # rows: motif bins + both flanks; columns: signals
                    data[motif] = zeros(shape=(motif[1]+2*flank, len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(segment_features(FeatureStream(regs,fields=['start','end']),
                                                       nbins=motif[1],upstream=_plot_flank,downstream=_plot_flank))
                for t in score_by_feature([s.read(chrom) for s in signals[gid]], tFeat):
                    # t = (start, end, bin, score_1, ..., score_n)
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)
        for motif, dat in data.items():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            # Materialize the range: its motif positions are relabelled in place,
            # which a Python 3 range object does not support.
            X = list(range(-flank,flank+nbins))
            for k in range(nbins):
                X[k+flank] = str(k+1)
            # TODO: could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])], mfrow=[4,2],
                     output=files[gid]['pdf'], new=new, last=(last==0),
                     legend=snames, main=mname)
            new = False
            _datf = unique_filename_in()
            with open(_datf,"w") as dff:
                # header: empty corner cell, then the bin labels
                dff.write("\t".join([""]+[str(x) for x in X])+"\n")
                for n,sn in enumerate(snames):
                    dff.write("\t".join([sn]+[str(x) for x in dat[:, n]])+"\n")
            files[gid]['mat'].append((mname,_datf))
    return files

Example #5

0

Show file

File: regions.py Project: JoseEspinosa/bbcflib

def summed_feature_matrix(trackScores,trackFeatures,method='mean',**kw):
    r"""
    Each feature in *trackFeatures* is segmented into bins using bbcflib.gfminer.stream.segment_features
    (with parameters passed from *\*\*kw*).
    This creates a matrix with a column for each track in *trackScores* and a row for each bin in the segmented features.
    The values of a matrix entry is the score from one track in *trackScores* in one bin summed over all features.


    Example::

                      gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features, nbins=3)
        Y: _____________666|66666________666|666|666_____
        Z: _____22222|22222|22222________________________

             Y   Z
        R: [[3.  1.],   # bin 0
            [4.  1.],   # bin 1
            [6.  1.]]   # bin 2

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of 'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: numpy.ndarray, int (number of features, i.e. the number of scored bins
        integer-divided by the number of bins per feature)
    """
    # Number of fields *before* segmentation: in each scored row the bin index
    # sits at this position and the scores follow it.
    nfields = len(trackFeatures.fields)
    trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    else:
        nscores = 1
    # Bins per feature = inner bins + upstream bins + downstream bins.
    # `__defaults__` is the spelling of `func_defaults` that exists on both
    # Python 2.6+ and Python 3.
    nbins = (kw.get('nbins',segment_features.__defaults__[0])
             + kw.get('upstream',(0,0))[1]
             + kw.get('downstream',(0,0))[1])
    averages = numpy.zeros(shape=(nbins,nscores))
    ntot = -1  # stays -1 (hence a count of 0) if no row is produced
    for ntot,x in enumerate(all_means):
        averages[x[nfields]] += x[(nfields+1):]
    # Floor division keeps the count an int on Python 3 as well.
    return averages, (ntot+1)//nbins

Example #6

0

Show file

File: regions.py Project: JoseEspinosa/bbcflib

def feature_matrix(trackScores,trackFeatures,segment=False,method='mean',**kw):
    r"""
    Return an array with as many lines as there are features in *trackFeatures*, and as many columns
    as there are score tracks in *trackScores*. Each element in the matrix thus corresponds to the
    (average) score of some genomic feature.

    If *segment* is True, each feature will be segmented into bins using
    bbcflib.gfminer.stream.intervals.segment_features (additional parameters in *\*\*kw* will be passed to this function).
    Then each element of the array is itself an array with *nbins* lines and one column for each track in *trackScores*.

    If *segment* is False, then each element of the array is an array with one element for each track in *trackScores*.

    Example::

                      gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features)
        Y: _____________666|66666________666|666|666_____  (scores1)
        Z: _____22222|22222|22222________________________  (scores2)

        With segment=True, nbins=3:

              Y   Z
        R: [[[0.  2.],    # bin0 \
             [2.  2.],    # bin1  } gene 1
             [6.  2.]],   # bin2 /
            [[6.  0.],    # bin0 \
             [6.  0.],    # bin1  } gene2
             [6.  0.]]]   # bin2 /

        With segment=False:

              Y   Z
        R:  [[3.  2.]
             [6.  0.]]

    Note: the whole segmented features track will be loaded in memory.
    Rows are keyed by the 'name' field, so features sharing a name share a row.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param segment: (bool) segment each feature into bins.[False]
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of 'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: tuple (numpy.ndarray of strings, numpy.ndarray of floats)
    """
    nbins = 1
    nscores = 1
    if segment:
        trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
        # Bins per feature = inner bins + upstream bins + downstream bins.
        # `__defaults__` replaces the Python 2-only `func_defaults`.
        nbins = (kw.get('nbins',segment_features.__defaults__[0])
                 + kw.get('upstream',(0,0))[1]
                 + kw.get('downstream',(0,0))[1])
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    # Taken after the optional segmentation: when segmented, the last
    # feature field (index nfields-1) is the bin index.
    nfields = len(trackFeatures.fields)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    scores_dict = {}
    name_idx = all_means.fields.index('name')
    for t in all_means:
        _n = t[name_idx]
        if segment:
            # Allocate the (nbins x nscores) block only the first time a name is seen.
            if _n not in scores_dict:
                scores_dict[_n] = numpy.zeros(shape=(nbins,nscores))
            scores_dict[_n][t[nfields-1]] = t[nfields:]
        else:
            scores_dict[_n] = t[nfields:]
    # list(...) is required on Python 3, where keys()/values() are views that
    # numpy.array would wrap into a 0-d object array.
    feat_names = numpy.array(list(scores_dict.keys()))
    scores_mat = numpy.array(list(scores_dict.values()))
    return (feat_names,scores_mat)