def test_segment_features(self):
    """Check the output of segment_features, without and with a strand field."""
    # Unstranded features on two chromosomes. In the expected output below,
    # the downstream flank of A (16-19) and the upstream flank of B (16-18)
    # overlap, which is why their rows are interleaved.
    features = [('X', 10, 16, 'A'), ('X', 18, 30, 'B'), ('I', 10, 16, 'C')]
    stream = fstream(features, fields=['chr', 'start', 'end', 'name'])
    segmented = segment_features(
        stream, nbins=3, upstream=(2, 1), downstream=(3, 1))
    res = list(segmented)
    # The last item of every expected tuple is the bin number (0 to 4 here).
    expected = [
        ('X', 8, 10, 'A', 0),
        ('X', 10, 12, 'A', 1),
        ('X', 12, 14, 'A', 2),
        ('X', 14, 16, 'A', 3),
        ('X', 16, 18, 'B', 0),
        ('X', 16, 19, 'A', 4),
        ('X', 18, 22, 'B', 1),
        ('X', 22, 26, 'B', 2),
        ('X', 26, 30, 'B', 3),
        ('X', 30, 33, 'B', 4),
        ('I', 8, 10, 'C', 0),
        ('I', 10, 12, 'C', 1),
        ('I', 12, 14, 'C', 2),
        ('I', 14, 16, 'C', 3),
        ('I', 16, 19, 'C', 4),
    ]
    self.assertListEqual(res, expected)

    # With negative strand: for the feature with strand -1 the expected bin
    # numbers decrease with the coordinate (bin 0 is at 16-18).
    stranded = [(10, 16, -1), (24, 36, 1)]
    stream = fstream(stranded, fields=['start', 'end', 'strand'])
    segmented = segment_features(
        stream, nbins=2, upstream=(2, 1), downstream=(3, 1))
    res = list(segmented)
    expected = [
        (7, 10, -1, 3),
        (10, 13, -1, 2),
        (13, 16, -1, 1),
        (16, 18, -1, 0),
        (22, 24, 1, 0),
        (24, 30, 1, 1),
        (30, 36, 1, 2),
        (36, 39, 1, 3),
    ]
    self.assertListEqual(res, expected)
def test_segment_features(self):
    """segment_features: expected bins for plain and for stranded features."""
    # Both scenarios use the same flank specification.
    flanks = dict(upstream=(2,1), downstream=(3,1))

    # --- Features without strand information, 3 bins per feature ---
    fields = ['chr','start','end','name']
    rows = [('X',10,16,'A'), ('X',18,30,'B'), ('I',10,16,'C')]
    stream = fstream(rows, fields=fields)
    res = list(segment_features(stream, nbins=3, **flanks))
    expected = [('X',8,10,'A',0),
                ('X',10,12,'A',1), ('X',12,14,'A',2), ('X',14,16,'A',3),
                ('X',16,18,'B',0),
                ('X',16,19,'A',4),
                ('X',18,22,'B',1), ('X',22,26,'B',2), ('X',26,30,'B',3),
                ('X',30,33,'B',4),
                ('I',8,10,'C',0),
                ('I',10,12,'C',1), ('I',12,14,'C',2), ('I',14,16,'C',3),
                ('I',16,19,'C',4)]
    self.assertListEqual(res, expected)

    # --- With negative strand, 2 bins per feature ---
    fields = ['start','end','strand']
    rows = [(10,16,-1), (24,36,1)]
    stream = fstream(rows, fields=fields)
    res = list(segment_features(stream, nbins=2, **flanks))
    expected = [(7,10,-1,3),
                (10,13,-1,2), (13,16,-1,1),
                (16,18,-1,0),
                (22,24,1,0),
                (24,30,1,1), (30,36,1,2),
                (36,39,1,3)]
    self.assertListEqual(res, expected)
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """
    Plot, for every group, the average signal profile around motif occurrences.

    For each group id in *bedlist*, the motif BED track is read chromosome by
    chromosome. Regions are grouped by (motif name, motif width), segmented
    into one bin per motif position plus the flanking bins described by the
    module-level `_plot_flank`, and scored against every signal track of the
    group. Per-bin scores are accumulated over all regions, divided by the
    number of regions, plotted with `lineplot` and dumped to a tab-delimited
    text file.

    :param ex: not used by this function (kept for interface compatibility).
    :param bedlist: (dict) `{group_id: motif BED file}`.
    :param signals: (dict) `{group_id: [signal tracks]}`; each track must
        provide `.name` and `.read(chrom)`.
    :param chrnames: iterable of chromosome names to process.
    :param groups: not used by this function (kept for interface compatibility).
    :param logfile: writable file-like object used for progress messages.
    :rtype: dict `{group_id: {'pdf': plot file name,
        'mat': [(motif name, data file name), ...]}}`
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    flank = _plot_flank[1]  # number of flanking bins on each side
    # `.items()` (instead of the Python-2-only `.iteritems()`) keeps this
    # loop working on both Python 2 and Python 3.
    for gid, motifbed in bedlist.items():
        group_signals = signals[gid]
        snames = [sig.name for sig in group_signals]
        tmotif = track(motifbed, format='bed')
        data = {}     # (motif name, width) -> accumulated scores, bins x signals
        numregs = {}  # (motif name, width) -> number of regions accumulated
        for chrom in chrnames:
            # Group the regions of this chromosome by motif. The 4th BED
            # column is split on ':'; the first part is used as the motif
            # name and the length of the second part as the motif width
            # (presumably the matched sequence -- confirm against the
            # producer of the BED file).
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                key = (r2[0], len(r2[1]))
                fread.setdefault(key, []).append(r[1:3])
            for motif, regs in fread.items():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * flank,
                                               len(group_signals)))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(
                    segment_features(
                        FeatureStream(regs, fields=['start', 'end']),
                        nbins=motif[1],
                        upstream=_plot_flank,
                        downstream=_plot_flank))
                chrom_scores = [s.read(chrom) for s in group_signals]
                for t in score_by_feature(chrom_scores, tFeat):
                    # t[2] is used as the bin (row) index, t[3:] holds one
                    # score per signal track.
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)  # countdown so the final plot can close the output
        for motif, dat in data.items():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])  # sum over regions -> average
            # X-axis labels: integer offsets in the flanks and '1'..'nbins'
            # (strings) for the positions inside the motif. A real list is
            # required here: a Python 3 `range` cannot be modified in place.
            X = list(range(-flank, flank + nbins))
            X[flank:flank + nbins] = [str(k + 1) for k in range(nbins)]
            ####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4, 2], output=files[gid]['pdf'],
                     new=new, last=(last == 0),
                     legend=snames, main=mname)
            new = False
            # Dump the plotted matrix: a header row with the X labels, then
            # one row per signal.
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x) for x in dat[:, n]])
                              + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
def plot_footprint_profile( ex, bedlist, signals, chrnames, groups, logfile ):
    """
    Draw the mean signal profile over motif occurrences, one plot file per group.

    Regions of each motif BED track are grouped by (motif name, motif width),
    cut into one bin per motif position plus the flanking bins given by the
    module-level `_plot_flank`, and scored against each signal track of the
    group. The scores are summed per bin over all regions, divided by the
    number of regions, plotted with `lineplot`, and also written to a
    tab-delimited text file.

    :param ex: unused here; kept so that existing callers keep working.
    :param bedlist: (dict) `{group_id: motif BED file}`.
    :param signals: (dict) `{group_id: [signal tracks]}`; the tracks must
        have a `.name` attribute and a `.read(chrom)` method.
    :param chrnames: iterable of chromosome names.
    :param groups: unused here; kept so that existing callers keep working.
    :param logfile: writable file-like object receiving a progress message.
    :rtype: dict `{group_id: {'pdf': plot file name,
        'mat': [(motif name, data file name), ...]}}`
    """
    files = dict((gid,{'pdf':"",'mat':[]}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    nflank = _plot_flank[1]  # flanking bins added on each side of a motif
    # Use `.items()`: `.iteritems()` does not exist on Python 3.
    for gid, motifbed in bedlist.items():
        gsignals = signals[gid]
        snames = [sig.name for sig in gsignals]
        tmotif = track(motifbed,format='bed')
        data = {}     # motif key -> score accumulator (bins x signals)
        numregs = {}  # motif key -> number of regions seen so far
        for chrom in chrnames:
            # Collect (start, end) pairs per motif key. The key is built from
            # the 4th BED column split on ':': (first part, length of the
            # second part). NOTE(review): the second part looks like the
            # matched sequence, whose length gives the motif width -- confirm.
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                fread.setdefault((r2[0],len(r2[1])), []).append(r[1:3])
            for motif, regs in fread.items():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1]+2*nflank, len(gsignals)))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(
                    segment_features(FeatureStream(regs,fields=['start','end']),
                                     nbins=motif[1],
                                     upstream=_plot_flank,
                                     downstream=_plot_flank))
                for t in score_by_feature([s.read(chrom) for s in gsignals], tFeat):
                    # Row t[2] (the bin) receives the per-signal scores t[3:].
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)  # reaches 0 on the last motif, closing the plot file
        for motif, dat in data.items():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])  # turn the sums into averages
            # Axis labels: integers in the flanks, '1'..'nbins' inside the
            # motif. `list(...)` is needed because a Python 3 `range` object
            # does not support item assignment.
            X = list(range(-nflank,nflank+nbins))
            X[nflank:nflank+nbins] = [str(k+1) for k in range(nbins)]
            ####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4,2], output=files[gid]['pdf'],
                     new=new, last=(last==0),
                     legend=snames, main=mname)
            new = False
            # Text dump of the same matrix: label header, one line per signal.
            _datf = unique_filename_in()
            with open(_datf,"w") as dff:
                dff.write("\t".join([""]+[str(x) for x in X])+"\n")
                for n,sn in enumerate(snames):
                    dff.write("\t".join([sn]+[str(x) for x in dat[:, n]])+"\n")
            files[gid]['mat'].append((mname,_datf))
    return files
def summed_feature_matrix(trackScores,trackFeatures,method='mean',**kw):
    r"""
    Each feature in *trackFeatures* is segmented into bins using
    bbcflib.gfminer.stream.segment_features (with parameters passed from *\*\*kw*).
    This creates a matrix with a column for each track in *trackScores* and a row
    for each bin in the segmented features. The value of a matrix entry is the
    score from one track in *trackScores* in one bin, summed over all features.

    Example::

                  gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features, nbins=3)
        Y: _____________666|66666________666|666|666_____
        Z: _____22222|22222|22222________________________

             Y   Z
        R: [[3.  1.],   # bin 0
            [4.  1.],   # bin 1
            [6.  1.]]   # bin 2

    NOTE(review): the values shown in *R* look like per-feature averages, i.e.
    the returned sums divided by the returned number of features (2 here);
    the function itself returns the sums and leaves that division to the caller.

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of
        'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: numpy.ndarray, int (number of features)
    """
    # Number of fields before segmentation: in the scored stream the bin index
    # is read at position `nfields` and the scores right after it.
    nfields = len(trackFeatures.fields)
    trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    else:
        nscores = 1
    # Total bins per feature = bins inside the feature + flanking bins.
    # `__defaults__` is the spelling of `func_defaults` that exists on both
    # Python 2.6+ and Python 3; its first entry is taken as the default `nbins`.
    nbins = (kw.get('nbins',segment_features.__defaults__[0])
             + kw.get('upstream',(0,0))[1]
             + kw.get('downstream',(0,0))[1])
    averages = numpy.zeros(shape=(nbins,nscores))
    ntot = -1  # stays -1 when the scored stream is empty -> 0 features
    for ntot,x in enumerate(all_means):
        averages[x[nfields]] += x[(nfields+1):]
    # Floor division keeps the feature count an int even under true division
    # (Python 3, or `from __future__ import division`).
    return averages, (ntot+1)//nbins
def feature_matrix(trackScores,trackFeatures,segment=False,method='mean',**kw):
    r"""
    Return an array with as many lines as there are features in *trackFeatures*, and as many columns
    as there are score tracks in *trackScores*. Each element in the matrix thus corresponds to the
    (average) score of some genomic feature.

    If *segment* is True, each feature will be segmented into bins using
    bbcflib.gfminer.stream.intervals.segment_features (additional parameters in *\*\*kw* will be
    passed to this function). Then each element of the array is itself an array with *nbins* lines
    and one column for each track in *trackScores*.

    If *segment* is False, then each element of the array is an array with one element for each
    track in *trackScores*.

    Example::

                  gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features)
        Y: _____________666|66666________666|666|666_____  (scores1)
        Z: _____22222|22222|22222________________________  (scores2)

        With segment=True, nbins=3:

              Y   Z
        R: [[[0.  2.],   # bin0  \
             [2.  2.],   # bin1   } gene 1
             [6.  2.]],  # bin2  /
            [[6.  0.],   # bin0  \
             [6.  0.],   # bin1   } gene2
             [6.  0.]]]  # bin2  /

        With segment=False:

             Y   Z
        R: [[3.  2.]
            [6.  0.]]

    Note: the whole segmented features track will be loaded in memory.

    Results are keyed by the value of the 'name' field, so the feature stream
    must have a 'name' field (the `fields.index('name')` lookup fails
    otherwise), and features sharing a name overwrite each other. Row *i* of
    the score array corresponds to entry *i* of the names array; the order of
    the rows is the iteration order of the internal dict.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param segment: (bool) segment each feature into bins.[False]
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of
        'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: tuple (numpy.ndarray of strings, numpy.ndarray of floats)
    """
    nbins = 1
    nscores = 1
    if segment:
        trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
        # Bins per feature = bins inside the feature + flanking bins.
        # `__defaults__` replaces the Python-2-only `func_defaults`; its first
        # entry is taken as the default `nbins` of segment_features.
        nbins = (kw.get('nbins',segment_features.__defaults__[0])
                 + kw.get('upstream',(0,0))[1]
                 + kw.get('downstream',(0,0))[1])
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    # Counted after the optional segmentation: when segmenting, the last of
    # these fields (index nfields-1) is read as the bin index.
    nfields = len(trackFeatures.fields)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    scores_dict = {}
    name_idx = all_means.fields.index('name')
    for t in all_means:
        _n = t[name_idx]
        if segment:
            # Allocate the per-feature matrix only the first time a name is
            # seen, then fill the row of the current bin.
            if _n not in scores_dict:
                scores_dict[_n] = numpy.zeros(shape=(nbins,nscores))
            scores_dict[_n][t[nfields-1]] = t[nfields:]
        else:
            scores_dict[_n] = t[nfields:]
    # `list(...)` is required on Python 3, where keys()/values() are views
    # that numpy would wrap as a single 0-d object instead of a sequence.
    feat_names = numpy.array(list(scores_dict.keys()))
    scores_mat = numpy.array(list(scores_dict.values()))
    return (feat_names,scores_mat)