def shuffled(stream, chrlen=sys.maxint, repeat_number=1, sorted=True):
    """Return a stream of randomly located features of the same length and
    annotation as those of the original stream.

    Every start position is drawn uniformly from ``[0, chrlen - length]``, where
    *length* is the length of the feature being relocated, so that no random
    feature extends past *chrlen*.

    :param stream: FeatureStream object.
    :param chrlen: (int) chromosome length. [9223372036854775807]
    :param repeat_number: (int) *repeat_number* random features are yielded
        per input feature. [1]
    :param sorted: (bool) whether or not to sort the output stream. [True]
    :raises ValueError: (when the features are generated) if a feature is
        longer than *chrlen*.
    :rtype: FeatureStream
    """
    import random
    _f = ['start', 'end']
    features = reorder(stream, _f)

    def _shuffled(_s):
        for feat in _s:
            feat_len = feat[1] - feat[0]
            # The upper bound depends on the length of *this* feature: positions
            # drawn for a shorter feature must not be reused for a longer one,
            # otherwise the relocated feature could run past the chromosome end.
            max_start = chrlen - feat_len
            for _ in xrange(repeat_number):
                start = random.randint(0, max_start)
                yield (start, start + feat_len) + feat[2:]

    randomized = FeatureStream(_shuffled(features), features.fields)
    if sorted:
        return sorted_stream(randomized, fields=_f)
    return randomized
def map_chromosomes(stream, chromosomes, keep=False):
    """
    Translate the chromosome identifiers in *stream* into chromosome names of the type 'chr5'.

    The stream is returned untouched if it has no 'chr' field.

    :param stream: FeatureStream object.
    :param chromosomes: a dictionary of chromosomes, such as `genrep.Assembly.chromosomes`.
        Keys are indexed as ``(id, accession, version)``; values are dictionaries
        with a 'name' entry and an optional comma-separated 'synonyms' entry.
    :param keep: (bool) keep all features (True) or only those which chromosome
        identifier is recognized (False). With True, unrecognized identifiers
        are left as they are. [False]
    :rtype: FeatureStream
    """
    if 'chr' not in stream.fields:
        return stream
    ic = stream.fields.index('chr')
    chrom_map = {}
    for k, c in chromosomes.iteritems():
        cname = c['name']
        chrom_map[cname] = cname  # {'chrIV': 'chrIV'}
        if cname.startswith('chr') and len(cname) > 3:
            chrom_map[cname[3:]] = cname  # {'IV': 'chrIV'}
        accession = str(k[1]) + "." + str(k[2])
        chrom_map[k[0]] = cname  # {2780: 'chrIV'}
        chrom_map[accession] = cname  # {'NC_001136.9': 'chrIV'}
        chrom_map[str(k[0]) + "_" + accession] = cname  # {'2780_NC_001136.9': 'chrIV'}
        if c.get('synonyms'):
            for s in c['synonyms'].split(','):
                chrom_map[s] = cname  # {synonym: 'chrIV'}

    if keep:
        # Fall back on the feature's own chromosome identifier (x[ic]), not on
        # its first field, when the identifier is unknown.
        translated = (x[:ic] + (chrom_map.get(x[ic], x[ic]),) + x[ic + 1:]
                      for x in stream)
    else:
        translated = (x[:ic] + (chrom_map[x[ic]],) + x[ic + 1:]
                      for x in stream if x[ic] in chrom_map)
    return FeatureStream(translated, stream.fields)
def score_threshold(stream, threshold=0.0, lower=False, strict=False, fields='score'):
    """
    Keep only the features whose score lies above (or below) a threshold.

    A feature is retained only if *every* field listed in *fields* passes the test.

    :param stream: FeatureStream, or list of FeatureStream objects.
    :param threshold: (float) threshold above/below which features are retained.
    :param lower: (bool) keep scores above (False) or below (True) the threshold.
    :param strict: (bool) exclude (True) or include (False) scores equal to the threshold.
    :param fields: (str or list of str) names of the fields to apply the filter to.
    :rtype: FeatureStream, or list of FeatureStream objects
    """
    if not isinstance(fields, (list, tuple)):
        fields = [fields]

    def _threshold(source, bound, below, names):
        # Comparing sign*value against sign*bound lets a single "greater than"
        # operator serve both directions.
        compare = operator.gt if strict else operator.ge
        sign = -1 if below else 1
        columns = [source.fields.index(name) for name in names]
        for feature in source:
            verdicts = [compare(sign * feature[col], sign * bound) for col in columns]
            if all(verdicts):
                yield feature

    def _wrap(source):
        return FeatureStream(_threshold(source, threshold, lower, fields),
                             fields=source.fields)

    if isinstance(stream, (list, tuple)):
        return [_wrap(s) for s in stream]
    return _wrap(stream)
def sorted_stream(stream, chrnames=[], fields=['chr', 'start', 'end'], reverse=False):
    """Sorts a stream according to *fields* values. Will load the entire stream in memory.

    The order of names in *chrnames* is used to sort the 'chr' field if available;
    chromosome values absent from *chrnames* are compared by their own value.
    Names in *fields* that the stream does not have are ignored. Features with
    equal keys keep their relative input order (reversed if *reverse* is True).

    :param stream: FeatureStream object.
    :param chrnames: list of chromosome names.
    :param fields: list of field names. [['chr','start','end']]
    :param reverse: reverse order. [False]
    :rtype: FeatureStream
    """
    # Only the fields actually present in the stream take part in the sort key.
    key_fields = [f for f in fields if f in stream.fields]
    key_idx = [stream.fields.index(f) for f in key_fields]
    # Position of 'chr' *within the key* (not within *fields*), if it is part of it.
    chr_pos = key_fields.index('chr') if 'chr' in key_fields else None
    # Rank lookup built once; the first occurrence wins, as with list.index().
    chr_rank = {}
    for rank, name in enumerate(chrnames):
        chr_rank.setdefault(name, rank)

    feature_list = list(stream)

    def _sort_key(position, feature):
        key = [feature[i] for i in key_idx]
        if chr_pos is not None:
            key[chr_pos] = chr_rank.get(key[chr_pos], key[chr_pos])
        # The input position breaks ties and lets us find the feature back.
        key.append(position)
        return tuple(key)

    sort_list = [_sort_key(n, f) for n, f in enumerate(feature_list)]
    sort_list.sort(reverse=reverse)
    return FeatureStream((feature_list[t[-1]] for t in sort_list), stream.fields)
def combine(trackList, fn, win_size=1000, aggregate={}):
    """
    Applies a custom function to a list of tracks, such as union, intersection,
    etc., and return a single result track. The input streams need to be ordered
    w.r.t 'chr', 'start' and 'end'. To be applied chromosome by chromosome.

    Only fields of the first track are kept. Values for a common field are
    merged by default according to `common.strand_merge`, `common.no_merge` and
    `common.generic_merge`, respectively for strand, chromosome and all others.

    :param trackList: list of FeatureStream objects.
    :param fn: boolean function to apply, such as bbcflib.gfminer.stream.union.
        A string is evaluated as a Python expression, so that one can type
        ``combine(..., fn='intersection')``.
    :param win_size: (int) window size, in bp.
    :param aggregate: (dict) for each field name given as a key, its value is the
        function to apply to the vector containing all trackList's values for this
        field in order to merge them. E.g. ``{'score': lambda x: sum(x)/len(x)}``
        will return the average of all *trackList*'s scores in the output.
        The dictionary passed in is not modified.
    :rtype: FeatureStream
    """
    # Work on a copy: setdefault() would otherwise write into the caller's
    # dictionary, or into the default argument shared by all calls.
    aggregate = dict(aggregate)
    aggregate.setdefault('strand', common.strand_merge)
    aggregate.setdefault('chr', common.no_merge)
    _f = ['start', 'end']
    if all('chr' in t.fields for t in trackList):
        _f += ['chr']
    if isinstance(fn, str):
        # NOTE(review): eval() executes arbitrary code. Only pass strings that
        # come from trusted callers; never forward user input here.
        fn = eval(fn)  # can type "combine(...,fn='intersection')"
    trackList = [common.cobble(common.reorder(t, fields=_f)) for t in trackList]
    combined = _combine(trackList, fn, win_size, aggregate)
    return common.fusion(FeatureStream(combined, fields=trackList[0].fields))
def apply(stream, fields, functions):
    """
    Transform the values of selected fields, leaving all other fields untouched.

    :param stream: FeatureStream object.
    :param fields: (str or list of str) fields to transform in the output.
    :param functions: a function, or a list of functions, to apply to the
        respective *fields* (one function per field).
    :rtype: FeatureStream
    """
    if isinstance(fields, str):
        fields = [fields]
    if hasattr(functions, '__call__'):
        functions = [functions]
    assert len(fields) == len(
        functions
    ), "The number of fields does not equal the number of functions."

    def _identity(value):
        return value

    def _transformed(source):
        width = len(source.fields)
        columns = [source.fields.index(name) for name in fields]
        # One converter per column; identity unless a function was supplied.
        converters = [_identity] * width
        for col, func in zip(columns, functions):
            converters[col] = func
        for feature in source:
            yield tuple([converters[col](feature[col]) for col in range(width)])

    return FeatureStream(_transformed(stream), fields=stream.fields)
def __call__(self, **kw):
    """Prepend a track covering every whole chromosome, then combine all tracks.

    A temporary track with one feature per chromosome (from 0 to its length) is
    written in the format of the first input track and inserted in front of
    ``kw['TrackMulti']['tracks']`` (which is modified in place). The resulting
    list is then handed to `_combine` together with `self._func`.

    :param kw: keyword arguments; ``kw['TrackMulti']['tracks']`` is the list of
        input track paths. All of *kw* is forwarded to `_get_chrmeta` and `_combine`.
    :return: the value of `self.display_time()`.
    """
    # Create a track with the whole chromosome
    chrmeta = _get_chrmeta(**kw)
    # Only the fields and format of the first track are needed: close it at once.
    with track(kw['TrackMulti']['tracks'][0]) as sig0:
        sig_fields = sig0.fields
        fmt = sig0.format
    is_chr = 'chr' in sig_fields
    _f0 = ('chr', 'start', 'end') if is_chr else ('start', 'end')
    _f1 = [f for f in sig_fields if f not in _f0]
    # Coordinates first, in the very order used to build the tuples below, so
    # that field names and values line up whatever the input field order is.
    fields = list(_f0) + _f1
    padding = ('0',) * len(_f1)
    if is_chr:
        whole_chr = [(name, 0, chrmeta[name]['length']) + padding
                     for name in chrmeta]
    else:
        whole_chr = [(0, chrmeta[name]['length']) + padding
                     for name in chrmeta]
    whole_chr = FeatureStream(whole_chr, fields=fields)
    temp = self.temporary_path() + '.' + fmt
    with track(temp, fields=fields) as wc:
        wc.write(whole_chr)
    kw['TrackMulti']['tracks'] = [temp] + kw['TrackMulti']['tracks']
    output = self.temporary_path(fname='combined.')
    output = _combine(self._func, output, **kw)
    self.new_file(output, 'combined')
    return self.display_time()
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """Plot the average signal profile around motif occurrences, group by group.

    For every group, motif regions are read from a bed file, grouped by
    (motif name, motif length), segmented into bins with flanks, and scored
    against each signal of the group. The per-bin averages are plotted (one
    plot per motif, all in the same pdf) and written to a tab-delimited file.

    :param ex: unused in this function.
    :param bedlist: dict ``{group id: path to the bed file of motif regions}``.
        The 4th bed column is split on ':' into a motif name and a string
        whose length is taken as the motif length.
    :param signals: dict ``{group id: list of signal tracks}``.
    :param chrnames: list of chromosome names to read.
    :param groups: unused in this function.
    :param logfile: file-like object receiving progress messages.
    :return: dict ``{group id: {'pdf': file name, 'mat': [(motif name, file name), ...]}}``.
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist)
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        group_signals = signals[gid]
        snames = [sig.name for sig in group_signals]
        tmotif = track(motifbed, format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            # Regions of this chromosome, grouped by (motif name, motif length).
            regions_by_motif = {}
            for row in tmotif.read(chrom):
                label = row[3].split(":")
                motif_key = (label[0], len(label[1]))
                regions_by_motif.setdefault(motif_key, []).append(row[1:3])
            for motif, regs in regions_by_motif.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * _plot_flank[1],
                                               len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                segments = segment_features(
                    FeatureStream(regs, fields=['start', 'end']),
                    nbins=motif[1],
                    upstream=_plot_flank,
                    downstream=_plot_flank)
                tFeat = sorted_stream(segments)
                chrom_signals = [s.read(chrom) for s in signals[gid]]
                for scored in score_by_feature(chrom_signals, tFeat):
                    # scored[2] selects the row of the matrix; scored[3:] holds
                    # one score per signal.
                    data[motif][scored[2]] += scored[3:]
        files[gid]['pdf'] = unique_filename_in()
        first_plot = True
        remaining = len(data)
        for motif, dat in data.iteritems():
            remaining -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            flank = _plot_flank[1]
            # Axis labels: numeric offsets in the flanks, 1..nbins as strings
            # inside the motif.
            X = range(-flank, flank + nbins)
            for offset in range(nbins):
                X[flank + offset] = str(offset + 1)
            # TODO: could do a heatmap (sort by intensity)...
            curves = [dat[:, col] for col in range(dat.shape[-1])]
            lineplot(X, curves,
                     mfrow=[4, 2],
                     output=files[gid]['pdf'],
                     new=first_plot,
                     last=(remaining == 0),
                     legend=snames,
                     main=mname)
            first_plot = False
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for col, sname in enumerate(snames):
                    values = [str(v) for v in dat[:, col]]
                    dff.write("\t".join([sname] + values) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
def sentinelize(stream, sentinel=sys.maxint):
    """Return *stream* followed by one extra, final item: *sentinel*.

    Consumers can then detect the end of the stream by comparing against the
    sentinel instead of catching StopIteration.

    :param stream: FeatureStream object.
    :param sentinel: the item yielded after the last element. [sys.maxint]
    :rtype: FeatureStream
    """
    def _terminated(items, marker):
        for element in items:
            yield element
        yield marker

    padded = _terminated(stream, sentinel)
    return FeatureStream(padded, fields=stream.fields)
def _filter_deconv(stream, pval):
    """Keep the features whose trailing ``;FERR=<number>`` value is at most *pval*.

    The value is read from the end of the 4th field (index 3); plain decimals
    and scientific notation (e.g. ``1.5e-05``) are both accepted. Features
    without such a trailing annotation are dropped. Each retained feature is
    replaced by a 300-wide interval centred on the midpoint of fields 1 and 2.

    NOTE(review): assumes the fields are ordered (chr, start, end, name, ...) -
    confirm against the callers.

    :param stream: FeatureStream object.
    :param pval: (float) maximum accepted FERR value.
    :rtype: FeatureStream
    """
    ferr = re.compile(r';FERR=([\d\.]+(?:[eE][+-]?\d+)?)$')

    def _selected(source):
        for x in source:
            found = ferr.search(x[3])
            # No trailing FERR annotation (or an unreadable one): drop the
            # feature rather than crash on a None match.
            if found is None:
                continue
            if float(found.group(1)) <= pval:
                middle = (x[2] + x[1]) / 2
                yield (x[0],) + (middle - 150, middle + 150) + x[3:]

    return FeatureStream(_selected(stream), fields=stream.fields)
def unroll(stream, regions, fields=['score']):
    """Creates a stream of *end*-*start* items with appropriate *fields* values
    at every base position. For example,
    ``unroll([(10,12,0.5,'a'), (14,15,1.2,'b')], regions=(9,16))`` returns::

        FeatureStream([(0,),(0.5,'a'),(0.5,'a'),(0,),(0,),(1.2,'b'),(0,)])
                        9      10        11      12   13     14      15

    Positions not covered by any feature yield ``(0, None, ...)``: 0 for the
    first of *fields* and None for each of the others.

    :param stream: FeatureStream object.
    :param regions: either a pair (start,end), a triplet (chr,start,end), an
        ordered list of such tuples, or a FeatureStream interpreted as bounds
        of the region(s) to return.
    :param fields: (str, list or tuple of str) field names
        **in addition to 'start','end'**. [['score']]
    :raises ValueError: if *regions* is neither a tuple/list nor a FeatureStream.
    :rtype: FeatureStream
    """
    if not isinstance(fields, (list, tuple)):
        fields = [fields]
    # A tuple of names is accepted too: make it a list so that it can be
    # concatenated with the coordinate field names below.
    fields = list(fields)
    with_chrom = False
    if isinstance(regions, (list, tuple)):
        # Guard against an empty list: nothing to unroll, nothing to inspect.
        if regions and not isinstance(regions[0], (list, tuple)):
            regions = [regions]
        if regions and len(regions[0]) > 2:
            with_chrom = True
        regions = iter(regions)
    elif isinstance(regions, FeatureStream):
        _f = ['start', 'end']
        if 'chr' in regions.fields:
            _f = ['chr'] + _f
            with_chrom = True
        regions = reorder(regions, _f)
    else:
        raise ValueError("regions: Expected tuple or FeatureStream, got %s." %
                         type(regions))
    if with_chrom:
        s = reorder(stream, ['start', 'end', 'chr'] + fields)
        nf = 3
    else:
        s = reorder(stream, ['start', 'end'] + fields)
        nf = 2
    item0 = (0,) + (None,) * (len(fields) - 1)

    def _unr(s):
        for reg in regions:
            if with_chrom:
                chrom, pos, end = reg[:3]
            else:
                chrom = None
                pos, end = reg[:2]
            for x in s:
                # Skip features on another chromosome or entirely before *pos*.
                if chrom and not (x[2] == chrom):
                    continue
                if x[1] <= pos:
                    continue
                # Fill the gap up to the feature, then the feature itself.
                while pos < min(x[0], end):
                    yield item0
                    pos += 1
                while pos < min(x[1], end):
                    yield x[nf:]
                    pos += 1
                if pos >= end:
                    break
            # Pad what remains of the region once the stream is exhausted.
            while pos < end:
                yield item0
                pos += 1

    return FeatureStream(_unr(s), fields=s.fields[nf:])
def _shift(stream, shift):
    """Return *stream* with *shift* added to the 'start' and 'end' of every feature.

    :param stream: FeatureStream object with 'start' and 'end' fields.
    :param shift: value added to both coordinates.
    :rtype: FeatureStream
    """
    # Whichever of 'start'/'end' comes first in the tuple is handled first.
    lo, hi = sorted((stream.fields.index('start'), stream.fields.index('end')))

    def _moved(feat):
        head, middle, tail = feat[:lo], feat[lo + 1:hi], feat[hi + 1:]
        return head + (feat[lo] + shift,) + middle + (feat[hi] + shift,) + tail

    return FeatureStream((_moved(feat) for feat in stream), fields=stream.fields)
def add_name_field(stream):
    """
    Append a 'name' field of the form ``chr:start-end`` to each record.

    :param stream: FeatureStream object with 'chr', 'start' and 'end' fields.
    :rtype: FeatureStream
    """
    chr_col = stream.fields.index('chr')
    start_col = stream.fields.index('start')
    end_col = stream.fields.index('end')
    named_fields = stream.fields + ['name']

    def _label(record):
        return "%s:%i-%i" % (record[chr_col], record[start_col], record[end_col])

    return FeatureStream((record + (_label(record),) for record in stream),
                         fields=named_fields)
def __call__(self, **kw):
    """Write a track annotating genes, exons or transcripts of an assembly.

    If ``kw['ids_list']`` names a file, one feature is written per line of that
    file (identifiers absent from the assembly mapping get a placeholder 'NA'
    row); otherwise every entry of the mapping is written.

    :param kw: keyword arguments: 'assembly', 'format' (output file extension),
        'feature_type' (one of 'genes', 'exons', 'transcripts') and, optionally,
        'ids_list' (path to a file with one identifier per line).
    :raises ValueError: if 'feature_type' is not one of the supported values.
    :return: the value of `self.display_time()`.
    """
    assembly = genrep.Assembly(kw.get('assembly'))
    fmt = kw['format']
    feature_type = kw['feature_type']
    if feature_type == 'genes':
        mapping = assembly.get_gene_mapping()
        get_info = self.genes_annot
    elif feature_type == 'exons':
        mapping = assembly.get_exon_mapping()
        get_info = self.exons_annot
    elif feature_type == 'transcripts':
        mapping = assembly.get_transcript_mapping()
        get_info = self.trans_annot
    else:
        # Fail early and clearly instead of falling through to the builtin
        # `map` and an undefined annotation function.
        raise ValueError("Unknown feature type: '%s'. Expected 'genes', "
                         "'exons' or 'transcripts'." % (feature_type,))

    def _annotate(path):
        with open(path) as ids_file:
            for line in ids_file:
                feat_id = line.strip()
                info = mapping.get(feat_id)
                if info:
                    yield get_info(feat_id, info)
                else:
                    yield ('NA', '0', '0', feat_id, 0.0, '0')

    ids_list = kw.get('ids_list')
    fields = ['chr', 'start', 'end', 'name', 'score', 'strand']
    if ids_list:
        assert os.path.exists(
            str(ids_list)), "File not found: '%s'" % ids_list
        fulltrack = FeatureStream(_annotate(ids_list), fields=fields)
        fname = os.path.splitext(os.path.basename(ids_list))[0]
    else:
        fulltrack = FeatureStream((get_info(g, mapping[g]) for g in mapping),
                                  fields=fields)
        fname = feature_type
    output = self.temporary_path(fname=fname + '.' + fmt)
    # Close the track once written so that the output file is complete before
    # it gets registered.
    with track(output, chrmeta=assembly) as out:
        out.write(fulltrack)
    self.new_file(output, 'fulltrack')
    return self.display_time()
def _join_macs(stream, xlsl, _f):
    """Extend each feature with the columns of the table row(s) it refers to.

    The 4th field of a feature is a '|'-separated list of peak references. Each
    reference yields one output feature: the input feature followed by the
    referenced row of *xlsl* minus its first column.

    :param stream: FeatureStream object.
    :param xlsl: list of tables (lists of rows). With a single table the
        reference is a 1-based row number; with several, it is
        ``<1-based row>:<table index>``.
    :param _f: list of field names of the output stream.
    :rtype: FeatureStream
    """
    def _reference(tag):
        # Drop the fixed-width prefix in front of the peak number: 13 characters
        # of the first ';'-separated item for "ID=..." tags, 10 otherwise
        # (presumably 'ID=MACS_peak_' / 'MACS_peak_' - TODO confirm).
        if tag[:3] == "ID=":
            return tag.split(";")[0][13:]
        return tag[10:]

    def _macs_row(_s):
        for feature in _s:
            for tag in feature[3].split("|"):
                if len(xlsl) == 1:
                    row_number = int(_reference(tag))
                    row = xlsl[0][row_number - 1]
                else:
                    parts = _reference(tag).split(":")
                    row = xlsl[int(parts[1])][int(parts[0]) - 1]
                yield feature + row[1:]

    return FeatureStream(_macs_row(stream), fields=_f)
def overlap(trackList, trackFeatures, strict=False, annotate=False, flatten=common.cobble):
    """
    For each stream in *trackList*, keep only items overlapping at least one
    element of *trackFeatures*. The input streams need to be ordered w.r.t 'chr',
    'start' and 'end'. To be applied chromosome by chromosome. If several tracks
    are given in either trackList or trackFeatures, they will be concatenated
    into one.

    :param trackList: FeatureStream - the elements to be filtered. If a list of
        streams is provided, they will be merged (using `concatenate`).
    :param trackFeatures: FeatureStream - the filter. If a list of streams is
        provided, they will be merged (using `concatenate`).
    :param strict: (bool) if True, only score regions from *trackList* that
        entirely contain a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the
        corresponding fields) from *trackFeatures* will be added to the result.
        Currently ignored by this implementation. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _overlap(tl, tf, stranded, strict):
        if strict:
            olap = lambda x, y: x[0] <= y[0] and y[1] <= x[1]
        else:
            olap = lambda x, y: x[0] < y[1]
        if stranded:
            tl_strand_idx = tl.fields.index('strand')
            tf_strand_idx = tf.fields.index('strand')
            same_strand = lambda x, y: x[tl_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        # Running out of items to filter ends the output, wherever it happens
        # (including on an empty input stream).
        try:
            x = tl.next()
            for y in tf:
                if not same_strand(x, y):
                    x = tl.next()
                # Skip items lying entirely before the current feature.
                while x[1] <= y[0]:
                    x = tl.next()
                while olap(x, y):
                    yield x
                    x = tl.next()
        except StopIteration:
            return

    # Each argument is tested, and merged, on its own.
    if isinstance(trackList, (list, tuple)):
        trackList = concatenate(trackList)
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    stranded = 'strand' in (set(trackList.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _tl = common.reorder(trackList, ['start', 'end'])
    # Reorder the *flattened* features, so that `flatten` is actually applied.
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_overlap(_tl, _tf, stranded, strict), _tl.fields)
def fusion(stream, aggregate={}, stranded=False):
    """Merge overlapping features of *stream* into single features.

    The merged feature spans from the first start to the largest end; every
    other field *f* is combined with *aggregate[f]* (falling back on the module
    defaults, then on `generic_merge`). Features that merely touch
    (end == next start) are not merged. *stream* has to be sorted w.r.t. 'chr'
    (if any), 'start' and 'end'.

    Example::

        [('chr1',10,15,'A',1),('chr1',13,18,'B',-1),('chr1',18,25,'C',-1)]

        yields

        ('chr1', 10, 18, 'A|B', 0)
        ('chr1', 18, 25, 'C', -1)

    :param stream: FeatureStream object.
    :param aggregate: (dict) ``{field name: merging function}``; each function
        receives the pair (current value, new value).
    :param stranded: (bool) if True, only features of the same strand are fused. [False]
    :rtype: FeatureStream
    """
    merge_fns = dict(aggreg_functions)
    merge_fns.update(aggregate)

    def _fuse(source, by_strand):
        try:
            current = list(source.next())
        except StopIteration:
            return
        chr_col = source.fields.index('chr') if 'chr' in source.fields else None
        strand_col = source.fields.index('strand') if by_strand else None
        for candidate in source:
            chr_changed = (chr_col is not None
                           and current[chr_col] != candidate[chr_col])
            strand_changed = by_strand and (current[strand_col] != candidate[strand_col])
            if chr_changed or strand_changed or not candidate[0] < current[1]:
                # No overlap: emit what was accumulated and start afresh.
                yield tuple(current)
                current = list(candidate)
                continue
            current[1] = max(current[1], candidate[1])
            current[2:] = [
                merge_fns.get(name, generic_merge)((current[pos], candidate[pos]))
                for pos, name in enumerate(source.fields[2:], 2)
            ]
        yield tuple(current)

    ordered = reorder(stream, ['start', 'end'])
    return FeatureStream(_fuse(ordered, stranded), fields=ordered.fields)
def normalize(trackList, method='total', field='score'):
    """Normalize the scores of every stream in *trackList* with the given *method*.

    The streams are assumed to describe the same features, i.e. the n-th element
    of one stream corresponds to the n-th element of another.
    [!] This function will temporarily store everything in memory.

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param method: normalization method, forwarded to `common.normalize`:

        * ``'total'`` divides every score vector by its sum (total number of reads) x 10^7 .
        * ``'deseq'`` applies DESeq's normalization ("size factors") - considering
          every track as belonging to a different group.
        * ``'quantile'`` applies quantile normalization.

    :param field: (str) name of the field containing the scores
        (must be the same for all streams).
    :rtype: FeatureStream if a single stream was given, else a list of FeatureStream objects.
    """
    if not isinstance(trackList, (list, tuple)):
        trackList = [trackList]
    contents = [list(t) for t in trackList]
    n_features = len(contents[0])
    assert all(
        len(c) == n_features
        for c in contents), "All streams must have the same number of elements."

    # One row of scores per stream.
    score_cols = [t.fields.index(field) for t in trackList]
    matrix = zeros((len(trackList), n_features))
    for row, (col, content) in enumerate(zip(score_cols, contents)):
        matrix[row] = asarray([feat[col] for feat in content])

    matrix = common.normalize(asarray(matrix), method)

    # Put the new scores back in place of the old ones.
    res = []
    for row, (source, col, content) in enumerate(zip(trackList, score_cols, contents)):
        rescored = [feat[:col] + (matrix[row][k],) + feat[col + 1:]
                    for k, feat in enumerate(content)]
        res.append(FeatureStream(rescored, fields=source.fields))
    return res[0] if len(trackList) == 1 else res
def concat_fields(stream, infields, outfield='name', separator='|', as_tuple=False):
    """
    Merge several fields of a stream into a single one. The merged fields are
    removed from the output; *outfield* is appended unless it is one of the
    remaining fields, in which case its value is overwritten. Ex.:

    ('chr1', 12, 'aa', 'bb') -> ('chr1', 12, 'aa|bb')       # as_tuple=False
    ('chr1', 12, 'aa', 'bb') -> ('chr1', 12, ('aa','bb'))   # as_tuple=True

    :param stream: FeatureStream object.
    :param infields: (list of str) list of fields to concatenate.
    :param outfield: (str) name of the new field created by concatenation of *infields*
        (can be an already existing one). ['name']
    :param separator: (str) char to add between entries from concatenated fields. ['|']
    :param as_tuple: (bool) join concatenated field entries in a tuple instead of a
        separator in a single string. [False]
    :rtype: FeatureStream object.
    """
    kept = [f for f in stream.fields if f not in infields]  # untouched fields
    kept_idx = [stream.fields.index(f) for f in kept]
    is_new = outfield not in kept
    out_fields = kept + [outfield] if is_new else kept
    # Placeholder slot for the output value when it is an extra column.
    padding = [None] if is_new else []
    out_pos = out_fields.index(outfield)
    merged_idx = [stream.fields.index(f) for f in infields]

    def _concat(source):
        for feat in source:
            row = [feat[i] for i in kept_idx] + padding
            if as_tuple:
                merged = tuple(feat[i] for i in merged_idx)
            else:
                merged = separator.join([str(feat[i]) for i in merged_idx])
            row[out_pos] = merged
            yield tuple(row)

    return FeatureStream(_concat(stream), out_fields)
def selection(trackList, selection):
    """
    For each stream in *trackList*, keep only items satisfying the *selection*'s filters.
    A selection is entered as a dictionary which keys are field names, and
    values are the scope of possible entries for each field. Example::

        sel = {'chr':['chrI','chrII'], 'start':(1,10000), 'end':(5000,15000), 'count':range(30), ...}
        selection(stream, selection=sel)

    All filters in a selection must be satisfied for an item to pass through it (AND operator).
    To give alternative conditions (OR operator), one must give several such selections in a list::

        sel = [{'chr':'chrI', 'start':(1,10000)}, {'chr':'chrI', 'end':(1000000,1500000)}]

    Values can be tuples (range of values, lower bound included and upper bound
    excluded), lists (of possible values), or a single element.

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param selection: (dict, or list of dict) the filter described above.
    :rtype: FeatureStream if a single stream results, else a list of FeatureStream objects.
    """
    def _matches(item, criteria):
        # *criteria* maps column indices to the accepted value(s).
        for col, accepted in criteria.iteritems():
            value = item[col]
            if isinstance(accepted, tuple):
                number = float(value)
                rejected = number < accepted[0] or number >= accepted[1]
            elif isinstance(accepted, list):
                rejected = value not in accepted
            else:
                rejected = value != accepted
            if rejected:
                return False
        return True

    def _filter(source, alternatives):
        by_column = [
            dict([(source.fields.index(name), accepted)
                  for name, accepted in alternative.iteritems()])
            for alternative in alternatives
        ]
        for item in source:
            if any(_matches(item, criteria) for criteria in by_column):
                yield item

    if isinstance(trackList, FeatureStream):
        trackList = [trackList]
    if isinstance(selection, dict):
        selection = [selection]
    res = [FeatureStream(_filter(t, selection), fields=t.fields) for t in trackList]
    if len(res) == 1:
        return res[0]
    return res
def parse_meme_xml(ex, meme_file, chrmeta):
    """ Parse meme xml file and convert to track

    One matrix file is written per motif (one line per position:
    ``1<TAB>A<TAB>C<TAB>G<TAB>T`` probabilities), and every scanned site is
    written to an sql track, located relative to the start of the sequence it
    was found in. Sequence names are expected to look like
    ``<name>|<chr>:<start>-<end>``.

    :param ex: passed to `touch` together with *meme_file*.
    :param meme_file: path to the MEME xml output.
    :param chrmeta: chromosome information given to the output track.
    :return: dict ``{'sql': track file name, 'matrices': {motif id: matrix file name}}``.
    """
    from xml.etree import ElementTree as ET
    touch(ex, meme_file)
    tree = ET.parse(meme_file)
    ncol = {}
    allmatrices = {}
    for motif in tree.find('motifs').findall('motif'):
        mid = motif.attrib['id']
        ncol[mid] = 0
        allmatrices[mid] = unique_filename_in()
        with open(allmatrices[mid], 'w') as mat_out:
            for parray in motif.find('probabilities')[0].findall('alphabet_array'):
                ncol[mid] += 1
                m = {'letter_A': 0, 'letter_C': 0, 'letter_G': 0, 'letter_T': 0}
                for col in parray:
                    m[col.attrib['letter_id']] = float(col.text)
                mat_out.write("1\t%f\t%f\t%f\t%f\n" % (m['letter_A'], m['letter_C'],
                                                      m['letter_G'], m['letter_T']))

    def _xmltree(_t):
        seq_name = {}
        seq_chr = None
        seq_start = None
        # getiterator() is deprecated in favour of iter(); use whichever exists.
        walk = _t.iter if hasattr(_t, 'iter') else _t.getiterator
        for it in walk():
            if it.tag == 'sequence':
                seq_name[it.attrib['id']] = it.attrib['name']
            elif it.tag == 'scanned_sites':
                name = seq_name[it.attrib['sequence_id']]
                _, seq_chr, seq_start, _ = re.search(r'(.*)\|(.+):(\d+)-(\d+)', name).groups()
                seq_start = int(seq_start)
            elif it.tag == 'scanned_site':
                # Every site is located relative to the start of its sequence:
                # the sequence start must not be overwritten by a site start,
                # or the following sites would accumulate the offsets.
                start = seq_start + int(it.attrib['position']) - 1
                end = start + ncol[it.attrib['motif_id']]
                strnd = 1 if it.attrib['strand'] == 'plus' else -1
                score = it.attrib['pvalue']
                yield (seq_chr, str(start), str(end), it.attrib['motif_id'], score, strnd)

    outsql = unique_filename_in() + ".sql"
    outtrack = track(outsql, chrmeta=chrmeta, info={'datatype': 'qualitative'},
                     fields=['start', 'end', 'name', 'score', 'strand'])
    try:
        outtrack.write(FeatureStream(_xmltree(tree), fields=['chr'] + outtrack.fields))
    finally:
        # Release the track even if parsing or writing fails.
        outtrack.close()
    return {'sql': outsql, 'matrices': allmatrices}
def duplicate(stream, infield, outfields):
    """
    Duplicate one of *stream*'s fields. If outfields has more than one element,
    the field is copied as many times. The copies are appended at the end of
    each feature.

    :param stream: FeatureStream object.
    :param infield: (str) name of the field to be duplicated.
    :param outfields: (str, or list of str) the new field(s) to be created.
    :raises AssertionError: if *infield* is not a field of *stream*, or if an
        argument has the wrong type.
    :rtype: FeatureStream
    """
    def _duplicate(stream, infield, outfields):
        in_idx = stream.fields.index(infield)
        ncopies = len(outfields)
        for x in stream:
            yield x + (x[in_idx],) * ncopies

    assert infield in stream.fields, "Field %s not found." % infield
    assert isinstance(infield, str), "Expected string, %s found." % type(infield)
    # '%s' (not a bare '%') so that the message itself can be formatted.
    assert isinstance(
        outfields,
        (str, list)), "Expected string or list, %s found." % type(outfields)
    if isinstance(outfields, str):
        outfields = [outfields]
    return FeatureStream(_duplicate(stream, infield, outfields),
                         fields=stream.fields + outfields)
def select(stream, fields=None, selection={}):
    """
    Keeps only specified *fields* from a stream, and/or only elements matching *selection*.

    :param stream: FeatureStream.
    :param fields: (list of str) list of fields to keep in the output.
        All fields are kept if None or empty.
    :param selection: (dict {*field*:*val*}) keep only lines s.t. *field* has a value
        equal to *val*, or is an element of *val*. E.g. `select(f,None,{'chr':['chr1','chr2']})`.
        *val* can also be a function returning True or False when applied to an element of the field;
        if True, the element is kept. All conditions must hold for a line to be kept.
    :raises AssertionError: if *stream* has no (or empty) 'fields'.
    :raises ValueError: if one of *fields* or of the *selection* keys is not a field of *stream*.
    :rtype: FeatureStream
    """
    def _accepts(value, criterion):
        if isinstance(criterion, (list, tuple)):
            return value in criterion
        if hasattr(criterion, '__call__'):
            return criterion(value)
        return value == criterion

    def _select(stream, idxs):
        sel = [(stream.fields.index(f), val) for f, val in selection.iteritems()]
        for x in stream:
            # A line is kept only if it satisfies every condition (a
            # `continue` in a loop over the conditions would merely move on to
            # the next condition and let every line through).
            if all(_accepts(x[k], val) for k, val in sel):
                yield tuple([x[i] for i in idxs])

    # Validate the stream before its fields are used.
    assert hasattr(
        stream, 'fields'
    ) and stream.fields, "Object %s has no attribute 'fields'." % (stream,)
    if not fields:
        fields = stream.fields
    unknown = [f for f in fields if f not in stream.fields]
    if unknown:
        raise ValueError("Can only select amongst fields %s." % stream.fields)
    idxs = [stream.fields.index(f) for f in fields]
    return FeatureStream(_select(stream, idxs), fields=fields)
def reorder(stream, fields, last=False):
    """Reorders *stream.fields* so that *fields* come first.

    The other fields follow (or precede, if *last*) in their original order.
    The stream itself is returned, untouched, if it has no fields or if its
    fields are already arranged as requested.

    :param stream: FeatureStream object.
    :param fields: list of field names.
    :param last: (bool) if True, reorders fields so that *fields* come last.
    :raises ValueError: if one of *fields* is not a field of *stream*.
    :rtype: FeatureStream
    """
    if not hasattr(stream, 'fields') or stream.fields is None:
        return stream
    current = list(stream.fields)
    fields = list(fields)
    if not all(f in current for f in fields):
        raise ValueError("Need %s fields in stream." % (", ".join(fields)))
    # Nothing to do if the fields already sit where they are wanted: at the
    # end of the stream's fields when *last*, at the beginning otherwise.
    nfields = len(fields)
    if last:
        in_place = current[len(current) - nfields:] == fields
    else:
        in_place = current[:nfields] == fields
    if in_place:
        return stream
    chosen = [current.index(f) for f in fields]
    others = [n for n, f in enumerate(current) if f not in fields]
    _inds = others + chosen if last else chosen + others
    _flds = [current[n] for n in _inds]
    return FeatureStream((tuple(x[n] for n in _inds) for x in stream),
                         fields=_flds)
def merge_scores(trackList, method='arithmetic'):
    """
    Creates a stream with per-base average of several score tracks::

        X1: __________666666666______
        X2: _____2222222222__________
        R:  _____11111444443333______

    The input tracks are cut at every position where one of them starts or
    ends; one feature is yielded per resulting interval, with the scores of the
    tracks covering it combined by *method*.

    :param trackList: list of FeatureStream objects.
    :param method: (str) type of average: one of 'arithmetic','geometric', or 'sum' (no average).
        Looked up in `_score_functions`; unknown names fall back on
        `_arithmetic_mean`. A function taking the list of scores is accepted too.
    :rtype: FeatureStream
    """
    # Terminate every track with an all-sys.maxint item, so that the end of a
    # track is detected by comparing its 'start' instead of catching StopIteration.
    tracks = [
        FeatureStream(common.sentinelize(x, [sys.maxint] * len(x.fields)),
                      x.fields) for x in trackList
    ]
    tracks = [common.reorder(t, ['start', 'end', 'score']) for t in tracks]
    fields = [
        f for f in tracks[0].fields if all([f in t.fields for t in tracks])
    ]  # common fields
    # Current (mutable) item of every track.
    elements = [list(x.next()) for x in tracks]
    # Passed to the scoring function as its second argument (1 / number of tracks).
    track_denom = 1.0 / len(trackList)
    if hasattr(method, '__call__'):
        mean_fn = lambda scores, denom: method(scores)
    else:
        mean_fn = _score_functions.get(method, _arithmetic_mean)
    # Discard tracks that are empty from the start (iterating backwards keeps
    # the remaining indices valid while popping).
    for i in xrange(len(tracks) - 1, -1, -1):
        if elements[i][0] == sys.maxint:
            tracks.pop(i)
            elements.pop(i)

    def _stream(tracks):
        while tracks:
            # Next interval: from the smallest current start to the next
            # start or end, whichever comes first.
            start = min([x[0] for x in elements])
            end = min([x[0] for x in elements if x[0] > start] +
                      [x[1] for x in elements])
            scores = [x[2] for x in elements if x[1] > start and x[0] < end]
            if len(fields) > 3:
                # Other common fields: distinct values are joined with '|'.
                # NOTE(review): values are read by position (3+i) in every
                # track, and r[0] is read even when r is empty - presumably
                # all tracks share the same field order and at least one
                # non-None value; confirm.
                rest = []
                for i in range(len(fields[3:])):
                    r = [
                        str(x[3 + i]) for x in elements
                        if not (x[3 + i] is None) and x[1] > start and x[0] < end
                    ]
                    if all([x == r[0] for x in r]):
                        rest.append(r[0])
                    else:
                        rest.append("|".join(r))
                yield (start, end, mean_fn(scores, track_denom)) + tuple(rest)
            else:
                yield (start, end, mean_fn(scores, track_denom))
            # Advance: trim the items the interval cut into, replace those it
            # exhausted, and drop the tracks that reached their sentinel.
            for i in xrange(len(tracks) - 1, -1, -1):
                if elements[i][0] < end:
                    elements[i][0] = end
                if elements[i][1] <= end:
                    elements[i] = list(tracks[i].next())
                if elements[i][0] == sys.maxint:
                    tracks.pop(i)
                    elements.pop(i)

    return FeatureStream(_stream(tracks), fields)
def window_smoothing(trackList,
                     window_size,
                     step_size=1,
                     stop_val=sys.maxint,
                     featurewise=False):
    """
    Given a (list of) signal track(s) *trackList*, a *window_size* L (in base pairs by default,
    or in number of features if *featurewise* is True), and a *step_size*,
    return as many signal tracks with, at each position p (multiple of *step_size*), the average
    score in the window [p-L/2, p+L/2]::

        X: __________666666666666____________
        R: ______12345666666666654321________ (not exact scores here)

    :param trackList: FeatureStream, or list of FeatureStream objects.
    :param window_size: (int) window size in bp.
    :param step_size: (int) step length (one score returned per *step_size* positions). [1]
    :param stop_val: (int) sequence length. [sys.maxint]
    :param featurewise: (bool) bp (False), or number of features (True). [False]
    :rtype: FeatureStream, or list of FeatureStream objects if a list was given.

    Example of windows, window_size=9, step_size=3:

    [0,1,2,3,4,5,6,7,8,9), [3,4,5,6,7,8,9,10,11,12), ...
    """
    def _stepping_mean(track, score, denom):
        # Feature-wise smoothing: average over *window_size* consecutive
        # features, reported at the coordinates of the middle one.
        # The *score* argument is ignored (reset right away); it only keeps
        # the call signature identical to that of _running_mean.
        score = 0.0
        F = []
        score = 0.0
        nmid = window_size / 2
        for x in track:
            F.append(x)
            score += x[2]
            if len(F) < window_size:
                continue
            yield (F[nmid][0], F[nmid][1], round(score * denom + 1e-7, 6)) + F[nmid][3:]
            # Slide the window by *step_size* features.
            for shift in xrange(step_size):
                score -= F.pop(0)[2]

    def _running_mean(track, win_start, denom):
        # Base-pair smoothing. F holds the features currently intersecting the
        # window [win_start, win_end); *score* is the running window average,
        # updated by *delta* for every base the window moves.
        score = 0.0
        F = []
        for x in track:
            F.append(x)
            fstart = F[0][0]
            fend = F[0][1]
            # NOTE(review): reads a 4th column although the stream is only
            # reordered w.r.t. ('start','end','score') - presumably the
            # chromosome; this fails on 3-field streams. Confirm.
            chrom = F[0][3]
            win_start = max(win_start, fstart - window_size)
            win_end = win_start + window_size
            lstart = F[-1][0]
            lend = F[-1][1]
            # Move the window until its right edge reaches the end of the
            # last feature loaded.
            while win_end < lend:
                delta = 0
                # Number of bases the window can move before the set of
                # features under its edges changes.
                steps = [fend - win_start, lend - win_end]
                if fstart > win_start:
                    steps.append(fstart - win_start)
                else:
                    delta -= F[0][2]  # the left edge leaves the first feature
                if lstart > win_end:
                    steps.append(lstart - win_end)
                else:
                    delta += F[-1][2]  # the right edge enters the last feature
                nsteps = min(steps)
                # Offsets aligning the reported positions on multiples of step_size.
                sst = -(win_start % step_size) % step_size
                # NOTE(review): '%' binds tighter than '+', so this is
                # win_start + (nsteps % step_size); possibly meant
                # (win_start + nsteps) % step_size. Confirm.
                sen = -(win_start + nsteps % step_size) % step_size
                win_center = (win_start + win_end) / 2
                if abs(delta) > 1e-11:
                    # The score changes along the way: one item per step.
                    delta *= denom
                    score += delta * sst
                    for step in xrange(sst, nsteps, step_size):
                        if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                            yield (win_center + step, win_center + step + step_size, score, chrom)
                        score += delta * step_size
                    score -= delta * sen
                else:
                    # Constant score: a single item for the whole stretch.
                    if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                        yield (win_center + sst, win_center + sen + nsteps, score, chrom)
                win_start += nsteps
                win_end += nsteps
                # The window has left the first feature: forget it.
                if fend <= win_start:
                    F.pop(0)
                    if F:
                        fstart = F[0][0]
                        fend = F[0][1]
                        win_start = max(win_start, fstart - window_size)
                        win_end = win_start + window_size
                if win_end > stop_val:
                    break
        # The track is exhausted: slide the window out of the remaining
        # features (nothing enters on the right any more).
        while F:
            delta = 0
            steps = [fend - win_start]
            if fstart > win_start:
                steps.append(fstart - win_start)
            else:
                delta -= F[0][2]
            nsteps = min(steps)
            sst = -(win_start % step_size) % step_size
            sen = -(win_start + nsteps % step_size) % step_size
            win_center = (win_start + win_end) / 2
            if abs(delta) > 1e-11:
                delta *= denom
                score += delta * sst
                for step in xrange(sst, nsteps, step_size):
                    if score > 1e-11 and win_center + step >= 0 and win_center + step + step_size <= stop_val:
                        yield (win_center + step, win_center + step + step_size, score, chrom)
                    score += delta * step_size
                score -= delta * sen
            else:
                if score > 1e-11 and win_center + sst >= 0 and win_center + sen + nsteps <= stop_val:
                    yield (win_center + sst, win_center + sen + nsteps, score, chrom)
            win_start += nsteps
            win_end += nsteps
            if fend <= win_start:
                F.pop(0)
                if F:
                    fstart = F[0][0]
                    fend = F[0][1]
                    win_start = max(win_start, fstart - window_size)
                    win_end = win_start + window_size
            if win_end > stop_val:
                break

    denom = 1.0 / window_size
    win_start = -window_size
    _f = ['start', 'end', 'score']
    if featurewise:
        call = _stepping_mean
    else:
        call = _running_mean
    # NOTE(review): the output streams declare the 3 fields of _f, while
    # _running_mean yields 4-item tuples (with the chromosome) - confirm that
    # consumers expect this.
    if isinstance(trackList, (list, tuple)):
        return [
            FeatureStream(call(common.reorder(t, _f), win_start, denom),
                          fields=_f) for n, t in enumerate(trackList)
        ]
    else:
        return FeatureStream(call(common.reorder(trackList, _f), win_start, denom),
                             fields=_f)
def score_by_feature(trackScores, trackFeatures, method='mean'):
    """
    Summarize, for each feature of *trackFeatures*, the scores it covers.

    For every feature, the per-base scores of each score track falling inside the feature
    are collected and reduced with *method* (averaged by default).
    Warning: both score and feature streams must be sorted!
    (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional `score` field
    for each stream in *trackScores*::

        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______

        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track. A list of streams is
        merged with `concatenate` first.
    :param method: (str or function) operation applied to the list of scores from one feature.
        Can be one of 'sum','mean','median','min','max', or a custom function taking the
        list of scores as its only argument.
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        # One sentinel-terminated source and one buffer of pending score items per score track.
        sources = [common.sentinelize(track, [sys.maxint] * len(track.fields)) for track in ts]
        buffers = [[(-sys.maxint, -sys.maxint, 0.0)] for _ in ts]
        start_idx = tf.fields.index('start')
        end_idx = tf.fields.index('end')
        if hasattr(method, '__call__'):
            # Custom reducers only receive the list of scores.
            mean_fn = lambda scores, denom: method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for feature in tf:
            ystart = feature[start_idx]
            yend = feature[end_idx]
            inv_length = 1.0 / (yend - ystart)
            summary = []
            for i, source in enumerate(sources):
                pending = buffers[i]
                item = pending[-1]
                # Pull score items until one starts at or after the end of the feature,
                # buffering those that end after the feature start.
                while item[0] < yend:
                    item = source.next()
                    if item[1] > ystart:
                        pending.append(item)
                # Forget the leading items that end before the feature begins.
                first_kept = 0
                while pending[first_kept][1] <= ystart:
                    first_kept += 1
                pending = pending[first_kept:]
                buffers[i] = pending
                # One score value per base pair of overlap with the feature.
                per_base = []
                for item in pending:
                    if yend <= item[0]:
                        continue
                    left = ystart if item[0] < ystart else item[0]
                    right = yend if yend < item[1] else item[1]
                    per_base.extend([item[2]] * (right - left))
                summary.append(mean_fn(per_base, inv_length))
            yield tuple(feature) + tuple(summary)

    if not isinstance(trackScores, (list, tuple)):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    # Number the new score fields when there are several, or when 'score' is already taken.
    needs_numbering = len(trackScores) > 1 or 'score' in trackFeatures.fields
    if needs_numbering:
        _fields = ["score" + str(i) for i in range(len(trackScores))]
    else:
        _fields = ["score"]
    score_streams = [common.reorder(track, ['start', 'end', 'score']) for track in trackScores]
    return FeatureStream(_stream(score_streams, trackFeatures), trackFeatures.fields + _fields)
def filter_scores(trackScores, trackFeatures, method='sum', strict=False, annotate=False,
                  flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and features streams must be sorted!
    (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default
    (to avoid score duplications). An alternative is
    :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*,
    only scores inside a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        strictly contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function to be applied to *trackFeatures* before all. [common.cobble]
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        tf = common.sentinelize(tf, [sys.maxint] * len(tf.fields))
        # Columns of the feature stream that the score stream does not already have.
        info_idx = [k for k, f in enumerate(tf.fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index('strand')
            tf_strand_idx = tf.fields.index('strand')
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []  # features that may still intersect the current or a later score item
        ynext = (-sys.maxint, -sys.maxint, 0.0)  # look-ahead feature, not yet in Y
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Drop every buffered feature that ends at or before xstart: scores are sorted,
            # so such a feature cannot intersect this or any later score item.
            # The look-ahead `ynext` is deliberately NOT put back into Y here: the loading
            # loop above appends it when it becomes relevant, and storing it twice would
            # make the same intersection be yielded twice.
            Y = [y for y in Y if y[1] > xstart]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y):
                    continue
                if strict and (y[0] > xstart or y[1] < xend):
                    continue
                if y[0] >= xend:
                    continue  # keep for next iteration
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    # Extra output fields contributed by the feature track (only when annotating).
    _info_fields = [
        f for f in trackFeatures.fields if f not in trackScores.fields
    ] if annotate else []
    # Strand is only taken into account when both inputs carry it.
    stranded = 'strand' in (set(trackScores.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ['start', 'end'])
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
def _add_label(s, x):
    """
    Return a copy of stream *s* in which every item gets *x* appended as a last element,
    under a new trailing field called 'track_name'.

    :param s: FeatureStream object.
    :param x: value appended to each item of *s*.
    :rtype: FeatureStream
    """
    labelled_fields = s.fields + ['track_name']
    labelled_items = (item + (x, ) for item in s)
    return FeatureStream(labelled_items, fields=labelled_fields)
def concatenate(trackList, fields=None, remove_duplicates=False, group_by=None, aggregate={}):
    """
    Return one stream containing all features from a list of tracks, ordered by *fields*.

    :param trackList: list of FeatureStream objects. A list of length one is returned as its
        single element, untouched.
    :param fields: (list of str) list of fields to keep in the output (at least ['start','end']).
        Defaults to the fields of the first track; only fields present in every track are kept.
    :param remove_duplicates: (bool) whether to remove items that are identical in several
        of the tracks in *trackList*. [False]
    :param group_by: (list of str) if specified, elements having all values for these fields in
        common will be merged into a single element. Other fields are merged according to
        *aggregate* if specified, or `common.generic_merge` by default.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all different values for this field in order to
        merge them. E.g. ``{'score': lambda x: sum(x)}`` will return the sum of all scores
        in the output. The dictionary is only read, never modified.
    :rtype: FeatureStream
    """
    def _find_min(feat_tuple):
        """Return the index of the 'smallest' element amongst a tuple of features from
        different tracks. Elements are compared field by field through the hash of their
        values: priority is given to the first field; if the first field items are equal
        amongst several elements, it looks at the second field, a.s.o."""
        best_pos = 0
        best = feat_tuple[0]
        for pos, candidate in enumerate(feat_tuple[1:], 1):
            # A track that has reached its sentinel never wins.
            if candidate[0] == sys.maxint:
                continue
            for k in range(len(candidate)):
                h_candidate = hash(candidate[k])
                h_best = hash(best[k])
                if h_candidate < h_best:
                    best = candidate
                    best_pos = pos
                    break
                if h_candidate > h_best:
                    break
        return best_pos

    def _weave(_t, N):
        """Generator yielding all features represented in a list of tracks *_t*,
        sorted w.r.t the *N* first fields."""
        heads = [track.next()[:N] for track in _t]  # current item of each track
        allfields = [track.fields for track in _t]
        n = _find_min(heads)
        last = heads[n]
        heads[n] = _t[n].next()[:N]
        if not group_by:
            yield last
        while True:
            if remove_duplicates:
                # Advance every track whose current item also heads another track,
                # until all current items differ.
                while any(heads.count(h) != 1 for h in heads):
                    for k in range(len(heads)):
                        if heads.count(heads[k]) > 1:
                            heads[k] = _t[k].next()[:N]
            n = _find_min(heads)
            smallest = heads[n]
            if smallest[0] == sys.maxint:
                # Only sentinels are left: every track is exhausted.
                break
            if not group_by:
                yield smallest
            else:
                track_fields = allfields[n]
                idx = [track_fields.index(f) for f in group_by]
                if all(smallest[i] == last[i] for i in idx):
                    # Same group as the pending element: merge the two field by field.
                    merged = []
                    for i in range(len(track_fields)):
                        if i in idx:
                            merged.append(smallest[i])
                        else:
                            merge_fn = aggregate.get(track_fields[i], common.generic_merge)
                            merged.append(merge_fn((last[i], smallest[i])))
                    last = tuple(merged)
                else:
                    yield last
                    last = smallest
            heads[n] = _t[n].next()[:N]
        if group_by:
            # Flush the element still pending in the last group.
            yield last

    if len(trackList) == 1:
        return trackList[0]
    if fields is None:
        fields = trackList[0].fields
    # Keep only the fields shared by all tracks.
    fields = [f for f in fields if all(f in track.fields for track in trackList)]
    # Sort keys: 'chr' (if any) first, then start and end, then 'name' (if any), then the rest.
    _of = ['start', 'end']
    if 'chr' in fields:
        _of = ['chr'] + _of
    if 'name' in fields:
        _of += ['name']
    _of += [f for f in fields if f not in _of]
    reordered = [common.reorder(track, _of) for track in trackList]
    # Terminate every track with an all-maxint sentinel item.
    guarded = [
        FeatureStream(common.sentinelize(track, (sys.maxint,) * len(track.fields)), track.fields)
        for track in reordered
    ]
    return FeatureStream(_weave(guarded, len(_of)), fields=_of)