def test_concatenate(self):
    s1 = [('chr',1,3,0.2,'n'), ('chr',5,9,0.5,'n'), ('chr',11,15,1.2,'n')]
    s2 = [('chr',1,4,0.6,'m'), ('chr',8,11,0.4,'m'), ('chr',11,12,0.1,'m')]
    stream1 = fstream(s1, fields=['chr','start','end','score','name'])
    stream2 = fstream(s2, fields=['chr','start','end','score','name'])
    res = list(concatenate([stream1,stream2], fields=['start','score','name']))
    expected = [(1,3,0.2,'n'),(1,4,0.6,'m'),(5,9,0.5,'n'),
                (8,11,0.4,'m'),(11,12,0.1,'m'),(11,15,1.2,'n')]
    self.assertListEqual(res, expected)

    # Keep chr and compare items w.r.t. chr
    s1 = [('chr',1,3,0.2,'n'), ('chr',5,9,0.5,'n'), ('chr',11,15,1.2,'n')]
    s2 = [('chr',1,4,0.6,'m'), ('chrX',8,11,0.4,'m'), ('chrX',11,12,0.1,'m')]
    stream1 = fstream(s1, fields=['chr','start','end','score','name'])
    stream2 = fstream(s2, fields=['chr','start','end','score','name'])
    res = list(concatenate([stream1,stream2], fields=['chr','start','end','score']))
    expected = [('chr',1,3,0.2),('chr',1,4,0.6),('chr',5,9,0.5),
                ('chr',11,15,1.2),('chrX',8,11,0.4),('chrX',11,12,0.1)]
    self.assertListEqual(sorted(res), sorted(expected))

    # Remove duplicates
    stream1 = fstream([(1,2),(3,4),(5,6)], fields=['start','end'])
    stream2 = fstream([(3,4),(5,6),(7,8)], fields=['start','end'])
    res = list(concatenate([stream1,stream2], fields=['start','end'],
                           remove_duplicates=True))
    expected = [(1,2),(3,4),(5,6),(7,8)]
    self.assertListEqual(res, expected)

    # Group by
    s1 = [('chr',1,4,0.2,'n'), ('chr',5,9,0.5,'n'), ('chr',11,15,1.2,'n')]
    s2 = [('chr',1,4,0.6,'m'), ('chr',8,11,0.4,'m'), ('chrX',11,15,0.1,'m')]
    group_by = ['chr','start','end']
    aggregate = {'score': lambda x: sum(x), 'name': lambda x: '-'.join(x)}
    stream1 = fstream(s1, fields=['chr','start','end','score','name'])
    stream2 = fstream(s2, fields=['chr','start','end','score','name'])
    res = list(concatenate([stream1,stream2], fields=['chr','start','score','name'],
                           group_by=group_by, aggregate=aggregate))
    expected = [('chr',1,4,0.8,'m-n'),('chr',5,9,0.5,'n'),('chr',8,11,0.4,'m'),
                ('chr',11,15,1.2,'n'),('chrX',11,15,0.1,'m')]
    self.assertListEqual(sorted(res), sorted(expected))
def merge_junc_files(trackList, assembly):
    out = track('all.junc', format='txt',
                fields=['chr','start','end','strand','score'])
    from bbcflib.genrep import Assembly
    a = Assembly(assembly)
    for c in a.chromosomes:
        tl = [track(t, fields=['chr','start','end','strand','score'],
                    format='txt').read(str(c[0])+'_'+c[1]+'.'+str(c[2]))
              for t in trackList]
        #merged = concatenate(tl, remove_duplicates=True)
        merged = concatenate(tl, group_by=['chr','start','end'],
                             aggregate={'score': lambda x: sum(x)})
        out.write(merged, mode='append')
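# A minimal usage sketch for merge_junc_files. The file names and assembly
# name below are assumptions for illustration ('sample1.junc' etc. would be
# junction text files with the fields listed above, 'mm9' an assembly name
# known to GenRep):
#
#     merge_junc_files(['sample1.junc', 'sample2.junc'], 'mm9')
#
# A junction present in several input files appears once in 'all.junc',
# with its scores summed by the group_by/aggregate combination.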
def score_by_feature(trackScores, trackFeatures, method='mean'):
    """
    For every feature from *trackFeatures*, get the list of all scores it contains
    and apply an operation *method* on this list (by default, scores are averaged).
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream`
    if necessary). The output is a stream similar to *trackFeatures* but with an
    additional `score` field for each stream in *trackScores*::

        method = 'mean':
        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[    3.    ]______________[   6.   ]______

        method = 'sum':
        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[   30,6   ]______________[  60,9  ]______

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
    :param method: (str or function) operation applied to the list of scores from
        one feature. Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        X = [common.sentinelize(x, [sys.maxint]*len(x.fields)) for x in ts]
        S = [[(-sys.maxint, -sys.maxint, 0.0)] for t in ts]
        start_idx = tf.fields.index('start')
        end_idx = tf.fields.index('end')
        if hasattr(method, '__call__'):
            mean_fn = lambda scores, denom: method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for y in tf:
            ystart = y[start_idx]
            yend = y[end_idx]
            scores = ()
            for i in range(len(ts)):
                xnext = S[i][-1]
                # Load into S all score items which intersect feature y
                while xnext[0] < yend:
                    xnext = X[i].next()
                    if xnext[1] > ystart:
                        S[i].append(xnext)
                # Drop score items entirely left of y
                n = 0
                while S[i][n][1] <= ystart:
                    n += 1
                S[i] = S[i][n:]
                scores_y = []
                for s in S[i]:
                    if yend <= s[0]: continue
                    start = max(ystart, s[0])
                    end = min(yend, s[1])
                    scores_y.extend([s[2]] * (end-start))
                scores += (mean_fn(scores_y, 1.0/(yend-ystart)),)
            yield tuple(y) + scores

    if not isinstance(trackScores, (list, tuple)):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if len(trackScores) > 1 or 'score' in trackFeatures.fields:
        _fields = ["score"+str(i) for i in range(len(trackScores))]
    else:
        _fields = ["score"]
    _ts = [common.reorder(t, ['start','end','score']) for t in trackScores]
    return FeatureStream(_stream(_ts, trackFeatures),
                         trackFeatures.fields + _fields)
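# A minimal sketch of score_by_feature, reusing the fstream test helper from
# the tests above (an assumption for illustration; streams must be sorted by
# position). The expected value follows the 'mean' diagram in the docstring:
# the feature spans 10 positions, 5 of which carry score 6.0, so the mean
# over the whole feature is 3.0.
scores = fstream([(10, 20, 6.0)], fields=['start', 'end', 'score'])
features = fstream([(5, 15, 'a')], fields=['start', 'end', 'name'])
res = list(score_by_feature(scores, features, method='mean'))
# res == [(5, 15, 'a', 3.0)]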
def filter_scores(trackScores, trackFeatures, method='sum',
                  strict=False, annotate=False, flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and feature streams must be sorted!
    (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default
    (to avoid score duplications). An alternative is
    :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*,
    only scores inside a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        strictly contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function applied to *trackFeatures* beforehand. [common.cobble]
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        tf = common.sentinelize(tf, [sys.maxint]*len(tf.fields))
        info_idx = [k for k, f in enumerate(tf.fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index('strand')
            tf_strand_idx = tf.fields.index('strand')
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []
        ynext = (-sys.maxint, -sys.maxint, 0.0)
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Remove features that are far behind x
            if Y:
                n = 0
                try:
                    while Y[n][1] <= xstart:
                        n += 1
                    Y = Y[n:]
                except IndexError:
                    Y = [ynext]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y):
                    continue
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                if strict and (y[0] > xstart or y[1] < xend):
                    continue
                if y[0] >= xend:
                    continue  # keep for next iteration
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    _info_fields = [f for f in trackFeatures.fields
                    if f not in trackScores.fields] if annotate else []
    stranded = 'strand' in (set(trackScores.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ['start', 'end'])
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
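# A minimal sketch of filter_scores with the same fstream helper (again an
# assumption for illustration; cobble leaves a single non-overlapping feature
# region unchanged). Only the part of the score region that overlaps a
# feature region survives, as in the docstring example.
scores = fstream([(5, 14, 6.0)], fields=['start', 'end', 'score'])
features = fstream([(10, 20)], fields=['start', 'end'])
res = list(filter_scores(scores, features))
# res == [(10, 14, 6.0)]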
def __call__(self, **kw):
    def _parse_logic(string):
        s = re.sub(r'[^\w\d!=><\. ]', '', string)
        s = re.sub(r' OR ', ')or(%f ', s)
        s = re.sub(r' AND ', ')and(%f ', s)
        return "(%f " + s + ")"

    def _run_test(row, indx, cond):
        num = float(row[col_ind[indx]])
        num = max(-sys.maxint, min(sys.maxint, num))
        num = (num,) * cond.count("%f")
        return eval(cond % num)

    def _add_label(s, x):
        _f = s.fields + ['track_name']
        return FeatureStream((y+(x,) for y in s), fields=_f)

    venn_options = {}  # tune it here
    tracks = []
    intype = kw.get("input_type") or "Table"
    if intype == "Table":
        s_cols = kw.get('id_columns', '')
        s_filters = kw.get('filters', '')
        infile = track(kw.get('table', ''), format='txt', header=True)
        col_ind = [int(i)-1 for i in s_cols.split(",")]
        legend = [infile.fields[i] if i < len(infile.fields) else str(i)
                  for i in col_ind]
        conds = [_parse_logic(x) for x in s_filters.split(",")]
        tlabels = [chr(k+65) for k in range(len(col_ind))]
        conds += ["1"] * (len(col_ind)-len(conds))
        combn = [tuple(sorted(x)) for k in range(len(tlabels))
                 for x in combinations(tlabels, k+1)]
        c1 = dict(("|".join(c), 0) for c in combn)
        c2 = dict(("|".join(c), 0) for c in combn)
        indx = dict((c, [tlabels.index(x) for x in c]) for c in combn)
        for row in infile:
            tests = [_run_test(row, i, c) for i, c in enumerate(conds)]
            c1["|".join([tlabels[n] for n, t in enumerate(tests) if t])] += 1
            for c in combn:
                c2["|".join(c)] += all([tests[i] for i in indx[c]])
        nsamples = len(col_ind)
        combn = ['|'.join(x) for x in combn]
    elif intype == "Tracks":
        #filenames = kw['TrMulti']['files']
        filenames = kw['files']
        if not isinstance(filenames, (list, tuple)):
            filenames = [filenames]
        for f in filenames:
            assert os.path.exists(f), "File not found: %s ." % f
        tracks = [track(f, chrmeta='guess') for f in filenames]
        nsamples = len(tracks)
        tlabels = [chr(k+65) for k in range(len(tracks))]
        combn = [combinations(tlabels, k+1) for k in range(len(tlabels))]
        combn = ['|'.join(sorted(y)) for x in combn for y in x]
        c1 = dict(zip(combn, [0]*len(combn)))
        c2 = dict(zip(combn, [0]*len(combn)))
        total_cov = 0.0
        _scored = (kw.get('type') == 'score')
        chromset = set([c for t in tracks for c in t.chrmeta])
        for chrom in chromset:
            streams = [_add_label(t.read(chrom), tlabels[n])
                       for n, t in enumerate(tracks)]
            s = cobble(concatenate(streams), scored=_scored)
            name_idx = s.fields.index('track_name')
            start_idx = s.fields.index('start')
            end_idx = s.fields.index('end')
            if _scored:
                score_idx = s.fields.index('score')
            for x in s:
                length = x[end_idx] - x[start_idx]
                total_cov += length
                sub = sorted(list(set(x[name_idx].split('|'))))  # avoid 'A|A'
                cb = [combinations(sub, k) for k in range(1, len(sub)+1)]
                cb = ['|'.join(sorted(y)) for c in cb for y in c]
                if _scored:
                    c1['|'.join(sub)] += x[score_idx]
                    for c in cb:
                        c2[c] += x[score_idx]
                else:
                    c1['|'.join(sub)] += length
                    for c in cb:
                        c2[c] += length
        if total_cov < 1:
            output = self.temporary_path(fname='venn_summary.txt')
            with open(output, 'wb') as summary:
                summary.write("Empty content (no coverage) on %s."
                              % (",".join(chromset)))
            self.new_file(output, 'venn_summary')
            return
        legend = [t.name for t in tracks]
        if _scored:
            for c in combn:
                c2[c] = round(c2[c])
        else:
            for c in combn:
                c2[c] = round((100*c2[c]) / total_cov)
                c1[c] = (100*c1[c]) / total_cov
    else:
        raise ValueError("Input type '%s' not supported." % intype)

    if nsamples <= 4:
        format = kw.get('output') or 'pdf'
        output = self.temporary_path(fname='venn_diagram.'+format)
        venn(c2, legend=legend, options=venn_options,
             output=output, format=format)
        self.new_file(output, 'venn_diagram')

    # Text summary
    output = self.temporary_path(fname='venn_summary.txt')
    with open(output, 'w') as summary:
        summary.write("%s\t%s\t%s\n" % ("Group", "Coverage", "Cumulative coverage"))
        record = "%s\t%.2f\t%d\n"
        for c in sorted(combn, key=lambda x: (len(x), x)):
            summary.write(record % (c, c1[c], c2[c]))
    self.new_file(output, 'venn_summary')
    return self.display_time()
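# A sketch of the filter mini-language handled by _parse_logic/_run_test
# above (the example strings are illustrative): each comparison gets a '%f'
# placeholder that receives the row's numeric value before eval.
#
#     _parse_logic('>0.5 AND <10')  ->  '(%f >0.5)and(%f <10)'
#
# For a row value of 2.0 the condition becomes '(2.0 >0.5)and(2.0 <10)',
# which evaluates to True.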
def score_by_feature(trackScores, trackFeatures, method="mean"): """ For every feature from *trackFeatures*, get the list of all scores it contains and apply an operation *method* on this list (by default, scores are averaged). Warning: both score and feature streams must be sorted! (use `common.sorted_stream` is necessary). The output is a stream similar to *trackFeatures* but with an additional `score` field for each stream in *trackScores*:: method = 'mean': X: ------##########--------------##########------ Y: ___________666666666__________6666666666______ R: ______[ 3. ]______________[ 6. ]______ method = 'sum': X : ------##########--------------##########------ Y1: ___________666666666__________6666666666______ Y2: ___222222_____________________333_____________ R : ______[ 30,6 ]______________[ 60,9 ]______ :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream). :param trackFeatures: (FeatureStream) one -sorted- feature track. :param method: (str of function): operation applied to the list of scores from one feature. Can be one of 'sum','mean','median','min','max', or a custom function. :rtype: FeatureStream """ def _stream(ts, tf): X = [common.sentinelize(x, [sys.maxint] * len(x.fields)) for x in ts] S = [[(-sys.maxint, -sys.maxint, 0.0)] for t in ts] start_idx = tf.fields.index("start") end_idx = tf.fields.index("end") if hasattr(method, "__call__"): mean_fn = lambda scores, denom: method(scores) else: mean_fn = _score_functions.get(method, _arithmetic_mean) for y in tf: ystart = y[start_idx] yend = y[end_idx] scores = () for i in range(len(ts)): xnext = S[i][-1] # Load into S all score items which intersect feature y while xnext[0] < yend: xnext = X[i].next() if xnext[1] > ystart: S[i].append(xnext) n = 0 while S[i][n][1] <= ystart: n += 1 S[i] = S[i][n:] scores_y = [] for s in S[i]: if yend <= s[0]: continue if s[0] < ystart: start = ystart else: start = s[0] if yend < s[1]: end = yend else: end = s[1] scores_y.extend([s[2]] * (end - start)) scores += (mean_fn(scores_y, 1.0 / (yend - ystart)),) yield tuple(y) + scores if not (isinstance(trackScores, (list, tuple))): trackScores = [trackScores] if isinstance(trackFeatures, (list, tuple)): trackFeatures = concatenate(trackFeatures) if len(trackScores) > 1 or "score" in trackFeatures.fields: _fields = ["score" + str(i) for i in range(len(trackScores))] else: _fields = ["score"] _ts = [common.reorder(t, ["start", "end", "score"]) for t in trackScores] return FeatureStream(_stream(_ts, trackFeatures), trackFeatures.fields + _fields)
def filter_scores(trackScores, trackFeatures, method="sum", strict=False, annotate=False, flatten=common.cobble): """ Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions. Warning: both score and features streams must be sorted! (use `common.sorted_stream` if necessary). Example:: X: _____#########__________#############_______ Y: __________666666666___2222776_444___________ R: __________6666__________22776_444___________ Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default (to avoid score duplications). An alternative is :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing. If strand information is present in both *trackScores* and *trackFeatures*, only scores inside a region of the same strand are kept. :param trackScores: (FeatureStream) one -sorted- score track. If a list of streams is provided, they will be merged (using `merge_scores`). :param trackFeatures: (FeatureStream) one -sorted- feature track. If a list of streams is provided, they will be merged (using `concatenate`). :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum'] :param strict: (bool) if True, only score regions from *trackScores* that are strictly contained in a feature region of *trackFeatures* will be returned. [False] :param annotate: (bool) if True, supplementary annotation (and the corresponding fields) from *trackFeatures* will be added to the result. [False] :param flatten: (func) one of None, `common.fusion` or `common.cobble`. Function to be applied to *trackFeatures* before all. [common.cobble] :rtype: FeatureStream """ def _stream(ts, tf): tf = common.sentinelize(tf, [sys.maxint] * len(tf.fields)) info_idx = [k for k, f in enumerate(tf.fields) if f not in ts.fields] if stranded: ts_strand_idx = ts.fields.index("strand") tf_strand_idx = tf.fields.index("strand") same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx] else: same_strand = lambda x, y: True Y = [] ynext = (-sys.maxint, -sys.maxint, 0.0) for x in ts: xstart = x[0] xend = x[1] # Load into Y all feature items which intersect score x while ynext[0] < xend: if ynext[1] > xstart: Y.append(ynext) ynext = tf.next() # Remove features that are far behind x if Y: n = 0 try: while Y[n][1] <= xstart: n += 1 Y = Y[n:] except IndexError: Y = [ynext] # Yield intersections for y in Y: if not same_strand(x, y): continue info = tuple([y[k] for k in info_idx]) if annotate else () if strict and (y[0] > xstart or y[1] < xend): continue if y[0] >= xend: continue # keep for next iteration start = xstart if y[0] < xstart else y[0] end = xend if y[1] > xend else y[1] yield (start, end) + tuple(x[2:]) + info if isinstance(trackFeatures, (list, tuple)): trackFeatures = concatenate(trackFeatures) if isinstance(trackScores, (list, tuple)): trackScores = merge_scores(trackScores, method) _info_fields = [f for f in trackFeatures.fields if f not in trackScores.fields] if annotate else [] stranded = "strand" in (set(trackScores.fields) & set(trackFeatures.fields)) if flatten is None: _tf = trackFeatures else: _tf = flatten(trackFeatures, stranded=stranded) _ts = common.reorder(trackScores, ["start", "end"]) _tf = common.reorder(_tf, ["start", "end"]) return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
def chipseq_workflow(ex, job_or_dict, assembly, script_path='',
                     logfile=sys.stdout, via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq.
    Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,
    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys
        'groups', 'files' and 'options' if applicable,
    :param assembly: a genrep.Assembly object,
    :param script_path: only needed if 'run_deconv' is in the job options,
        must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``)
    are set as follows:

    * ``'-bw'``: 200 ('bandwidth')
    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*,
    if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups,
    *macs* and *deconv* if applicable and values file description dictionaries,
    and a dictionary of *group_ids* to *names* used in file descriptions.
    """
    options = {}
    if logfile is None:
        logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if 'name' not in groups[gid]:
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not isinstance(mapseq_files, dict):
        raise TypeError("mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError("mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex, mapped[k]["bam"], via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh[group_name] = sum(ptruns) / len(ptruns)
        for k in futures.keys():
            # wait for the bamstats jobs launched above
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
            genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n"); logfile.flush()
    processed = {'macs': add_macs_results(ex, read_length, genome_size,
                                          tests, ctrlbam=controls, name=names,
                                          poisson_threshold=p_thresh,
                                          macs_args=macs_args, via=via)}
    logfile.write("Done MACS.\n"); logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate(
                [apply(track(processed['macs'][(name,x)]+"_summits.bed",
                             chrmeta=chrlist, fields=_fields).read(selection=_select),
                       'name', lambda __n, _n=xn: "%s:%i" % (__n, _n))
                 for xn, x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name], chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        # if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or 'wig' not in m or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(
                    ex, m["bam"], assembly.chrmeta,
                    nreads=m["stats"]["total"], merge=-1,
                    read_extension=options['read_extension'],
                    convert=False, b2w_args=b2w_args, via=via)
                wig.append(dict((s, output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict(
                (s, merge_sql(ex, [x[s] for x in wig], via=via))
                for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0],) + ((x[2]+x[1])/2-150, (x[2]+x[1])/2+150) + x[3:]
                 for x in stream
                 if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n"); logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(
                    ex, [processed['macs'][(name,x)]+"_peaks.bed"
                         for x in names['controls']], via=via)
            deconv = run_deconv(ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                options['read_extension'], script_path, via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e)); logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs(stream, xlsl, _f):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p + xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream(_macs_row(stream), fields=_f)
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist, chrmeta=chrlist,
                       fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls"
                                     for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the
            ###### "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance'] \
                      + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(
                ['chromosome','start','end','info','peak_height',
                 'gene(s)','location_type','distance'] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(
                    ptrack.read(selection=chrom), _feat), xlsl, _fields),
                    mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score'] \
                      + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(
                ['chromosome','start','end','info','peak_height'] + _fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',
                                          type='text', step='annotation',
                                          groupId=name[0]))
    stracks = [track(wig, info={'name': name+"_"+st})
               for name, wigdict in merged_wig.iteritems()
               for st, wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = ["MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end'] + _pnames
                            + [s.name for s in stracks]) + "\n")
    #### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom, fields=['chr','start','end','name']),
                        'name', lambda __n, _n=npt: "%s:%i" % (__n, _n))
                  for npt, pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'],
                                      remove_duplicates=True,
                                      group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while _k < len(_rnsplit)-1-int(_nc > 1):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in
                                    row[:nidx]+tuple(pcols)+row[nidx+1:]) + "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text', step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n"); logfile.flush()
        processed['meme'] = parallel_meme(
            ex, assembly, peak_list.values(), name=peak_list.keys(),
            chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
            via=via)
    return processed
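# A worked example of the enrichment bounds described in the docstring
# (the numbers are illustrative; the computation itself happens downstream,
# from the p_thresh dictionary passed to add_macs_results): with a Poisson
# threshold T = 2 for a group, the '-m' bounds become
#     (min(30, 5*(T+1)), 50*(T+1)) = (min(30, 15), 150) = (15, 150)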