Example #1
    def test_concatenate(self):
        s1 = [('chr',1,3,0.2,'n'), ('chr',5,9,0.5,'n'), ('chr',11,15,1.2,'n')]
        s2 = [('chr',1,4,0.6,'m'), ('chr',8,11,0.4,'m'), ('chr',11,12,0.1,'m')]
        stream1 = fstream(s1, fields=['chr','start','end','score','name'])
        stream2 = fstream(s2, fields=['chr','start','end','score','name'])
        res = list(concatenate([stream1,stream2], fields=['start','score','name']))
        expected = [(1,3,0.2,'n'),(1,4,0.6,'m'),(5,9,0.5,'n'),(8,11,0.4,'m'),(11,12,0.1,'m'),(11,15,1.2,'n')]
        self.assertListEqual(res,expected)

        # Keep chr and compare items w.r.t. chr
        s1 = [('chr',1,3,0.2,'n'), ('chr',5,9,0.5,'n'), ('chr',11,15,1.2,'n')]
        s2 = [('chr',1,4,0.6,'m'), ('chrX',8,11,0.4,'m'), ('chrX',11,12,0.1,'m')]
        stream1 = fstream(s1, fields=['chr','start','end','score','name'])
        stream2 = fstream(s2, fields=['chr','start','end','score','name'])
        res = list(concatenate([stream1,stream2], fields=['chr','start','end','score']))
        expected = [('chr',1,3,0.2),('chr',1,4,0.6),('chr',5,9,0.5),('chr',11,15,1.2),('chrX',8,11,0.4),('chrX',11,12,0.1)]
        self.assertListEqual(sorted(res),sorted(expected))

        # Remove duplicates
        stream1 = fstream([(1,2),(3,4),(5,6)], fields=['start','end'])
        stream2 = fstream([(3,4),(5,6),(7,8)], fields=['start','end'])
        res = list(concatenate([stream1,stream2], fields=['start','end'], remove_duplicates=True))
        expected = [(1,2),(3,4),(5,6),(7,8)]
        self.assertListEqual(res,expected)

        # Group by
        s1 = [('chr',1,4,0.2,'n'), ('chr',5,9,0.5,'n'), ('chr',11,15,1.2,'n')]
        s2 = [('chr',1,4,0.6,'m'), ('chr',8,11,0.4,'m'), ('chrX',11,15,0.1,'m')]
        group_by = ['chr','start','end']
        aggregate = {'score': lambda x:sum(x), 'name': lambda x:'-'.join(x)}
        stream1 = fstream(s1, fields=['chr','start','end','score','name'])
        stream2 = fstream(s2, fields=['chr','start','end','score','name'])
        res = list(concatenate([stream1,stream2], fields=['chr','start','score','name'], group_by=group_by, aggregate=aggregate))
        expected = [('chr',1,4,0.8,'m-n'),('chr',5,9,0.5,'n'),('chr',8,11,0.4,'m'),('chr',11,15,1.2,'n'),('chrX',11,15,0.1,'m')]
        self.assertListEqual(sorted(res),sorted(expected))
Example #2
    def test_concatenate(self):
        s1 = [('chr', 1, 3, 0.2, 'n'), ('chr', 5, 9, 0.5, 'n'),
              ('chr', 11, 15, 1.2, 'n')]
        s2 = [('chr', 1, 4, 0.6, 'm'), ('chr', 8, 11, 0.4, 'm'),
              ('chr', 11, 12, 0.1, 'm')]
        stream1 = fstream(s1, fields=['chr', 'start', 'end', 'score', 'name'])
        stream2 = fstream(s2, fields=['chr', 'start', 'end', 'score', 'name'])
        res = list(
            concatenate([stream1, stream2], fields=['start', 'score', 'name']))
        expected = [(1, 3, 0.2, 'n'), (1, 4, 0.6, 'm'), (5, 9, 0.5, 'n'),
                    (8, 11, 0.4, 'm'), (11, 12, 0.1, 'm'), (11, 15, 1.2, 'n')]
        self.assertListEqual(res, expected)

        # Keep chr and compare items w.r.t. chr
        s1 = [('chr', 1, 3, 0.2, 'n'), ('chr', 5, 9, 0.5, 'n'),
              ('chr', 11, 15, 1.2, 'n')]
        s2 = [('chr', 1, 4, 0.6, 'm'), ('chrX', 8, 11, 0.4, 'm'),
              ('chrX', 11, 12, 0.1, 'm')]
        stream1 = fstream(s1, fields=['chr', 'start', 'end', 'score', 'name'])
        stream2 = fstream(s2, fields=['chr', 'start', 'end', 'score', 'name'])
        res = list(
            concatenate([stream1, stream2],
                        fields=['chr', 'start', 'end', 'score']))
        expected = [('chr', 1, 3, 0.2), ('chr', 1, 4, 0.6), ('chr', 5, 9, 0.5),
                    ('chr', 11, 15, 1.2), ('chrX', 8, 11, 0.4),
                    ('chrX', 11, 12, 0.1)]
        self.assertListEqual(sorted(res), sorted(expected))

        # Remove duplicates
        stream1 = fstream([(1, 2), (3, 4), (5, 6)], fields=['start', 'end'])
        stream2 = fstream([(3, 4), (5, 6), (7, 8)], fields=['start', 'end'])
        res = list(
            concatenate([stream1, stream2],
                        fields=['start', 'end'],
                        remove_duplicates=True))
        expected = [(1, 2), (3, 4), (5, 6), (7, 8)]
        self.assertListEqual(res, expected)

        # Group by
        s1 = [('chr', 1, 4, 0.2, 'n'), ('chr', 5, 9, 0.5, 'n'),
              ('chr', 11, 15, 1.2, 'n')]
        s2 = [('chr', 1, 4, 0.6, 'm'), ('chr', 8, 11, 0.4, 'm'),
              ('chrX', 11, 15, 0.1, 'm')]
        group_by = ['chr', 'start', 'end']
        aggregate = {'score': lambda x: sum(x), 'name': lambda x: '-'.join(x)}
        stream1 = fstream(s1, fields=['chr', 'start', 'end', 'score', 'name'])
        stream2 = fstream(s2, fields=['chr', 'start', 'end', 'score', 'name'])
        res = list(
            concatenate([stream1, stream2],
                        fields=['chr', 'start', 'score', 'name'],
                        group_by=group_by,
                        aggregate=aggregate))
        expected = [('chr', 1, 4, 0.8, 'm-n'), ('chr', 5, 9, 0.5, 'n'),
                    ('chr', 8, 11, 0.4, 'm'), ('chr', 11, 15, 1.2, 'n'),
                    ('chrX', 11, 15, 0.1, 'm')]
        self.assertListEqual(sorted(res), sorted(expected))
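
The tests above can be reproduced outside the test harness. Below is a minimal sketch of the `remove_duplicates` case, assuming `fstream` in the tests is a thin wrapper around `bbcflib.track.FeatureStream` and that `concatenate` is importable from `bbcflib.gfminer.stream` (both import paths are assumptions):

# Minimal sketch of the remove_duplicates case from the test above.
# Assumption: fstream in the tests wraps bbcflib.track.FeatureStream.
from bbcflib.track import FeatureStream
from bbcflib.gfminer.stream import concatenate  # import path assumed

s1 = FeatureStream(iter([(1, 2), (3, 4), (5, 6)]), fields=['start', 'end'])
s2 = FeatureStream(iter([(3, 4), (5, 6), (7, 8)]), fields=['start', 'end'])
merged = concatenate([s1, s2], fields=['start', 'end'], remove_duplicates=True)
print list(merged)  # [(1, 2), (3, 4), (5, 6), (7, 8)], as asserted above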
Example #3
def merge_junc_files(trackList,assembly):
    out = track('all.junc',format='txt',fields=['chr','start','end','strand','score'])
    from bbcflib.genrep import Assembly
    a = Assembly(assembly)
    for c in a.chromosomes:
        tl = [track(t,fields=['chr','start','end','strand','score'],format='txt').read(str(c[0])+'_'+c[1]+'.'+str(c[2]))
              for t in trackList]
        #all = concatenate(tl,remove_duplicates=True)
        all = concatenate(tl,group_by=['chr','start','end'],aggregate={'score':lambda x:sum(x)})
        out.write(all,mode='append')
Example #4
def merge_junc_files(trackList, assembly):
    out = track('all.junc',
                format='txt',
                fields=['chr', 'start', 'end', 'strand', 'score'])
    from bbcflib.genrep import Assembly
    a = Assembly(assembly)
    for c in a.chromosomes:
        tl = [
            track(t,
                  fields=['chr', 'start', 'end', 'strand', 'score'],
                  format='txt').read(str(c[0]) + '_' + c[1] + '.' + str(c[2]))
            for t in trackList
        ]
        #all = concatenate(tl,remove_duplicates=True)
        all = concatenate(tl,
                          group_by=['chr', 'start', 'end'],
                          aggregate={'score': lambda x: sum(x)})
        out.write(all, mode='append')
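
Both versions of merge_junc_files sum the scores of identical junctions across all input files, one chromosome at a time, and append the result to 'all.junc'. A hypothetical invocation follows (the file names and the assembly identifier are placeholders, not taken from the source):

# Hypothetical call: combine per-sample junction files for one assembly into
# a single 'all.junc', summing the scores of identical junctions.
junc_files = ['sample1.junc', 'sample2.junc', 'sample3.junc']  # placeholder paths
merge_junc_files(junc_files, 'mm9')  # 'mm9' is a placeholder assembly name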
Example #5
def score_by_feature(trackScores, trackFeatures, method='mean'):
    """
    For every feature from *trackFeatures*, get the list of all scores it contains
    and apply an operation *method* on this list (by default, scores are averaged).
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional `score` field
    for each stream in *trackScores*::
        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______


        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
    :param method: (str or function) operation applied to the list of scores from one feature.
        Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        X = [common.sentinelize(x, [sys.maxint] * len(x.fields)) for x in ts]
        S = [[(-sys.maxint, -sys.maxint, 0.0)] for t in ts]
        start_idx = tf.fields.index('start')
        end_idx = tf.fields.index('end')
        if hasattr(method, '__call__'):
            mean_fn = lambda scores, denom: method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for y in tf:
            ystart = y[start_idx]
            yend = y[end_idx]
            scores = ()
            for i in range(len(ts)):
                xnext = S[i][-1]
                # Load into S all score items which intersect feature y
                while xnext[0] < yend:
                    xnext = X[i].next()
                    if xnext[1] > ystart: S[i].append(xnext)
                n = 0
                while S[i][n][1] <= ystart:
                    n += 1
                S[i] = S[i][n:]
                scores_y = []
                for s in S[i]:
                    if yend <= s[0]: continue
                    if s[0] < ystart: start = ystart
                    else: start = s[0]
                    if yend < s[1]: end = yend
                    else: end = s[1]
                    scores_y.extend([s[2]] * (end - start))
                scores += (mean_fn(scores_y, 1.0 / (yend - ystart)), )
            yield tuple(y) + scores

    if not (isinstance(trackScores, (list, tuple))):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if len(trackScores) > 1 or 'score' in trackFeatures.fields:
        _fields = ["score" + str(i) for i in range(len(trackScores))]
    else:
        _fields = ["score"]
    _ts = [common.reorder(t, ['start', 'end', 'score']) for t in trackScores]
    return FeatureStream(_stream(_ts, trackFeatures),
                         trackFeatures.fields + _fields)
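
Following the docstring's 'mean' diagram, here is a minimal sketch of the call pattern (streams built with bbcflib.track.FeatureStream and already sorted; the coordinates and values are illustrative, not from the source):

# Sketch: average a signal over each feature; both streams must be sorted.
from bbcflib.track import FeatureStream

signal = FeatureStream(iter([(10, 20, 6.0), (40, 50, 6.0)]),
                       fields=['start', 'end', 'score'])
features = FeatureStream(iter([(5, 25, 'a'), (35, 55, 'b')]),
                         fields=['start', 'end', 'name'])
res = score_by_feature(signal, features, method='mean')
# Each feature gains a 'score' field averaged over its whole length, e.g.
# (5, 25, 'a', 3.0): the 6.0 signal covers only half of the 20-bp feature.
print list(res)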
Example #6
def filter_scores(trackScores,
                  trackFeatures,
                  method='sum',
                  strict=False,
                  annotate=False,
                  flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default (to avoid
    score duplications). An alternative is :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*, only scores inside
    a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        strictly contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function applied to *trackFeatures* beforehand. [common.cobble]
    :rtype: FeatureStream
    """
    def _stream(ts, tf):
        tf = common.sentinelize(tf, [sys.maxint] * len(tf.fields))
        info_idx = [k for k, f in enumerate(tf.fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index('strand')
            tf_strand_idx = tf.fields.index('strand')
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []
        ynext = (-sys.maxint, -sys.maxint, 0.0)
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Remove features that are far behind x
            if Y:
                n = 0
                try:
                    while Y[n][1] <= xstart:
                        n += 1
                    Y = Y[n:]
                except IndexError:
                    Y = [ynext]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y): continue
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                if strict and (y[0] > xstart or y[1] < xend): continue
                if y[0] >= xend: continue  # keep for next iteration
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    _info_fields = [
        f for f in trackFeatures.fields if f not in trackScores.fields
    ] if annotate else []
    stranded = 'strand' in (set(trackScores.fields)
                            & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ['start', 'end'])
    _tf = common.reorder(_tf, ['start', 'end'])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
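
A minimal sketch of filter_scores on in-memory streams (built with bbcflib.track.FeatureStream; the coordinates and values are illustrative, not from the source):

# Sketch: keep only the parts of the sorted score stream that fall inside features.
from bbcflib.track import FeatureStream

scores = FeatureStream(iter([(10, 20, 6.0), (22, 28, 2.0)]),
                       fields=['start', 'end', 'score'])
features = FeatureStream(iter([(5, 15, 'a'), (24, 40, 'b')]),
                         fields=['start', 'end', 'name'])
res = filter_scores(scores, features)
# Expected under these assumptions: [(10, 15, 6.0), (24, 28, 2.0)]
print list(res)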
Example #7
    def __call__(self, **kw):

        def _parse_logic(string):
            s = re.sub(r'[^\w\d!=><\. ]', '', string)
            s = re.sub(r' OR ', ')or(%f ', s)
            s = re.sub(r' AND ', ')and(%f ', s)
            return "(%f "+s+")"

        def _run_test(row, indx, cond):
            num = float(row[col_ind[indx]])
            num = max(-sys.maxint,min(sys.maxint,num))
            num = (num,)*cond.count("%f")
            return eval(cond % (num))

        def _add_label(s,x):
            _f = s.fields+['track_name']
            return FeatureStream((y+(x,) for y in s), fields=_f)

        venn_options = {} # tune it here
        tracks = []
        intype = kw.get("input_type") or "Table"
        if intype == "Table":
            s_cols = kw.get('id_columns','')
            s_filters = kw.get('filters','')
            infile = track(kw.get('table',''),format='txt',header=True)
            col_ind = [int(i)-1 for i in s_cols.split(",")]
            legend = [infile.fields[i] if i<len(infile.fields) else str(i) for i in col_ind]
            conds = [_parse_logic(x) for x in s_filters.split(",")]
            tlabels = [chr(k+65) for k in range(len(col_ind))]
            conds += ["1"]*(len(col_ind)-len(conds))
            combn = [tuple(sorted(x)) for k in range(len(tlabels)) 
                     for x in combinations(tlabels,k+1)]
            c1 = dict(("|".join(c),0) for c in combn)
            c2 = dict(("|".join(c),0) for c in combn)
            indx = dict((c,[tlabels.index(x) for x in c]) for c in combn)
            for row in infile:
                tests = [_run_test(row,i,c) for i,c in enumerate(conds)]
                for c in combn:
                    c1["|".join([tlabels[n] for n,t in enumerate(tests) if t])] += 1
                    c2["|".join(c)] += all([tests[i] for i in indx[c]])
            nsamples = len(col_ind)
            combn = ['|'.join(y) for x in combn for y in x]
        elif intype == "Tracks":
            #filenames = kw['TrMulti']['files']
            filenames = kw['files']
            if not isinstance(filenames,(list,tuple)): filenames = [filenames]
            for f in filenames: assert os.path.exists(f), "File not found: %s ." % f
            tracks = [track(f,chrmeta='guess') for f in filenames]
            nsamples = len(tracks)
            tlabels = [chr(k+65) for k in range(len(tracks))]
            combn = [combinations(tlabels,k+1) for k in range(len(tlabels))]
            combn = ['|'.join(sorted(y)) for x in combn for y in x]
            c1 = dict(zip(combn,[0]*len(combn)))
            c2 = dict(zip(combn,[0]*len(combn)))
            total_cov = 0.0
            _scored = (kw.get('type') == 'score')
            chromset = set([c for t in tracks for c in t.chrmeta])
            for chrom in chromset:
                streams = [_add_label(t.read(chrom),tlabels[n]) for n,t in enumerate(tracks)]
                s = cobble(concatenate(streams),scored=_scored)
                name_idx = s.fields.index('track_name')
                start_idx = s.fields.index('start')
                end_idx = s.fields.index('end')
                if _scored: score_idx = s.fields.index('score')
                for x in s:
                    length = x[end_idx]-x[start_idx]
                    total_cov += length
                    sub = sorted(list(set(x[name_idx].split('|')))) # avoid 'A|A'
                    cb = [combinations(sub,k) for k in range(1,len(sub)+1)]
                    cb = ['|'.join(sorted(y)) for c in cb for y in c]
                    if _scored:
                        c1['|'.join(sub)] += x[score_idx]
                        for c in cb: c2[c] += x[score_idx]
                    else:
                        c1['|'.join(sub)] += length
                        for c in cb: c2[c] += length
            if total_cov < 1:
                output = self.temporary_path(fname='venn_summary.txt')
                with open(output,'wb') as summary:
                    summary.write("Empty content (no coverage) on %s." %(",".join(chromset)))
                self.new_file(output, 'venn_summary')
                return
            legend = [t.name for t in tracks]
            if _scored:
                for c in combn:
                    c2[c] = round(c2[c])
            else:
                for c in combn:
                    c2[c] = round((100*c2[c])/total_cov)
                    c1[c] = (100*c1[c])/total_cov
        else:
            raise ValueError("Input type '%s' not supported." %intype)


        if nsamples <= 4:
            format = kw.get('output') or 'pdf'
            output = self.temporary_path(fname='venn_diagram.'+format)
            venn(c2,legend=legend,options=venn_options,output=output,format=format)
            self.new_file(output, 'venn_diagram')

        # Text summary
        output = self.temporary_path(fname='venn_summary.txt')
        with open(output,'w') as summary:
            summary.write("%s\t%s\t%s\n" % ("Group","Coverage", "Cumulative coverage"))
            record = "%s\t%.2f\t%d\n"
            for c in sorted(combn, key=lambda x:(len(x),x)):
                summary.write(record%(c,c1[c],c2[c]))
        self.new_file(output, 'venn_summary')
        return self.display_time()
Example #8
    def __call__(self, **kw):
        def _parse_logic(string):
            s = re.sub(r'[^\w\d!=><\. ]', '', string)
            s = re.sub(r' OR ', ')or(%f ', s)
            s = re.sub(r' AND ', ')and(%f ', s)
            return "(%f " + s + ")"

        def _run_test(row, indx, cond):
            num = float(row[col_ind[indx]])
            num = max(-sys.maxint, min(sys.maxint, num))
            num = (num, ) * cond.count("%f")
            return eval(cond % (num))

        def _add_label(s, x):
            _f = s.fields + ['track_name']
            return FeatureStream((y + (x, ) for y in s), fields=_f)

        venn_options = {}  # tune it here
        tracks = []
        intype = kw.get("input_type") or "Table"
        if intype == "Table":
            s_cols = kw.get('id_columns', '')
            s_filters = kw.get('filters', '')
            infile = track(kw.get('table', ''), format='txt', header=True)
            col_ind = [int(i) - 1 for i in s_cols.split(",")]
            legend = [
                infile.fields[i] if i < len(infile.fields) else str(i)
                for i in col_ind
            ]
            conds = [_parse_logic(x) for x in s_filters.split(",")]
            tlabels = [chr(k + 65) for k in range(len(col_ind))]
            conds += ["1"] * (len(col_ind) - len(conds))
            combn = [
                tuple(sorted(x)) for k in range(len(tlabels))
                for x in combinations(tlabels, k + 1)
            ]
            c1 = dict(("|".join(c), 0) for c in combn)
            c2 = dict(("|".join(c), 0) for c in combn)
            indx = dict((c, [tlabels.index(x) for x in c]) for c in combn)
            for row in infile:
                tests = [_run_test(row, i, c) for i, c in enumerate(conds)]
                for c in combn:
                    c1["|".join([tlabels[n] for n, t in enumerate(tests)
                                 if t])] += 1
                    c2["|".join(c)] += all([tests[i] for i in indx[c]])
            nsamples = len(col_ind)
            combn = ['|'.join(y) for x in combn for y in x]
        elif intype == "Tracks":
            filenames = kw['TrMulti']['files']
            if not isinstance(filenames, (list, tuple)):
                filenames = [filenames]
            for f in filenames:
                assert os.path.exists(f), "File not found: %s ." % f
            tracks = [track(f, chrmeta='guess') for f in filenames]
            nsamples = len(tracks)
            tlabels = [chr(k + 65) for k in range(len(tracks))]
            combn = [combinations(tlabels, k + 1) for k in range(len(tlabels))]
            combn = ['|'.join(sorted(y)) for x in combn for y in x]
            c1 = dict(zip(combn, [0] * len(combn)))
            c2 = dict(zip(combn, [0] * len(combn)))
            total_cov = 0.0
            _scored = (kw.get('type') == 'score')
            chromset = set([c for t in tracks for c in t.chrmeta])
            for chrom in chromset:
                streams = [
                    _add_label(t.read(chrom), tlabels[n])
                    for n, t in enumerate(tracks)
                ]
                s = cobble(concatenate(streams), scored=_scored)
                name_idx = s.fields.index('track_name')
                start_idx = s.fields.index('start')
                end_idx = s.fields.index('end')
                if _scored: score_idx = s.fields.index('score')
                for x in s:
                    length = x[end_idx] - x[start_idx]
                    total_cov += length
                    sub = sorted(list(set(
                        x[name_idx].split('|'))))  # avoid 'A|A'
                    cb = [combinations(sub, k) for k in range(1, len(sub) + 1)]
                    cb = ['|'.join(sorted(y)) for c in cb for y in c]
                    if _scored:
                        c1['|'.join(sub)] += x[score_idx]
                        for c in cb:
                            c2[c] += x[score_idx]
                    else:
                        c1['|'.join(sub)] += length
                        for c in cb:
                            c2[c] += length
            if total_cov < 1:
                output = self.temporary_path(fname='venn_summary.txt')
                with open(output, 'wb') as summary:
                    summary.write("Empty content (no coverage) on %s." %
                                  (",".join(chromset)))
                self.new_file(output, 'venn_summary')
                return
            legend = [t.name for t in tracks]
            if _scored:
                for c in combn:
                    c2[c] = round(c2[c])
            else:
                for c in combn:
                    c2[c] = round((100 * c2[c]) / total_cov)
                    c1[c] = (100 * c1[c]) / total_cov
        else:
            raise ValueError("Input type '%s' not supported." % intype)

        if nsamples <= 4:
            format = kw.get('format') or 'pdf'
            output = self.temporary_path(fname='venn_diagram.' + format)
            venn(c2,
                 legend=legend,
                 options=venn_options,
                 output=output,
                 format=format)
            self.new_file(output, 'venn_diagram')

        # Text summary
        output = self.temporary_path(fname='venn_summary.txt')
        with open(output, 'w') as summary:
            summary.write("%s\t%s\t%s\n" %
                          ("Group", "Coverage", "Cumulative coverage"))
            record = "%s\t%.2f\t%d\n"
            for c in sorted(combn, key=lambda x: (len(x), x)):
                summary.write(record % (c, c1[c], c2[c]))
        self.new_file(output, 'venn_summary')
        return self.display_time()
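
To make the filter mechanism in the Table branch concrete: _parse_logic sanitizes a user filter string and rewrites ' AND '/' OR ' into an eval-able template with one %f placeholder per operand, which _run_test then fills with the (clamped) cell value. A standalone sketch of the same transformation follows (the filter string and the cell value are made-up examples):

import re

def _parse_logic(string):
    # Keep only word characters, comparison operators, dots and spaces, then
    # turn ' OR '/' AND ' into ')or('/')and(' groups with a %f placeholder.
    s = re.sub(r'[^\w\d!=><\. ]', '', string)
    s = re.sub(r' OR ', ')or(%f ', s)
    s = re.sub(r' AND ', ')and(%f ', s)
    return "(%f " + s + ")"

cond = _parse_logic(">0.5 AND <2")      # -> "(%f >0.5 )and(%f <2)"
value = 1.3                             # a cell value read from the table
print eval(cond % ((value,) * cond.count("%f")))  # True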
Example #9
def score_by_feature(trackScores, trackFeatures, method="mean"):
    """
    For every feature from *trackFeatures*, get the list of all scores it contains
    and apply an operation *method* on this list (by default, scores are averaged).
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream` if necessary).
    The output is a stream similar to *trackFeatures* but with an additional `score` field
    for each stream in *trackScores*::
        method = 'mean':

        X: ------##########--------------##########------
        Y: ___________666666666__________6666666666______
        R: ______[   3.   ]______________[   6.   ]______


        method = 'sum':

        X : ------##########--------------##########------
        Y1: ___________666666666__________6666666666______
        Y2: ___222222_____________________333_____________
        R : ______[  30,6  ]______________[  60,9  ]______

    :param trackScores: (list of) one or several -sorted- score track(s) (FeatureStream).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
    :param method: (str or function) operation applied to the list of scores from one feature.
        Can be one of 'sum','mean','median','min','max', or a custom function.
    :rtype: FeatureStream
    """

    def _stream(ts, tf):
        X = [common.sentinelize(x, [sys.maxint] * len(x.fields)) for x in ts]
        S = [[(-sys.maxint, -sys.maxint, 0.0)] for t in ts]
        start_idx = tf.fields.index("start")
        end_idx = tf.fields.index("end")
        if hasattr(method, "__call__"):
            mean_fn = lambda scores, denom: method(scores)
        else:
            mean_fn = _score_functions.get(method, _arithmetic_mean)
        for y in tf:
            ystart = y[start_idx]
            yend = y[end_idx]
            scores = ()
            for i in range(len(ts)):
                xnext = S[i][-1]
                # Load into S all score items which intersect feature y
                while xnext[0] < yend:
                    xnext = X[i].next()
                    if xnext[1] > ystart:
                        S[i].append(xnext)
                n = 0
                while S[i][n][1] <= ystart:
                    n += 1
                S[i] = S[i][n:]
                scores_y = []
                for s in S[i]:
                    if yend <= s[0]:
                        continue
                    if s[0] < ystart:
                        start = ystart
                    else:
                        start = s[0]
                    if yend < s[1]:
                        end = yend
                    else:
                        end = s[1]
                    scores_y.extend([s[2]] * (end - start))
                scores += (mean_fn(scores_y, 1.0 / (yend - ystart)),)
            yield tuple(y) + scores

    if not (isinstance(trackScores, (list, tuple))):
        trackScores = [trackScores]
    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if len(trackScores) > 1 or "score" in trackFeatures.fields:
        _fields = ["score" + str(i) for i in range(len(trackScores))]
    else:
        _fields = ["score"]
    _ts = [common.reorder(t, ["start", "end", "score"]) for t in trackScores]
    return FeatureStream(_stream(_ts, trackFeatures), trackFeatures.fields + _fields)
Example #10
def filter_scores(trackScores, trackFeatures, method="sum", strict=False, annotate=False, flatten=common.cobble):
    """
    Extract from *trackScores* only the regions overlapping *trackFeatures*'s regions.
    Warning: both score and feature streams must be sorted! (use `common.sorted_stream` if necessary).
    Example::

        X: _____#########__________#############_______
        Y: __________666666666___2222776_444___________
        R: __________6666__________22776_444___________

    Note: *trackFeatures* is :func:`cobbled <bbcflib.gfminer.common.cobble>` by default (to avoid
    score duplications). An alternative is :func:`fusion <bbcflib.gfminer.common.fusion>`, or nothing.
    If strand information is present in both *trackScores* and *trackFeatures*, only scores inside
    a region of the same strand are kept.

    :param trackScores: (FeatureStream) one -sorted- score track.
        If a list of streams is provided, they will be merged (using `merge_scores`).
    :param trackFeatures: (FeatureStream) one -sorted- feature track.
        If a list of streams is provided, they will be merged (using `concatenate`).
    :param method: (str) `merge_scores` *method* argument, in case *trackScores* is a list. ['sum']
    :param strict: (bool) if True, only score regions from *trackScores* that are
        strictly contained in a feature region of *trackFeatures* will be returned. [False]
    :param annotate: (bool) if True, supplementary annotation (and the corresponding fields)
        from *trackFeatures* will be added to the result. [False]
    :param flatten: (func) one of None, `common.fusion` or `common.cobble`.
        Function applied to *trackFeatures* beforehand. [common.cobble]
    :rtype: FeatureStream
    """

    def _stream(ts, tf):
        tf = common.sentinelize(tf, [sys.maxint] * len(tf.fields))
        info_idx = [k for k, f in enumerate(tf.fields) if f not in ts.fields]
        if stranded:
            ts_strand_idx = ts.fields.index("strand")
            tf_strand_idx = tf.fields.index("strand")
            same_strand = lambda x, y: x[ts_strand_idx] == y[tf_strand_idx]
        else:
            same_strand = lambda x, y: True
        Y = []
        ynext = (-sys.maxint, -sys.maxint, 0.0)
        for x in ts:
            xstart = x[0]
            xend = x[1]
            # Load into Y all feature items which intersect score x
            while ynext[0] < xend:
                if ynext[1] > xstart:
                    Y.append(ynext)
                ynext = tf.next()
            # Remove features that are far behind x
            if Y:
                n = 0
                try:
                    while Y[n][1] <= xstart:
                        n += 1
                    Y = Y[n:]
                except IndexError:
                    Y = [ynext]
            # Yield intersections
            for y in Y:
                if not same_strand(x, y):
                    continue
                info = tuple([y[k] for k in info_idx]) if annotate else ()
                if strict and (y[0] > xstart or y[1] < xend):
                    continue
                if y[0] >= xend:
                    continue  # keep for next iteration
                start = xstart if y[0] < xstart else y[0]
                end = xend if y[1] > xend else y[1]
                yield (start, end) + tuple(x[2:]) + info

    if isinstance(trackFeatures, (list, tuple)):
        trackFeatures = concatenate(trackFeatures)
    if isinstance(trackScores, (list, tuple)):
        trackScores = merge_scores(trackScores, method)
    _info_fields = [f for f in trackFeatures.fields if f not in trackScores.fields] if annotate else []
    stranded = "strand" in (set(trackScores.fields) & set(trackFeatures.fields))
    if flatten is None:
        _tf = trackFeatures
    else:
        _tf = flatten(trackFeatures, stranded=stranded)
    _ts = common.reorder(trackScores, ["start", "end"])
    _tf = common.reorder(_tf, ["start", "end"])
    return FeatureStream(_stream(_ts, _tf), _ts.fields + _info_fields)
Example #11
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of: a dictionary whose keys are the *group_id*s from the job groups (plus *macs* and *deconv* if applicable) and whose values are file description dictionaries, and a dictionary mapping *group_ids* to the *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                         chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1: options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1, read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] 
                                   for x in stream 
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), 
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution',  groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st}) 
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], 
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
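
A hypothetical invocation with the dictionary form of job_or_dict (all paths, group names, the assembly, and the use of a None MiniLIMS are placeholders/assumptions, not taken from the source):

# Hypothetical call sketch: run the workflow inside a bein execution.
from bein import execution
from bbcflib import genrep

job = {
    'groups': {1: {'name': 'IP',    'control': False},
               2: {'name': 'Input', 'control': True}},
    'files':  {1: {'run1': {'bam': 'ip.bam'}},       # placeholder bam paths
               2: {'run1': {'bam': 'input.bam'}}},
    'options': {'peak_deconvolution': False, 'run_meme': False},
}
assembly = genrep.Assembly('mm9')  # placeholder assembly
with execution(None) as ex:        # None: transient execution, nothing recorded
    processed = chipseq_workflow(ex, job, assembly)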
Example #12
def chipseq_workflow(ex,
                     job_or_dict,
                     assembly,
                     script_path='',
                     logfile=sys.stdout,
                     via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of: a dictionary whose keys are the *group_id*s from the job groups (plus *macs* and *deconv* if applicable) and whose values are file description dictionaries, and a dictionary mapping *group_ids* to the *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not ('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError(
            "job_or_dict must be a frontend.Job object or a dictionary with key 'groups'."
        )
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not (isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not (isinstance(mapped, dict)):
            raise TypeError(
                "Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'."
            )
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex,
                                                         mapped[k]["bam"],
                                                         via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh[group_name] = sum(ptruns) / len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {
        'macs':
        add_macs_results(ex,
                         read_length,
                         genome_size,
                         tests,
                         ctrlbam=controls,
                         name=names,
                         poisson_threshold=p_thresh,
                         macs_args=macs_args,
                         via=via)
    }
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([
                apply(track(processed['macs'][(name, x)] + "_summits.bed",
                            chrmeta=chrlist,
                            fields=_fields).read(selection=_select),
                      'name',
                      lambda __n, _n=xn: "%s:%i" % (__n, _n))
                for xn, x in enumerate(names['controls'])
            ])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name],
                           chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(
        options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        #            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(
                    ex,
                    m["bam"],
                    assembly.chrmeta,
                    nreads=m["stats"]["total"],
                    merge=-1,
                    read_extension=options['read_extension'],
                    convert=False,
                    b2w_args=b2w_args,
                    via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict(
                (s, merge_sql(ex, [x[s] for x in wig], via=via))
                for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0], ) + ((x[2] + x[1]) / 2 - 150,
                             (x[2] + x[1]) / 2 + 150) + x[3:]
                 for x in stream if "FERR=" in x[3]
                 and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)

        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1] + " deconvolution.\n")
            logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(ex, [
                    processed['macs'][(name, x)] + "_peaks.bed"
                    for x in names['controls']
                ],
                                             via=via)
            deconv = run_deconv(ex,
                                merged_wig[name[1]],
                                macsbed,
                                assembly.chrmeta,
                                options['read_extension'],
                                script_path,
                                via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist,
                       fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1] + '_peaks.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1] + '_deconv.sql',
                                              type='sql',
                                              step='deconvolution',
                                              groupId=name[0]))
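            # Also export the deconvolved profile as bigWig for genome browsers;
            # a failed conversion is only logged, not fatal.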
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1] + '_deconv.bw',
                                                  type='bigWig',
                                                  ucsc='1',
                                                  step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e))
                logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1] + '_deconv.pdf',
                                              type='pdf',
                                              step='deconvolution',
                                              groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
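    # Append the MACS statistics (from the *_peaks.xls files) to each peak row,
    # matching rows via the numeric ID embedded in the peak's name field.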
    def _join_macs(stream, xlsl, _f):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = (int(_n.split(";")[0][13:])
                              if _n[:3] == "ID=" else int(_n[10:]))
                        yield _p + xlsl[0][nb - 1][1:]
                    else:
                        nb = (_n.split(";")[0][13:]
                              if _n[:3] == "ID=" else _n[10:])
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:]

        return FeatureStream(_macs_row(stream), fields=_f)

    ##############################
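    # Annotate every peak list: join the peaks with their MACS statistics and,
    # when the assembly provides gene annotations, with the nearest gene(s),
    # then write, gzip and register one tab-delimited file per test group.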
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,
                       chrmeta=chrlist,
                       fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([
            processed['macs'][(name, _c)] + "_peaks.xls"
            for _c in names['controls']
        ])
        try:
            ###### If the assembly has no gene annotations we skip
            ###### getNearestFeature but still go through _join_macs.
            assembly.gene_track()
            _fields = (['chr', 'start', 'end', 'name', 'score', 'gene',
                        'location_type', 'distance'] +
                       ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:])
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join([
                'chromosome', 'start', 'end', 'info', 'peak_height', 'gene(s)',
                'location_type', 'distance'
            ] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(
                    _join_macs(
                        getNearestFeature(ptrack.read(selection=chrom), _feat),
                        xlsl, _fields),
                    mode='append')
        except ValueError:
            _fields = ['chr', 'start', 'end', 'name', 'score'
                       ] + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile,
                            format='txt',
                            chrmeta=chrlist,
                            fields=_fields)
            peakout.make_header("#" + "\t".join(
                ['chromosome', 'start', 'end', 'info', 'peak_height'] +
                _fields[5:]))
            for chrom in assembly.chrnames:
                peakout.write(
                    _join_macs(ptrack.read(selection=chrom), xlsl, _fields),
                    mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(
            track(peakfile + ".gz", format='txt', fields=_fields))
        ex.add(peakfile + ".gz",
               description=set_file_descr(name[1] + '_annotated_peaks.txt.gz',
                                          type='text',
                                          step='annotation',
                                          groupId=name[0]))
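    # Open every merged density as a signal track named '<group>_<suffix>'
    # (one per density suffix, presumably forward/reverse strand).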
    stracks = [
        track(wig, info={'name': name + "_" + st})
        for name, wigdict in merged_wig.iteritems()
        for st, wig in wigdict.iteritems()
    ]
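    # Start the combined quantification table: one column per MACS comparison
    # (test vs. control), followed by one column per signal track.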
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = [
            "MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
            for _s in names['tests'] for _c in names['controls']
        ]
        _tf.write("\t".join([
            '#chromosome',
            'start',
            'end',
        ] + _pnames + [s.name for s in stracks]) + "\n")
    #### need to do something about peak origin (split names, write to separate columns?)
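    # For each chromosome: tag every peak name with the index of its peak file,
    # pool all peak lists into one merged stream, sum each signal track over
    # each merged peak, and write one table row per merged peak.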
    for chrom in assembly.chrnames:
        pk_lst = [
            apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']),
                  'name',
                  lambda __n, _n=npt: "%s:%i" % (__n, _n))
            for npt, pt in enumerate(peakfile_list)
        ]
        features = fusion(
            concatenate(pk_lst,
                        fields=['chr', 'start', 'end', 'name'],
                        remove_duplicates=True,
                        group_by=['chr', 'start', 'end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
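        # Decode the ':'-tagged (and '|'-joined) peak names back into one
        # column per MACS comparison, so each row records which comparison(s)
        # the merged peak came from.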
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while (_k < len(_rnsplit) - 1 - int(_nc > 1)):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k - 1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0]) * _nc + int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(
                    str(tt) for tt in
                    row[:nidx] + tuple(pcols) + row[nidx + 1:]) + "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile + ".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
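        # Motif discovery with MEME on every peak list (up to 4 motifs, 'zoops'
        # model).  This relies on peak_list.keys() and peak_list.values()
        # returning entries in the same order, which holds as long as the dict
        # is not modified in between.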
        logfile.write("Starting MEME.\n")
        logfile.flush()
        processed['meme'] = parallel_meme(
            ex,
            assembly,
            peak_list.values(),
            name=peak_list.keys(),
            chip=True,
            meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'],
            via=via)
    return processed