Example #1
    def test_fusion(self):
        stream = fstream([('chr1',10,15,'A',1),('chr1',13,18,'B',-1),('chr1',18,25,'C',-1)],
                         fields = ['chr','start','end','name','strand'])
        expected = [('chr1',10,18,'A|B',0),('chr1',18,25,'C',-1)]
        res = fusion(stream)
        self.assertEqual(list(res),expected)

        # stranded = True
        stream = fstream([('chr1',10,15,'A',1),('chr1',13,18,'B',-1),('chr1',15,25,'C',-1)],
                         fields = ['chr','start','end','name','strand'])
        expected = [('chr1',10,15,'A',1),('chr1',13,25,'B|C',-1)]
        res = fusion(stream, stranded=True)
        self.assertEqual(list(res),expected)
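
The merging rule this test encodes can be sketched in a few lines of plain Python: overlapping features fuse, names are joined with '|', and the strand is kept only when both agree (0 otherwise); with stranded=True only same-strand features merge. An illustrative re-implementation on bare tuples, not the library's fusion:

def fuse(features, stranded=False):
    # features: (chr, start, end, name, strand) tuples, sorted by start
    out = []
    for f in features:
        if out:
            chrom, start, end, name, strand = out[-1]
            if (f[0] == chrom and f[1] < end
                    and (not stranded or strand == f[4])):
                # extend the previous feature instead of emitting a new one
                out[-1] = (chrom, start, max(end, f[2]), name + '|' + f[3],
                           strand if strand == f[4] else 0)
                continue
        out.append(f)
    return out

print(fuse([('chr1',10,15,'A',1),('chr1',13,18,'B',-1),('chr1',18,25,'C',-1)]))
# [('chr1', 10, 18, 'A|B', 0), ('chr1', 18, 25, 'C', -1)]

Note that touching intervals (start equal to the previous end) do not merge, which is exactly why C stays separate in the first expected result.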
Example #2
def combine(trackList, fn, win_size=1000, aggregate=None):
    """
    Applies a custom function to a list of tracks, such as union, intersection,
    etc., and return a single result track. The input streams need to be ordered
    w.r.t 'chr', 'start' and 'end'. To be applied chromosome by chromosome.

    Only fields of the first track are kept. Values for a common field are
    merged by default according to `common.strand_merge`,`common.no_merge` and `common.generic_merge`,
    respectively for strand, chromosome and all others.

    :param trackList: list of FeatureStream objects.
    :param fn: boolean function to apply, such as bbcflib.gfminer.stream.union.
    :param win_size: (int) window size, in bp.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all trackList's values for this field in order
        to merge them. E.g. ``{'score': lambda x: sum(x)/len(x)}`` will return the average of
        all *trackList*'s scores in the output.
    :rtype: FeatureStream
    """
    aggregate = dict(aggregate or {}) # copy: avoid mutating a shared default dict
    aggregate.setdefault('strand',common.strand_merge)
    aggregate.setdefault('chr',common.no_merge)
    _f = ['start','end']
    if all('chr' in t.fields for t in trackList):
        _f += ['chr']
    if isinstance(fn,str): fn = eval(fn) # can type "combine(...,fn='intersection')"
    trackList = [common.cobble(common.reorder(t,fields=_f)) for t in trackList]
    return common.fusion(FeatureStream(_combine(trackList,fn,win_size,aggregate),
                                       fields=trackList[0].fields))
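
The aggregate mechanism boils down to applying one merge function per field across the per-track values of a common region. A minimal, library-free sketch (merge_fields is a hypothetical helper; defaulting to the first track's value stands in for the library's generic merge):

def merge_fields(rows, fields, aggregate):
    # rows: one tuple per track, all covering the same region
    merged = []
    for i, field in enumerate(fields):
        values = [r[i] for r in rows]
        fn = aggregate.get(field, lambda v: v[0])
        merged.append(fn(values))
    return tuple(merged)

rows = [('chr1', 10, 20, 4.0), ('chr1', 10, 20, 6.0)]
print(merge_fields(rows, ['chr','start','end','score'],
                   {'score': lambda x: sum(x)/len(x)}))
# ('chr1', 10, 20, 5.0)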
Example #3
def run_wellington(ex, tests, names, assembly, via, logfile):
    """Run Wellington footprinting per chromosome on each test sample and
    collect the per-chromosome outputs via non-blocking jobs."""
    futures = {}
    logfile.write("Running Wellington:\n")
    logfile.flush()
    wellout = {}
    for nbam, bed_bam in enumerate(tests):
        name = names['tests'][nbam]
        wellout[name] = []
        tbed = track(bed_bam[0])
        for chrom in assembly.chrnames:
            _chrombed = unique_filename_in()
            with track(_chrombed, format="bed", fields=tbed.fields) as _tt:
                if len(bed_bam) > 2:
                    _neighb = neighborhood(tbed.read(chrom),
                                           before_start=bed_bam[2],
                                           after_end=bed_bam[2])
                else:
                    _neighb = tbed.read(chrom)
                _tt.write(fusion(_neighb), clip=True)
            if os.path.getsize(_chrombed) > 0:
                futures[(chrom, name)] = wellington.nonblocking(ex,
                                                                _chrombed,
                                                                bed_bam[1],
                                                                via=via,
                                                                memory=8)
    for chro_name, _fut in futures.iteritems():
        chrom, name = chro_name
        logfile.write(name[1] + " " + chrom + ", ")
        logfile.flush()
        wellout[name].append(_fut.wait())
    logfile.write("\n")
    logfile.flush()
    bedlist = save_wellington(ex, wellout, assembly.chrmeta)
    return bedlist
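
The submit-then-collect pattern above (fill a futures dict keyed by (chrom, name), then wait on each) is independent of bein. A standalone sketch with concurrent.futures, where run_one is a hypothetical stand-in for wellington.nonblocking:

from concurrent.futures import ThreadPoolExecutor

def run_one(chrom, name):
    return "%s_%s.out" % (name, chrom)  # placeholder job

chrnames = ['chr1', 'chr2']
samples = ['sampleA']
with ThreadPoolExecutor(max_workers=4) as pool:
    # submit everything first, so jobs run concurrently
    futures = dict(((c, n), pool.submit(run_one, c, n))
                   for c in chrnames for n in samples)
# then block on each result, mirroring the _fut.wait() loop above
results = dict((key, fut.result()) for key, fut in futures.items())
print(results)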
Example #4
def total_coverage(t):
    # Nested helper: 'kwargs' comes from the enclosing scope.
    from bbcflib.gfminer.common import fusion
    kw = dict(kwargs)
    s = t.read(**kw)
    fs = fusion(s)  # merge overlaps so shared bases are counted once
    st_idx = fs.fields.index('start')
    en_idx = fs.fields.index('end')
    return sum(x[en_idx]-x[st_idx] for x in fs)
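
The same computation works on plain (start, end) tuples: fusing first (here, tracking the rightmost end seen so far) guarantees each covered base is counted once. A standalone sketch:

def plain_total_coverage(intervals):
    covered, last_end = 0, None
    for start, end in sorted(intervals):
        if last_end is None or start > last_end:
            covered += end - start      # disjoint interval: count in full
            last_end = end
        elif end > last_end:
            covered += end - last_end   # overlap: count only the new part
            last_end = end
    return covered

print(plain_total_coverage([(10, 15), (13, 18), (18, 25)]))  # 15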
Example #5
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a dictionary with key *macs* and, if applicable, *deconv* and *meme*, each mapping to its file description dictionaries.
    """
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k,f in futures.iteritems():
            mapped[k]['stats'] = f.wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-value <= 10^-0.6 ~ 0.25, i.e. score = -10*log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                         chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1: options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1, read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] 
                                   for x in stream 
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), 
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution',  groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st}) 
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], 
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
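
The _filter_deconv helper above is easy to exercise in isolation: parse FERR from the name field, keep rows at or below the cutoff, and recenter each peak on a 300 bp window around its midpoint. A standalone sketch with made-up rows:

import re

def filter_deconv(rows, pval):
    ferr = re.compile(r';FERR=([\d\.]+)$')
    for chrom, start, end, name in rows:
        m = ferr.search(name)
        if m and float(m.group(1)) <= pval:
            mid = (start + end) // 2
            yield (chrom, mid - 150, mid + 150, name)  # 300 bp window

rows = [('chr1', 100, 500, 'peak1;FERR=0.40'),
        ('chr1', 900, 1300, 'peak2;FERR=0.80')]
print(list(filter_deconv(rows, 0.65)))
# [('chr1', 150, 450, 'peak1;FERR=0.40')]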