Example #1
    def test_score_by_feature(self):
        features = fstream([('chr',5,15,'gene1'),('chr',30,40,'gene2')], fields=['chr','start','end','name'])
        scores1 = fstream([('chr',10,20,6.),('chr',30,40,6.)], fields=['chr','start','end','score'])
        scores2 = fstream([('chr',30,40,2.)], fields=['chr','start','end','score'])
        res = list(score_by_feature([scores1,scores2],features))
        expected = [('chr',5,15,'gene1',3.,0.),('chr',30,40,'gene2',6.,2.)]
        self.assertListEqual(res,expected)

        # method = sum (per-base totals, no normalization by feature length)
        features = fstream([('chr',5,15,'gene1'),('chr',30,40,'gene2')], fields=['chr','start','end','name'])
        scores1 = fstream([('chr',10,20,6.),('chr',30,40,6.)], fields=['chr','start','end','score'])
        scores2 = fstream([('chr',2,8,2.),('chr',30,33,3.)], fields=['chr','start','end','score'])
        res = list(score_by_feature([scores1,scores2],features,method=sum))
        expected = [('chr',5,15,'gene1',30.,6.),('chr',30,40,'gene2',60.,9.)]
        self.assertListEqual(res,expected)
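
The expected values follow from score_by_feature's per-base semantics: by default it reports the mean score over the whole feature (uncovered bases count as 0), while method=sum reports the per-base total. For gene1 (chr:5-15, 10 bp), scores1 (chr:10-20, score 6.0) overlaps 5 bp, so the mean is 6.0*5/10 = 3.0 and the sum is 6.0*5 = 30.0; in the second block, scores2 (chr:2-8, score 2.0) overlaps gene1 over 3 bp, giving a sum of 2.0*3 = 6.0.
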
Example #2
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        #        signals = [track(sig) for sig in siglist[gid]]
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed, format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                key = (r2[0], len(r2[1]))
                if key in fread: fread[key].append(r[1:3])
                else: fread[key] = [r[1:3]]
            for motif, regs in fread.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * _plot_flank[1],
                                               len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(
                    segment_features(FeatureStream(regs,
                                                   fields=['start', 'end']),
                                     nbins=motif[1],
                                     upstream=_plot_flank,
                                     downstream=_plot_flank))
                for t in score_by_feature(
                    [s.read(chrom) for s in signals[gid]], tFeat):
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)
        for motif, dat in data.iteritems():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            X = range(-_plot_flank[1], _plot_flank[1] + nbins)
            for k in range(nbins):
                X[k + _plot_flank[1]] = str(k + 1)
            ####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4, 2],
                     output=files[gid]['pdf'],
                     new=new,
                     last=(last == 0),
                     legend=snames,
                     main=mname)
            new = False
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x)
                                                for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
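
Stripped of the motif bookkeeping, the pattern above is: segment each region into bins plus flanking bins, then let score_by_feature average each signal per bin. A minimal sketch following the same calls (the module paths and the open signal track sig are assumptions):

    from numpy import zeros
    from bbcflib.track import FeatureStream
    from bbcflib.gfminer.common import sorted_stream
    from bbcflib.gfminer.stream import segment_features, score_by_feature

    regs = [(100, 160), (300, 360)]     # region coordinates on one chromosome
    nbins, flank = 3, (50, 1)           # 3 body bins, plus 1 flanking bin of 50bp on each side
    tFeat = sorted_stream(segment_features(FeatureStream(regs, fields=['start', 'end']),
                                           nbins=nbins, upstream=flank, downstream=flank))
    profile = zeros(nbins + 2 * flank[1])
    for t in score_by_feature([sig.read('chr1')], tFeat):
        profile[t[2]] += t[3]           # t[2] is the bin index, t[3] the bin's mean score
    profile /= len(regs)                # average over regions, as `dat /= numregs` above
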
Example #3
 def quantify(self,**kw):
     feature_type = kw.get('feature_type', 0)
     if str(feature_type) in [str(x[0]) for x in ftypes]:
         feature_type = int(feature_type)
     func = str(kw.get('score_op', 'mean'))
     assembly_id = kw.get('assembly')
     format = kw.get('output') or 'txt'
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
         exons = assembly.exon_track
     elif not(feature_type in ftypes[3]):
         raise ValueError("Please specify an assembly")
     #signals = kw['SigMulti'].get('signals',[])
     signals = kw.get('signals',[])
     if not isinstance(signals, list): signals = [signals]
     signals = [track(sig, chrmeta=chrmeta) for sig in signals]
     if feature_type in ftypes[0]:
         features = genes
     elif feature_type in ftypes[1]:
         prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                      'after_start': int(kw.get('downstream') or prom_down_def),
                      'on_strand': True}
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type in ftypes[2]:
         features = exons
     elif feature_type in ftypes[3]:
         assert os.path.exists(str(kw.get('features'))), "Features file not found: '%s'" % kw.get("features")
         _t = track(kw['features'], chrmeta=chrmeta)
         chrmeta = _t.chrmeta
         features = _t.read
     else:
          raise ValueError("feature_type must be one of %s." % (ftypes,))
     output = self.temporary_path(fname='quantification.'+format)
     if len(signals) > 1:
         _f = ["score%i"%i for i in range(len(signals))]
     else:
         _f = ["score"]
     tout = track(output, format, fields=['chr','start','end','name']+_f,
                  chrmeta=chrmeta, info={'datatype':'qualitative'})
     if format == 'txt': 
         header = ['#chr','start','end','name']+[s.name for s in signals]
         tout.make_header("\t".join(header))
     for chrom in chrmeta:
         sread = [sig.read(chrom) for sig in signals]
         tout.write(score_by_feature(sread, features(chrom), method=func),
                    chrom=chrom, clip=True, mode="append")
     return output
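
Reduced to its core, quantify streams every chromosome of each signal through score_by_feature against the selected features and appends the result to the output track. A minimal single-signal sketch (file names hypothetical, formats assumed to be handled by bbcflib.track):

    from bbcflib.track import track
    from bbcflib.gfminer.stream import score_by_feature

    feats = track('features.bed', chrmeta='guess')
    sig = track('signal.sql', chrmeta=feats.chrmeta)
    out = track('quantification.txt', format='txt', chrmeta=feats.chrmeta,
                fields=['chr', 'start', 'end', 'name', 'score'])
    for chrom in feats.chrmeta:
        out.write(score_by_feature([sig.read(chrom)], feats.read(chrom), method='mean'),
                  chrom=chrom, clip=True, mode='append')
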
Example #4
def summed_feature_matrix(trackScores,trackFeatures,method='mean',**kw):
    """
    Each feature in *trackFeatures* is segmented into bins using bbcflib.gfminer.stream.segment_features
    (with parameters passed from *\*\*kw*).
    This creates a matrix with a column for each track in *trackScores* and a row for each bin in the segmented features.
    The value of a matrix entry is the score from one track in *trackScores* in one bin, summed over all features.


    Example::

                      gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features, nbins=3)
        Y: _____________666|66666________666|666|666_____
        Z: _____22222|22222|22222________________________

             Y   Z
        R: [[3.  1.],   # bin 0
            [4.  1.],   # bin 1
            [6.  1.]]   # bin 2

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of 'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: numpy.ndarray, int (number of features)
    """
    nfields = len(trackFeatures.fields)
    trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    else:
        nscores = 1
    nbins = kw.get('nbins',segment_features.func_defaults[0]) \
            + kw.get('upstream',(0,0))[1] \
            + kw.get('downstream',(0,0))[1]
    averages = numpy.zeros(shape=(nbins,nscores))
    ntot = -1
    for ntot,x in enumerate(all_means):
        averages[x[nfields]] += x[(nfields+1):]
    return averages, (ntot+1)/nbins
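
Hypothetical usage matching the docstring's X/Y/Z sketch, assuming the three tracks are open with bbcflib.track:

    averages, nfeat = summed_feature_matrix([Y.read(), Z.read()], X.read(),
                                            method='mean', nbins=3)
    # averages.shape == (3, 2): one row per bin, one column per score track;
    # nfeat is the number of features seen (2 in the docstring sketch)
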
Example #5
 def __call__(self, **kw):
     feature_type = int(kw.get('feature_type') or 0)
     assembly_id = kw.get('assembly') or None
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
         exons = assembly.exon_track
     elif not (feature_type == 3):
         raise ValueError("Please specify an assembly")
     signals = kw.get('SigMulti', {}).get('signals', [])
     if not isinstance(signals, list): signals = [signals]
     signals = [track(sig, chrmeta=chrmeta) for sig in signals]
     snames = [sig.name for sig in signals]
     if feature_type == 0:  #bodies
         features = genes
     elif feature_type == 1:  #promoters
         prom_pars = {
             'before_start': int(kw.get('upstream') or prom_up_def),
             'after_start': int(kw.get('downstream') or prom_down_def),
             'on_strand': True
         }
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type == 2:  #exons
         features = exons
     elif feature_type == 3:  #custom track
         _t = track(kw.get('features'), chrmeta=chrmeta)
         chrmeta = _t.chrmeta
         features = _t.read
     else:
         raise ValueError("Feature type not known: %i" % feature_type)
     highlights = kw.get('HiMulti', {}).get('highlights', [])
     if not isinstance(highlights, list): highlights = [highlights]
     if highlights is not None:
         highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
         hinames = [t.name for t in highlights]
     pdf = self.temporary_path(fname='plot_pairs.pdf')
     narr = None
     set_index = []
     set_labels = []
     if int(kw['mode']) == 0:  #correl
         cormax = int(kw.get('cormax') or _cormax)
         xarr = array(range(-cormax, cormax + 1))
         srtdchrom = sorted(chrmeta.keys())
         features = [
             x[:3] for chrom in srtdchrom
             for x in sorted_stream(features(chrom))
         ]
         _f = ['chr', 'start', 'end', 'score']
         narr = correlation([s.read(fields=_f) for s in signals], features,
                            (-cormax, cormax), True)
     elif int(kw['mode']) == 1:  #density
         xarr = None
         for chrom in chrmeta:
             feat = features(chrom)
             if 'name' not in feat.fields:
                 feat = add_name_field(feat)
             means = score_by_feature([s.read(chrom) for s in signals],
                                      feat)
             mf = means.fields[len(feat.fields):]
             _n, _l = score_array(means, mf)
             if _n.size == 0: continue
             if narr is None: narr = _n
             else: narr = vstack((narr, _n))
         set_index = [narr.shape[0]]
         for hitrack in highlights:
             for chrom in chrmeta:
                 hiread = hitrack.read(chrom)
                 if 'name' not in hiread.fields:
                     hiread = add_name_field(hiread)
                 means = score_by_feature([s.read(chrom) for s in signals],
                                          hiread)
                 mf = means.fields[len(hiread.fields):]
                 _n, _l = score_array(means, mf)
                 if _n.size == 0: continue
                 narr = vstack((narr, _n))
                 set_labels.extend(_l)
             set_index.append(narr.shape[0])
     else:
         raise ValueError("Mode not implemented: %s" % kw['mode'])
     if narr is None:
         raise ValueError("No data")
     pairs(narr,
           xarr,
           labels=snames,
           output=pdf,
           highlights=[set_index, set_labels])
     self.new_file(pdf, 'plot_pairs')
     return self.display_time()
Example #6
    def __call__(self, **kw):
        feature_type = int(kw.get("feature_type") or 0)
        individual = kw.get("individual", False)
        if isinstance(individual, basestring):
            individual = individual.lower() in ["1", "true", "t", "on"]
        if individual and int(kw["mode"]) != 1:
            raise ValueError("Only correlation plots can work with the 'individual' option.")

        assembly_id = kw.get("assembly") or None
        chrmeta = "guess"
        if assembly_id:
            assembly = genrep.Assembly(assembly_id)
            chrmeta = assembly.chrmeta
            genes = assembly.gene_track
            exons = assembly.exon_track
        elif not (feature_type == 3):
            raise ValueError("Please specify an assembly")
        # signals = kw.get('SigMulti',{}).get('signals', [])
        signals = kw.get("signals", [])
        if not isinstance(signals, list):
            signals = [signals]
        signals = [track(sig, chrmeta=chrmeta) for sig in signals]
        snames = [sig.name for sig in signals]
        if feature_type == 0:  # bodies
            features = genes
        elif feature_type == 1:  # promoters
            prom_pars = {
                "before_start": int(kw.get("upstream") or prom_up_def),
                "after_start": int(kw.get("downstream") or prom_down_def),
                "on_strand": True,
            }
            features = lambda c: neighborhood(genes(c), **prom_pars)
        elif feature_type == 2:  # exons
            features = exons
        elif feature_type == 3:  # custom track
            _t = track(kw.get("features"), chrmeta=chrmeta)
            chrmeta = _t.chrmeta
            features = _t.read
        else:
            raise ValueError("Feature type not known: %i" % feature_type)
        # highlights = kw.get('HiMulti',{}).get('highlights', [])
        highlights = kw.get("highlights", [])
        if not isinstance(highlights, list):
            highlights = [highlights]
        if highlights is not None:
            highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
            hinames = [t.name for t in highlights]
        pdf = self.temporary_path(fname="plot_pairs.pdf")
        narr = None
        set_index = []
        set_labels = []
        _new = True
        if int(kw["mode"]) == 1:  # correl
            cormax = int(kw.get("cormax") or _cormax)
            xarr = array(range(-cormax, cormax + 1))
            _f = ["chr", "start", "end", "score"]
            features = [x[:3] for chrom in chrmeta for x in sorted_stream(features(chrom))]
            table = self.temporary_path(fname="table.txt")
            with open(table, "w") as t:
                t.write("\t".join(["chr", "start", "end", "max(correlation)", "lag_max"]) + "\n")
                if individual:
                    for nplot, feature in enumerate(features):
                        if narr is not None and nplot < _MAX_PLOTS_:
                            pairs(narr, xarr, labels=snames, output=pdf, new=_new, last=False)
                            _new = False
                        narr = correlation([s.read(fields=_f) for s in signals], [feature], (-cormax, cormax), True)
                        list_corr = list(narr[0][0])
                        max_corr = max(list_corr)
                        lag_max = list_corr.index(max_corr) - cormax
                        t.write("\t".join([str(x) for x in feature[:3] + (max_corr, lag_max)]) + "\n")
                else:
                    narr = correlation([s.read(fields=_f) for s in signals], features, (-cormax, cormax), True)
                    list_corr = list(narr[0][0])
                    max_corr = max(list_corr)
                    lag_max = list_corr.index(max_corr) - cormax
                    t.write("\t".join(["-", "-", "-"] + [str(max_corr), str(lag_max)]) + "\n")
        elif int(kw["mode"]) == 0:  # density
            xarr = None
            for chrom in chrmeta:
                feat = features(chrom)
                if "name" not in feat.fields:
                    feat = add_name_field(feat)
                means = score_by_feature([s.read(chrom) for s in signals], feat)
                mf = means.fields[len(feat.fields) :]
                _n, _l = score_array(means, mf)
                if _n.size == 0:
                    continue
                if narr is None:
                    narr = _n
                else:
                    narr = vstack((narr, _n))
            set_index = [narr.shape[0]]
            for hitrack in highlights:
                for chrom in chrmeta:
                    hiread = hitrack.read(chrom)
                    if "name" not in hiread.fields:
                        hiread = add_name_field(hiread)
                    means = score_by_feature([s.read(chrom) for s in signals], hiread)
                    mf = means.fields[len(hiread.fields) :]
                    _n, _l = score_array(means, mf)
                    if _n.size == 0:
                        continue
                    narr = vstack((narr, _n))
                    set_labels.extend(_l)
                set_index.append(narr.shape[0])
        else:
            raise ValueError("Mode not implemented: %s" % kw["mode"])
        if narr is None:
            raise ValueError("No data")
        pairs(narr, xarr, labels=snames, output=pdf, highlights=[set_index, set_labels], new=_new, last=True)
        if int(kw["mode"]) == 1:
            self.new_file(table, "table")
        self.new_file(pdf, "plot_pairs")
        return self.display_time()
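
In the 'individual' branch above, narr[0][0] holds one correlation value per lag in -cormax..cormax, so lag_max = list_corr.index(max_corr) - cormax converts the argmax index back into a signed lag: with cormax = 5, a maximum at index 7 means a lag of +2.
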
Example #7
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.
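    For instance, a threshold *T* = 2 gives *(min(30,5*3), 50*3)* = *(15,150)*, i.e. a ``'-m'`` of '15,150'.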

    Returns a tuple: a dictionary with keys *group_id* from the job groups (plus *macs* and *deconv* if applicable) whose values are file description dictionaries, and a dictionary mapping *group_ids* to the *names* used in file descriptions.
"""
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k,f in futures.iteritems():
            mapped[k]['stats'] = f.wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
    genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
## select only peaks with p-value <= 10^(-0.6) ≈ 0.25, i.e. score = -10*log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                         chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1: options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100: options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#            if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1, read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] 
                                   for x in stream 
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), 
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution',  groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']])
        try:
###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st}) 
               for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], 
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
Example #8
def feature_matrix(trackScores,trackFeatures,segment=False,method='mean',**kw):
    """
    Return an array with as many rows as there are features in *trackFeatures*, and as many columns
    as there are score tracks in *trackScores*. Each element of the matrix thus corresponds to the
    (average) score of one genomic feature in one score track.

    If *segment* is True, each feature will be segmented into bins using
    bbcflib.gfminer.stream.intervals.segment_features (additional parameters in *\*\*kw* will be passed to this function).
    Then each element of the array is itself an array with *nbins* rows and one column for each track in *trackScores*.

    If *segment* is False, then each element of the array is an array with one element for each track in *trackScores*.

    Example::

                      gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features)
        Y: _____________666|66666________666|666|666_____  (scores1)
        Z: _____22222|22222|22222________________________  (scores2)

        With segment=True, nbins=3:

              Y   Z
        R: [[[0.  2.],    # bin0 \
             [2.  2.],    # bin1  } gene 1
             [6.  2.]],   # bin2 /
            [[6.  0.],    # bin0 \
             [6.  0.],    # bin1  } gene2
             [6.  0.]]]   # bin2 /

        With segment=False:

              Y   Z
        R:  [[3.  2.]
             [6.  0.]]

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param segment: (bool) segment each feature into bins.[False]
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of 'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: tuple (numpy.ndarray of strings, numpy.ndarray of floats)
    """
    nbins = 1
    nscores = 1
    if segment:
        trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
        nbins = kw.get('nbins',segment_features.func_defaults[0]) \
                + kw.get('upstream',(0,0))[1] \
                + kw.get('downstream',(0,0))[1]
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    nfields = len(trackFeatures.fields)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    scores_dict = {}
    if segment:
        empty_mat = numpy.zeros(shape=(nbins,nscores))
    else:
        empty_mat = numpy.zeros(nscores)
    name_idx = all_means.fields.index('name')
    for t in all_means:
        _n = t[name_idx]
        scores_dict.setdefault(_n, empty_mat.copy())
        if segment:
            scores_dict[_n][t[nfields-1]] = t[nfields:]
        else:
            scores_dict[_n] = t[nfields:]
    feat_names = numpy.array(scores_dict.keys())
    scores_mat = numpy.array(scores_dict.values())
    return (feat_names,scores_mat)
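
Hypothetical usage mirroring the docstring, assuming X, Y and Z are open tracks:

    names, scores = feature_matrix([Y.read(), Z.read()], X.read(), segment=True, nbins=3)
    # names.shape == (2,), scores.shape == (2, 3, 2): per feature, per bin, per score track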