예제 #1
0
 def __call__(self, **kw):
     """Smooth the input track with a sliding window and register the result.

     Keyword arguments read from ``kw``: ``track`` (input path), ``assembly``
     (passed as ``chrmeta``, ``None`` when empty), ``window_size`` and
     ``window_step`` (defaulting to ``size_def`` / ``step_def``) and
     ``by_feature`` (a boolean, or one of the strings '1', 'true', 't' in
     any letter case).

     Returns 1 after registering the output file as 'smoothed_track'.
     """
     source = track.track(kw.get('track'), chrmeta=kw.get('assembly') or None)
     window_size = int(kw.get('window_size', size_def))
     window_step = int(kw.get('window_step', step_def))
     by_feature = kw.get('by_feature', False)
     if isinstance(by_feature, basestring):
         # Textual flag: only these spellings switch the option on.
         by_feature = by_feature.lower() in ['1', 'true', 't']
     out_path = self.temporary_path(fname='smoothed_track', ext='sql')
     # Feature-wise smoothing keeps the input fields; otherwise a plain score track is written.
     out_fields, datatype = ((source.fields, "qualitative") if by_feature
                             else (["start", "end", "score"], "quantitative"))
     smoothed = track.track(out_path, fields=out_fields,
                            chrmeta=source.chrmeta,
                            info={'datatype': datatype})
     for chrom in smoothed.chrmeta.keys():
         stream = source.read(selection=chrom, fields=out_fields)
         smoothed.write(
             gm_stream.window_smoothing(stream, window_size=window_size,
                                        step_size=window_step,
                                        featurewise=by_feature),
             chrom=chrom)
     smoothed.close()
     self.new_file(out_path, 'smoothed_track')
     return 1
예제 #2
0
    def __call__(self, **kw):
        """Prepend a whole-chromosome track to the signals and combine them all.

        A temporary track is written with one feature per chromosome of
        ``chrmeta`` spanning 0 to its 'length', with every non-coordinate field
        of the first signal set to the string '0'.  Its path is inserted at
        the head of ``kw['signals']`` before ``_combine`` is called with
        ``self._func``.  The combined file is registered as 'combined'.

        Returns the value of ``self.display_time()``.
        """
        chrmeta = _get_chrmeta(**kw)
        first = track(kw['signals'][0])
        fields = first.fields
        extension = first.format
        has_chr = 'chr' in fields
        coord_fields = ('chr', 'start', 'end') if has_chr else ('start', 'end')
        # One '0' placeholder per field that is not a coordinate.
        padding = ('0',) * len([f for f in fields if f not in coord_fields])
        if has_chr:
            rows = [(name, 0, chrmeta[name]['length']) + padding
                    for name in chrmeta]
        else:
            # Put the coordinates first so that they line up with the rows built below.
            fields = ['start', 'end'] + [f for f in fields if f not in ['start', 'end']]
            rows = [(0, chrmeta[name]['length']) + padding
                    for name in chrmeta]
        background = FeatureStream(rows, fields=fields)
        temp = self.temporary_path() + '.' + extension
        with track(temp, fields=fields) as whole:
            whole.write(background)

        kw['signals'] = [temp] + kw['signals']
        output = _combine(self._func, self.temporary_path(fname='combined.'), **kw)
        self.new_file(output, 'combined')
        return self.display_time()
예제 #3
0
 def __call__(self, **kw):
     """Compute read densities from a BAM file and register the resulting sql tracks.

     Keyword arguments read from ``kw``: ``sample`` (BAM path, required),
     ``control`` (optional, forwarded as ``-c``), ``normalization``,
     ``merge_strands`` and ``read_extension`` (integers, -1 when empty).

     Returns 1 once every output has been registered as 'density_<suffix>'.
     """
     control = kw.get('control') or None
     b2wargs = ["-c", str(control)] if control is not None else []
     bamfile = track(kw['sample'], format='bam')
     nreads = int(kw.get('normalization') or -1)
     if nreads < 0 and control is None:
         # No explicit normalisation: count the distinct values found in column 4 of the reads.
         nreads = len(set(read[4] for read in bamfile.read()))
     elif nreads < 0:
         b2wargs.append("-r")
     merge_strands = int(kw.get('merge_strands') or -1)
     suffixes = ["merged"] if merge_strands >= 0 else ["fwd", "rev"]
     read_extension = int(kw.get('read_extension') or -1)
     output = self.temporary_path(fname='density_')
     with execution(None) as ex:
         files = bam_to_density(ex, kw['sample'], output,
                                nreads=nreads, merge=merge_strands,
                                read_extension=read_extension,
                                sql=True, args=b2wargs)
     for index, density_path in enumerate(files):
         density = track(density_path, format='sql',
                         fields=['start', 'end', 'score'],
                         chrmeta=bamfile.chrmeta,
                         info={'datatype': 'quantitative'})
         density.save()
         self.new_file(density_path, 'density_' + suffixes[index])
     return 1
예제 #4
0
    def __call__(self, **kw):
        """Merge forward and reverse strand densities after shifting them.

        Keyword arguments read from ``kw``: ``forward`` and ``reverse`` (track
        paths), ``assembly`` (passed as ``chrmeta``) and ``shift``.  The
        forward track is shifted by ``+shift`` and the reverse track by
        ``-shift`` before ``merge_scores`` is applied chromosome by
        chromosome.  A negative ``shift`` requests automatic detection from
        the cross-correlation of the two strands.

        Returns 1 after registering the output as 'density_merged'.

        Raises ValueError when no chromosome information is available, or
        when the shift cannot be detected automatically.
        """
        def _shift(stream, shift):
            """Return a stream equal to `stream` with `shift` added to 'start' and 'end'."""
            istart = stream.fields.index('start')
            iend = stream.fields.index('end')
            i1 = min(istart, iend)
            i2 = max(istart, iend)

            def _apply_shift(x):
                return x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2] + (x[i2] + shift,) + x[i2 + 1:]
            return track.FeatureStream((_apply_shift(x) for x in stream),
                                       fields=stream.fields)

        tfwd = track.track(kw.get('forward'), chrmeta=kw.get('assembly') or None)
        trev = track.track(kw.get('reverse'), chrmeta=kw.get('assembly') or None)
        if not kw.get('assembly'):  # btrack does the job, take the max of both chromosome lengths
            chrmeta = tfwd.chrmeta
            for k, v in trev.chrmeta.iteritems():
                chrmeta.setdefault(k, {})['length'] = max(v['length'], chrmeta.get(k, {}).get('length', 0))
        elif tfwd.chrmeta:
            chrmeta = tfwd.chrmeta  # For sql files, btrack doesn't make it,
        elif trev.chrmeta:
            chrmeta = trev.chrmeta  # so one can contain the info while the second does not.
        else:
            raise ValueError("Must specify an assembly.")  # In case nothing works - should not happen

        shiftval = int(kw.get('shift', 0))
        if shiftval < 0:  # Determine shift automatically
            shiftval = None
            xcor_lim = 300
            for chrom, v in chrmeta.iteritems():
                chrsize = v['length']
                xcor_lim = min(xcor_lim, 0.01 * chrsize)
                xcor = correlation([tfwd.read(chrom), trev.read(chrom)], regions=(1, chrsize),
                                   limits=(-xcor_lim, xcor_lim))
                max_xcor_idx = xcor.argmax()
                # Use the first chromosome whose correlation peak exceeds 0.2.
                if xcor[max_xcor_idx] > 0.2:
                    shiftval = (max_xcor_idx - xcor_lim - 1) / 2
                    break
            if not shiftval:
                raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

        output = self.temporary_path(fname='density_merged', ext='sql')
        fields = ['chr', 'start', 'end', 'score']
        tout = track.track(output, format='sql', fields=fields, chrmeta=chrmeta,
                           info={'datatype': 'quantitative'})
        mode = 'write'
        for chrom in chrmeta.keys():
            # `shiftval` is one number shared by all chromosomes; it must not be indexed by chromosome.
            tout.write(merge_scores([_shift(tfwd.read(selection=chrom), shiftval),
                                     _shift(trev.read(selection=chrom), -shiftval)]),
                       chrom=chrom, mode=mode, clip=True)
            mode = 'append'
        tout.close()
        trev.close()
        tfwd.close()
        self.new_file(output, 'density_merged')
        return 1
예제 #5
0
 def __call__(self, **kw):
     """Draw a pairwise plot of several signals over a set of features.

     Keyword arguments read from ``kw``: ``feature_type`` (0 gene bodies,
     1 promoters, 2 exons, 3 custom track given by ``features``),
     ``assembly``, ``signals`` (one path or a list), ``upstream`` /
     ``downstream`` (promoter window), and ``mode`` (0 correlation,
     1 density; required).

     Returns the value of ``self.display_time()`` after registering the pdf
     as 'plot_pairs'.

     Raises ValueError when an assembly is needed but missing, for an
     unknown feature type or mode, or when no data could be collected.
     """
     feature_type = int(kw.get('feature_type') or 0)
     assembly_id = kw.get('assembly') or None
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
         exons = assembly.exon_track
     elif feature_type != 3:
         # Only the custom track (type 3) brings its own features; genes,
         # promoters and exons all come from the assembly.
         raise ValueError("Please specify an assembly")
     signals = kw.get('signals', [])
     if not isinstance(signals, list): signals = [signals]
     snames = [os.path.splitext(os.path.basename(sig))[0] for sig in signals]
     signals = [track(sig, chrmeta=chrmeta) for sig in signals]
     if feature_type == 0: #bodies
         features = genes
     elif feature_type == 1: #promoters
         prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                      'after_start': int(kw.get('downstream') or prom_down_def),
                      'on_strand': True}
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type == 2: #exons
         features = exons
     elif feature_type == 3: #custom track
         _t = track(kw.get('features'), chrmeta=chrmeta)
         chrmeta = _t.chrmeta
         features = _t.read
     else:
         raise ValueError("Feature type not known: %i" % feature_type)
     pdf = self.temporary_path(fname='plot_pairs.pdf')
     narr = None
     if int(kw['mode']) == 0: #correl
         xarr = array(range(-cormax, cormax + 1))
         srtdchrom = sorted(chrmeta.keys())
         # Keep only the first three fields of every feature, chromosome by chromosome.
         features = [x[:3] for chrom in srtdchrom
                     for x in sorted_stream(features(chrom))]
         _f = ['chr', 'start', 'end', 'score']
         narr = correlation([s.read(fields=_f) for s in signals],
                            features, (-cormax, cormax), True)
     elif int(kw['mode']) == 1: #density
         xarr = None
         for chrom in chrmeta:
             feat = features(chrom)
             means = score_by_feature([s.read(chrom) for s in signals], feat)
             # The score columns are the fields added after those of the features.
             mf = means.fields[len(feat.fields):]
             _n, _l = score_array(means, mf)
             if _n.size == 0: continue
             if narr is None: narr = _n
             else:            narr = vstack((narr, _n))
     else:
         raise ValueError("Mode not implemented: %s" % kw['mode'])
     if narr is None:
         raise ValueError("No data")
     pairs(narr, xarr, labels=snames, output=pdf)
     self.new_file(pdf, 'plot_pairs')
     return self.display_time()
예제 #6
0
    def __call__(self, **kw):
        """Draw an MA-plot from a table of scores or from two groups of signal tracks.

        With ``input_type == 'Table'`` the file given by ``table`` is used
        directly; its first line is read as whitespace-separated column names
        and its first column is not counted as a score.  Otherwise
        ``signals1`` and ``signals2`` are quantified with
        ``QuantifyTablePlugin`` (score_op 'sum', format 'txt') and a scores
        table is written to a temporary file.

        Returns the value of ``self.display_time()`` after registering the
        plot as 'MA-plot'.

        Raises ValueError when the table has more than two score columns that
        do not fall into exactly two ``<group_name>.<run_id>`` groups.
        """
        if kw.get('input_type') == 'Table':
            table = kw.get('table')
            assert os.path.exists(str(table)), "File not found: '%s'" % table
            with open(table) as t:
                colnames = t.readline()
                _f = colnames.strip().split()
                nscores = len(_f)-1
            # Distinct <group_name> prefixes of the score columns, in order of
            # first appearance (a list, so that it can be counted and indexed).
            groups = []
            for x in _f[1:]:
                prefix = x.split('.')[0]
                if prefix not in groups:
                    groups.append(prefix)
            if nscores == 2: # 3 columns, cols 2 and 3 contain the scores
                sample1 = [2]
                sample2 = [3]
            elif len(groups) == 2: # more columns, look if there are two groups of prefixes
                # NOTE(review): these are 0-based positions in the header line, while
                # the two-column case above uses [2] and [3] - confirm what MAplot expects.
                sample1 = [i for i, x in enumerate(_f[1:], 1) if x.split('.')[0] == groups[0]]
                sample2 = [i for i, x in enumerate(_f[1:], 1) if x.split('.')[0] == groups[1]]
            else: # not implemented yet, ask the user to choose the columns he wants? Checkboxes...
                raise ValueError("For the moment, either have only 2 columns of scores, \
                                 or use names of the form <group_name>.<run_id>")
        else:
            # Use QuantifyTablePlugin to build a table from score tracks
            from QuantifyTable import QuantifyTablePlugin
            # Set QuantifyTablePlugin options
            kw['score_op'] = 'sum'
            kw['format'] = 'txt'
            signals1 = kw.get('signals1',[])
            signals2 = kw.get('signals2',[])
            if not isinstance(signals1,(list,tuple)): signals1 = [signals1]
            if not isinstance(signals2,(list,tuple)): signals2 = [signals2]
            kw['signals'] = signals1 + signals2
            signals = kw['signals']
            nscores = len(signals)
            qtable = QuantifyTablePlugin().quantify(**kw)
            # Remove useless fields and add header based on file names
            qtable = track(qtable, format='txt', fields=['chr','start','end','name']+['score'+str(i) for i in range(nscores)])
            table = self.temporary_path('scores_table.txt')
            _f = ['score'+str(i) for i in range(nscores)]
            strack = track(table, fields=['name']+_f)
            signal_tracks = [track(s) for s in signals]
            signames = [s.info.get('name',os.path.splitext(os.path.basename(s.path))[0]) for s in signal_tracks]
            # One header cell per signal, so that the header matches the number of score columns.
            strack.write([tuple(['Name'] + signames)])
            strack.write(qtable.read(fields=strack.fields))
            # NOTE(review): both ranges start at 0, so they overlap - confirm the
            # column numbering MAplot expects before relying on this branch.
            sample1 = range(len(signals1))
            sample2 = range(nscores-len(signals1))

        output_filename = MAplot(table, cols={1:sample1, 2:sample2})
        output = self.temporary_path(fname='maplot.png')
        shutil.move(output_filename,output)
        self.new_file(output, 'MA-plot')
        return self.display_time()
예제 #7
0
    def __call__(self, **kw):
        """Plot the signals found in each feature as heatmaps or line plots.

        Keyword arguments read from ``kw``: ``features`` (track path),
        ``signal`` (iterable of track paths) and ``mode`` (0 one heatmap per
        signal, 1 lines averaged over features, 2 one line plot per feature,
        limited by ``max_pages``).

        Returns 1 after registering the pdf as 'plot_features'.

        Raises ValueError when no data was collected or the mode is unknown.
        """
        features = track(kw.get('features'))
        signal = [track(path, chrmeta=features.chrmeta)
                  for path in kw.get('signal', [])]
        labels = data = None
        for chrom in features.chrmeta:
            chrom_labels, chrom_data = feature_matrix(
                [s.read(chrom) for s in signal],
                features.read(chrom), segment=True)
            if chrom_data.size == 0:
                continue
            if data is None:
                labels, data = chrom_labels, chrom_data
            else:
                labels = concatenate((labels, chrom_labels))
                data = vstack((data, chrom_data))
        pdf = self.temporary_path(fname='plot_features', ext='.pdf')
        if data is None:
            raise ValueError("No data")
        mode = kw['mode'] = int(kw.get('mode', 0))
        nsignals = data.shape[-1]
        if mode == 0:
            # All but the last heatmap leave the pdf open; only the first one starts it.
            for n in range(nsignals - 1):
                heatmap(data[:, :, n], output=pdf, new=(n == 0), last=False,
                        rows=labels, orderRows=True, orderCols=False)
            heatmap(data[:, :, -1], output=pdf, new=(nsignals <= 1), last=True,
                    rows=labels, orderRows=True, orderCols=False)
        elif mode == 1:
            X = range(data.shape[1])
            Y = data.mean(axis=0)
            lineplot(X, [Y[:, n] for n in range(nsignals)],
                     output=pdf, new=True, last=True)
        elif mode == 2:
            X = range(data.shape[1])
            grid = [4, 3]
            nplot = min(data.shape[0], max_pages * grid[0] * grid[1])
            for reg in range(nplot - 1):
                # The panel layout is only passed along with the very first plot.
                lineplot(X, [data[reg, :, n] for n in range(nsignals)],
                         output=pdf, new=(reg == 0), last=False,
                         mfrow=(grid if reg == 0 else []))
            lineplot(X, [data[nplot - 1, :, n] for n in range(nsignals)],
                     output=pdf, new=(nplot <= 1), last=True)
        else:
            raise ValueError("Mode not implemented: %s" % mode)
        self.new_file(pdf, 'plot_features')
        return 1
예제 #8
0
def _combine(func,output,**kw):
    """Combine the signal tracks of ``kw['signals']`` with `func` and write the result.

    The output path is `output` with ``kw['format']`` (default 'sql')
    appended.  Each chromosome of the chromosome metadata is combined
    separately and written with clipping enabled.

    Returns the path of the written track.
    """
    chrmeta = _get_chrmeta(**kw)
    output = output + kw.get('format', 'sql')
    sources = kw.get('signals', [])
    if not isinstance(sources, list):
        sources = [sources]
    tracks = [track(src, chrmeta=chrmeta) for src in sources]
    combined = track(output, chrmeta=chrmeta, info={'datatype':'qualitative'})
    for chrom in chrmeta:
        merged = combine([t.read(chrom) for t in tracks], fn=func)
        # The output fields are whatever the combined stream carries.
        combined.fields = merged.fields
        combined.write(merged, chrom=chrom, clip=True)
    combined.close()
    return output
예제 #9
0
 def test_subtract(self):
     """The subtract plugin must leave exactly one feature in its first output file."""
     self.subtract(**self.kw)
     expected = [('chr1',21,24,17.0)]
     with track(self.subtract.output_files[0][0]) as result:
         self.assertListEqual(list(result.read(fields=self.fields)), expected)
예제 #10
0
 def test_complement(self):
     """The complement plugin must report the three expected regions on chr1."""
     self.complement(**self.kw)
     expected = [('chr1',0,8,0.0),
                 ('chr1',19,21,0.0),
                 ('chr1',39,197195432,0.0)]
     with track(self.complement.output_files[0][0]) as result:
         self.assertListEqual(list(result.read('chr1',fields=self.fields)), expected)
예제 #11
0
 def test_intersect(self):
     """The intersect plugin must produce the two expected features."""
     self.intersect(**self.kw)
     expected = [('chr1',10,15,17.0),
                 ('chr1',24,35,107.0)]
     with track(self.intersect.output_files[0][0]) as result:
         self.assertListEqual(list(result.read(fields=self.fields)), expected)
예제 #12
0
 def test_quantify_table_text(self):
     """Quantifying two signals over the custom features must give a 9-line text table."""
     params = {'input_type':'Signal',
               'signals':[path+'KO50.bedGraph', path+'WT50.bedGraph'],
               'features':path+'features.bed',
               'feature_type':3,
               'assembly':'mm9',
               'format':'txt'}
     self.plugin(**params)
     columns = ["chr","start","end","name","score0","score1"]
     with track(self.plugin.output_files[0][0], fields=columns) as table:
         self.assertEqual(len(list(table.read())), 9)
예제 #13
0
 def test_union(self):
     """The union plugin must produce the six expected features."""
     self.union(**self.kw)
     expected = [('chr1',8,10,12.0),
                 ('chr1',10,15,17.0),
                 ('chr1',15,19,12.0),
                 ('chr1',21,24,17.0),
                 ('chr1',24,35,107.0),
                 ('chr1',35,39,90.0)]
     with track(self.union.output_files[0][0]) as result:
         self.assertListEqual(list(result.read(fields=self.fields)), expected)
예제 #14
0
 def quantify(self,**kw):
     """Score every feature with each signal and write the table to a temporary track.

     Keyword arguments read from ``kw``: ``feature_type`` (0 genes,
     1 promoters, 2 exons, 3 custom ``features`` file), ``score_op``
     (default 'mean'), ``assembly``, ``format`` (default 'sql'),
     ``signals`` (one path or a list), ``upstream`` / ``downstream``.

     Returns the path of the written track.

     Raises ValueError when an assembly is required but missing, or when
     the feature type is unknown.
     """
     feature_type = int(kw.get('feature_type', 0))
     score_op = str(kw.get('score_op', 'mean'))
     assembly_id = kw.get('assembly')
     out_format = kw.get('format','sql')
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
         exons = assembly.exon_track
     elif feature_type != 3:
         raise ValueError("Please specify an assembly")
     sources = kw.get('signals', [])
     if not isinstance(sources, list):
         sources = [sources]
     signals = [track(src, chrmeta=chrmeta) for src in sources]
     if feature_type == 0:
         features = genes
     elif feature_type == 1:
         prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                      'after_start': int(kw.get('downstream') or prom_down_def),
                      'on_strand': True}
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type == 2:
         features = exons
     elif feature_type == 3:
         assert os.path.exists(str(kw.get('features'))), "Features file not found: '%s'" % kw.get("features")
         custom = track(kw.get('features'), chrmeta=chrmeta)
         # The custom track decides which chromosomes are processed.
         chrmeta = custom.chrmeta
         features = custom.read
     else:
         raise ValueError("Take feature_type in %s." %ftypes)
     output = self.temporary_path(fname='features_quantification.'+out_format)
     # One numbered score column per signal, or a single 'score' column.
     nsignals = len(signals)
     score_fields = (["score" + str(i) for i in range(nsignals)]
                     if nsignals > 1 else ["score"])
     tout = track(output, out_format,
                  fields=['chr','start','end','name'] + score_fields,
                  chrmeta=chrmeta, info={'datatype':'qualitative'})
     for chrom in chrmeta:
         streams = [sig.read(chrom) for sig in signals]
         scored = score_by_feature(streams, features(chrom), fn=score_op)
         tout.write(scored, chrom=chrom, clip=True)
     tout.close()
     return output
예제 #15
0
 def __call__(self, **kw):
     """Compute read densities from a BAM file and register them in the requested format.

     Keyword arguments read from ``kw``: ``sample`` (BAM path), ``control``
     (optional, forwarded as ``-c``), ``normalization``, ``merge_strands``
     and ``read_extension`` (integers, -1 when empty) and ``format``
     (default 'sql'; other formats are produced with ``convert``).

     Returns the value of ``self.display_time()``.

     Raises AssertionError when the sample or control file does not exist.
     """
     b2wargs = []
     control = None
     sample = kw.get("sample")
     assert os.path.exists(str(sample)), "Bam file not found: '%s'." % sample
     if kw.get('control'):
         control = kw['control']
         assert os.path.exists(str(control)), "Control file not found: '%s'." % control
         # Hand over the absolute path, as is done for the sample, so that the
         # argument does not depend on the working directory of the execution.
         control = os.path.abspath(control)
         b2wargs = ["-c", str(control)]
     sample = os.path.abspath(sample)
     nreads = int(kw.get('normalization') or -1)
     bamfile = track(sample, format='bam')
     if nreads < 0:
         if control is None:
             # No explicit normalisation: count the distinct values found in column 4 of the reads.
             nreads = len(set((t[4] for t in bamfile.read())))
         else:
             b2wargs += ["-r"]
     merge_strands = int(kw.get('merge_strands') or -1)
     read_extension = int(kw.get('read_extension') or -1)
     output = self.temporary_path(fname='density_')
     format = kw.get("format", "sql")
     with execution(None) as ex:
         files = bam_to_density(ex, sample, output,
                                nreads=nreads, merge=merge_strands,
                                read_extension=read_extension,
                                sql=True, args=b2wargs)
     if merge_strands >= 0:
         suffixes = ["merged"]
     else:
         suffixes = ["fwd", "rev"]
     for n, x in enumerate(files):
         tsql = track(x, format='sql', fields=['start', 'end', 'score'],
                      chrmeta=bamfile.chrmeta,
                      info={'datatype': 'quantitative'})
         tsql.save()
         if format == "sql":
             outname = x
         else:
             outname = os.path.splitext(x)[0]+"."+format
             convert(x, outname, mode="overwrite")
         self.new_file(outname, 'density_'+suffixes[n])
     return self.display_time()
예제 #16
0
 def __call__(self, **kw):
     """Annotate each feature of the input track with its nearest gene.

     Keyword arguments read from ``kw``: ``assembly``, ``track`` (input
     path) and the thresholds ``promoter``, ``intergenic`` and ``UTR``
     (defaulting to ``prom_def``, ``inter_def`` and ``utr_def``).

     Returns 1 after registering the text table as 'table'.
     """
     assembly = genrep.Assembly(kw.get('assembly') or None)
     source = track(kw.get('track'), chrmeta=assembly.chrmeta)
     promoter_th = int(kw.get("promoter", prom_def))
     intergenic_th = int(kw.get('intergenic', inter_def))
     utr_th = int(kw.get('UTR', utr_def))
     output = self.temporary_path(fname='Annotated_table.txt')
     columns = ['chr', 'start', 'end', 'name', 'strand',
                'gene', 'location_type', 'distance']
     table = track(output, format='txt', fields=columns)
     for index, chrom in enumerate(assembly.chrnames):
         annotated = gm_stream.getNearestFeature(
             source.read(selection=chrom),
             assembly.gene_track(chrom),
             promoter_th, intergenic_th, utr_th)
         # The first chromosome creates the file, the following ones extend it.
         table.write(annotated, mode=('write' if index == 0 else 'append'))
     table.close()
     self.new_file(output, 'table')
     return 1
예제 #17
0
def guess_vizualisations(fileinfo):
    """Extend `fileinfo.vizualisations` with the visualisations suited to the file.

    Files whose extension is not 'sql' are looked up in ``mappings['viz']``
    by extension.  For sql files the 'datatype' attribute of the track at
    ``fileinfo.paths['upload_to']`` is looked up instead (lower-cased).

    Returns `fileinfo`.

    Raises Exception when an sql track has no datatype, or one that is not
    listed in ``mappings['viz']``.
    """
    debug('guess vizualisation', 3)
    if not fileinfo.extension == 'sql':
        fileinfo.vizualisations.extend(mappings['viz'][fileinfo.extension])
        debug(', '.join(fileinfo.vizualisations), 4)
        return fileinfo
    sqltrack = btrack.track(fileinfo.paths['upload_to'])
    try:
        # A track without a datatype must reach the explicit error below
        # rather than fail with a KeyError here.
        dt = sqltrack.info.get('datatype')
    finally:
        sqltrack.close()
    if dt is not None and dt.lower() in mappings['viz']:
        fileinfo.vizualisations.extend(mappings['viz'][dt.lower()])
        debug(', '.join(fileinfo.vizualisations), 4)
        return fileinfo
    raise Exception('Cannot guess the vizualisation for file "%s".' % fileinfo.trackname)
예제 #18
0
def guess_vizualisations(fileinfo):
    """Extend `fileinfo.vizualisations` with the visualisations suited to the file.

    Files whose extension is not 'sql' are looked up in ``mappings['viz']``
    by extension.  For sql files the 'datatype' attribute of the track at
    ``fileinfo.paths['upload_to']`` is looked up instead (lower-cased).

    Returns `fileinfo`.

    Raises Exception when an sql track has no datatype, or one that is not
    listed in ``mappings['viz']``.
    """
    debug('guess vizualisation', 3)
    if not fileinfo.extension == 'sql':
        fileinfo.vizualisations.extend(mappings['viz'][fileinfo.extension])
        debug(', '.join(fileinfo.vizualisations), 4)
        return fileinfo
    sqltrack = btrack.track(fileinfo.paths['upload_to'])
    try:
        # A track without a datatype must reach the explicit error below
        # rather than fail with a KeyError here.
        dt = sqltrack.info.get('datatype')
    finally:
        sqltrack.close()
    if dt is not None and dt.lower() in mappings['viz']:
        fileinfo.vizualisations.extend(mappings['viz'][dt.lower()])
        debug(', '.join(fileinfo.vizualisations), 4)
        return fileinfo
    raise Exception('Cannot guess the vizualisation for file "%s".' %
                    fileinfo.trackname)
예제 #19
0
 def __call__(self, **kw):
     """Score every feature with each signal and register the resulting sql track.

     Keyword arguments read from ``kw``: ``feature_type`` (0 genes,
     1 promoters, 2 custom ``features`` track), ``score_op`` (default
     'mean'), ``assembly``, ``signals`` (iterable of paths), ``upstream`` /
     ``downstream``.

     Returns 1 on success, or 2 when the feature type is unknown.

     Raises ValueError when an assembly is required but missing.
     """
     feature_type = int(kw.get('feature_type', 0))
     score_op = str(kw.get('score_op', 'mean'))
     assembly_id = kw.get('assembly') or None
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
     elif feature_type != 2:
         raise ValueError("Please specify an assembly")
     signals = [track(src, chrmeta=chrmeta) for src in kw.get('signals', [])]
     if feature_type == 0:
         features = genes
     elif feature_type == 1:
         prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                      'after_start': int(kw.get('downstream') or prom_down_def),
                      'on_strand': True}
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type == 2:
         custom = track(kw.get('features'), chrmeta=chrmeta)
         # The custom track decides which chromosomes are processed.
         chrmeta = custom.chrmeta
         features = custom.read
     else:
         return 2
     output = self.temporary_path(fname='features_quantification.sql')
     # One numbered score column per signal, or a single 'score' column.
     nsignals = len(signals)
     score_fields = (["score" + str(i) for i in range(nsignals)]
                     if nsignals > 1 else ["score"])
     tout = track(output, format='sql',
                  fields=['start', 'end', 'name'] + score_fields,
                  chrmeta=chrmeta, info={'datatype': 'qualitative'})
     for chrom in chrmeta:
         streams = [sig.read(chrom) for sig in signals]
         scored = score_by_feature(streams, features(chrom), fn=score_op)
         tout.write(scored, chrom=chrom, clip=True)
     tout.close()
     self.new_file(output, 'features_quantification')
     return 1
예제 #20
0
 def test_smoothing(self):
     """Smoothing the KO50 bedGraph must yield 501 features in the first output file."""
     params = {'track':path+'KO50.bedGraph', 'assembly':'mm9', 'format':'bedGraph'}
     self.plugin(**params)
     with track(self.plugin.output_files[0][0]) as smoothed:
         self.assertEqual(len(list(smoothed.read())), 501)
예제 #21
0
파일: DESeq.py 프로젝트: benmoham/bsPlugins
    def __call__(self, **kw):
        """Run DESeq through R on a table of counts or on two groups of signal tracks.

        With ``input_type == 'Table'`` the file given by ``table`` is loaded
        by R directly.  Otherwise ``signals1`` and ``signals2`` are quantified
        with ``QuantifyTablePlugin`` (score_op 'sum'), scaled by a per-track
        normalisation factor, rounded to integers and handed to R as a data
        frame.  One tab-separated result file is written and registered as
        'differential_expression' for every pair of distinct column names.

        Returns the value of ``self.display_time()``.
        """
        if kw.get('input_type') == 'Table':
            filename = kw.get('table')
            assert os.path.exists(str(filename)), "File not found: '%s'" % filename
            # Column names are the header cells after the first one.
            # NOTE(review): the file object opened here is never closed explicitly.
            colnames = numpy.asarray(open(filename).readline().split()[1:])
            robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
            # NOTE(review): in the R snippet the first `conds` assignment is
            # immediately overwritten by the second one.
            robjects.r("""
            Mdata <- read.table('%s',sep='\t',header=T,row.names=1)
            conds <- unlist(strsplit(col_names,".",fixed=T))
            conds <- colnames(Mdata)
            """ % filename)
        else:
            from QuantifyTable import QuantifyTablePlugin
            assembly = genrep.Assembly(kw.get('assembly'))
            chrmeta = assembly.chrmeta or "guess"
            kw['score_op'] = 'sum'
            signals1 = kw.get('signals1',[])
            signals2 = kw.get('signals2',[])
            if not isinstance(signals1,(list,tuple)): signals1 = [signals1]
            if not isinstance(signals2,(list,tuple)): signals2 = [signals2]
            kw['signals'] = signals1 + signals2
            signals = kw['signals']
            table = QuantifyTablePlugin().quantify(**kw)
            stracks = []
            norm_factors = []
            # One normalisation factor per signal, taken from the track info when present.
            for sig in signals:
                assert os.path.exists(str(sig)), "Signal file not found: '%s'." % sig
                _t = track(sig, chrmeta=chrmeta)
                if 'normalization' in _t.info:
                    print 'normalized'
                    _nf = float(_t.info['normalization'])
                elif 'nreads' in _t.info:
                    print 'nreads'
                    _nf = float(_t.info['nreads']) * 1e-7 / float(_t.info.get('read_extension', 1))
                else:
                    _nf = 1
                stracks.append(_t)
                norm_factors.append(_nf)
            t = track(table,chrmeta=chrmeta)
            _f = [f for f in t.fields if f.startswith('score')]
            de_list = list(t.read(fields=['name']+_f))
            # The intermediate quantification table is no longer needed once read.
            t.close(); os.remove(table)
            # Turn all scores into integers
            de_matrix = numpy.asarray([[int(float(s) * norm_factors[k] + .5) for k,s in enumerate(x[1:])]
                                       for x in de_list], dtype=numpy.float)
            rownames = numpy.asarray([x[0] for x in de_list])
            # Column names come from the track 'name' info, falling back to the file base name.
            colnames = numpy.asarray([s.info.get('name',os.path.splitext(os.path.basename(s.path))[0])
                                      for s in stracks])
             # if all prefixes are identical within a group, keep this prefix as group identifier.
            if len(list(set( [x.split('.')[0] for x in colnames[:len(signals1)]] ))) == 1 \
            and len(list(set( [x.split('.')[0] for x in colnames[len(signals1):]] ))) == 1:
                group1 = colnames[0].split('.')[0]
                group2 = colnames[-1].split('.')[0]
            else:
                group1 = "Group1"
                group2 = "Group2"
            conds = [group1]*len(signals1) + [group2]*len(signals2)
            robjects.r.assign('Mdata', numpy2ri.numpy2ri(de_matrix))
            robjects.r.assign('row_names', numpy2ri.numpy2ri(rownames))
            robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
            robjects.r.assign('conds', numpy2ri.numpy2ri(conds))
            # NOTE(review): the R snippet below replaces the `conds` assigned just
            # above with `unlist(col_names)`, so the group labels are not used by R.
            robjects.r("""
            Mdata <- as.data.frame(Mdata,row.names=row_names)
            conds <- unlist(col_names)
            colnames(Mdata) <- conds
            """)

        # NOTE(review): `method` is computed in this R snippet, but the next one
        # passes the literal method='blind' to estimateVarianceFunctions.
        robjects.r("""
        ### Still need to check that replicates are not identical - lfproc would fail
        groups <- unique(conds)
        couples <- combn(groups,2)
        if (any(table(conds)>1)){ method = 'normal' # if replicates
        } else { method = 'blind' }
        """)

        robjects.r("""
        library(DESeq)
        cds <- newCountDataSet(Mdata, conds)
        cds <- estimateSizeFactors(cds)
        cds <- estimateVarianceFunctions(cds,method='blind')
        """)

        # On the Python side the comparisons are all pairs of distinct column names.
        groups = list(set(colnames))
        couples = itertools.combinations(groups, 2)
        output = self.temporary_path(fname='DE')
        for c in couples:
            out = output + '_' + c[0] + '-' + c[1] + '.txt'
            # Test the pair in R, sort the result by its 8th column and write it to `out`.
            r_cmd = """
            res <- nbinomTest(cds, '%s', '%s')
            res <- res[order(res[,8]),]
            write.table(res, '%s', row.names=F, quote=F, sep='\t')
            """ % (c[0], c[1], out)
            robjects.r(r_cmd)
            # Unless 'complete' is given, the raw output is replaced by its cleaned version.
            if kw.get('complete') is None:
                clean = self.clean_deseq_output(out,c)
                shutil.move(clean,out)
            self.new_file(out, 'differential_expression')
        return self.display_time()
예제 #22
0
 def __call__(self, **kw):
     """Plot the signals found in and around each feature as heatmaps or line plots.

     Keyword arguments read from ``kw``: ``features`` (track path),
     ``signals`` (one path or a list) and ``mode`` (0 one heatmap per
     signal, 1 lines averaged over features, 2 one line plot per feature,
     limited by ``max_pages``).

     Returns the value of ``self.display_time()`` after registering the pdf
     as 'plot_features'.

     Raises ValueError when no data was collected or the mode is unknown.
     """
     chrmeta = "guess"
     features = track(kw.get('features'), chrmeta=chrmeta)
     signals = kw.get('signals', [])
     if not isinstance(signals, list): signals = [signals]
     snames = [os.path.splitext(os.path.basename(sig))[0]
               for sig in signals]
     signals = [track(sig) for sig in signals]
     labels = None
     data = None
     for chrom in features.chrmeta:
         _l, _d = feature_matrix([s.read(chrom) for s in signals],
                                 features.read(chrom), segment=True,
                                 nbins=nbins, upstream=upstr, downstream=downstr)
         if _d.size == 0:
             continue
         if data is None:
             labels = _l
             data = _d
         else:
             labels = concatenate((labels, _l))
             data = vstack((data, _d))
     pdf = self.temporary_path(fname='plot_features.pdf')
     if data is None:
         raise ValueError("No data")
     kw['mode'] = int(kw.get('mode', 0))
     # Bin positions expressed as fractions of the feature length.
     X = array(range(-upstr[1]+1,nbins+downstr[1]+1))/(1.0*nbins)
     if kw['mode'] == 0: #heatmap
         new = True
         for n in range(data.shape[-1]-1):
             heatmap(data[:, :, n], output=pdf, new=new, last=False,
                     rows=labels, columns=X, main=snames[n],
                     orderRows=True, orderCols=False)
             new = False
         heatmap(data[:, :, -1], output=pdf, new=new, last=True,
                 rows=labels,  columns=X, main=snames[-1],
                 orderRows=True, orderCols=False)
     elif kw['mode'] == 1: #average lineplot
         Y = data.mean(axis=0)
         ymin = min([x.min() for x in Y]+[0])
         ymax = max([x.max() for x in Y])
         lineplot(X, [Y[:, n] for n in range(data.shape[-1])],
                  output=pdf, new=True, last=True, legend=snames, ylim=(ymin,ymax))
     elif kw['mode'] == 2: #mosaic
         new = True
         mfrow = [4, 3]
         nplot = min(data.shape[0], max_pages*mfrow[0]*mfrow[1])
         ymin = min([data.min(),0])
         ymax = data.max()
         for reg in range(nplot-1):
             lineplot(X, [data[reg, :, n] for n in range(data.shape[-1])],
                      output=pdf, new=new, last=False, mfrow=mfrow,
                      main=labels[reg], ylim=(ymin,ymax))
             new = False
             mfrow = []
         # The closing panel shows region nplot-1, so it must carry that
         # region's label even when max_pages cut the list short.
         lineplot(X, [data[nplot-1, :, n] for n in range(data.shape[-1])],
                  output=pdf, new=new, last=True, main=labels[nplot-1],
                  legend=snames, ylim=(ymin,ymax))
     else:
         raise ValueError("Mode not implemented: %s" % kw['mode'])
     self.new_file(pdf, 'plot_features')
     return self.display_time()
예제 #23
0
 def motif_scan_to_track(self, fasta, motifName, motif, background, threshold, chrmeta, output=None):
     """Perform a motif scan and write the hits to a track.

     Runs ``self.motif_scan(fasta, motif, background, threshold)``, parses its
     tab-separated text output (one hit per line: sequence name, matched
     sequence, score, 1-based position, strand) and writes each hit to the
     output track as a ``(chr, start, end, score, name, strand)`` feature.

     :param fasta: forwarded unchanged to ``self.motif_scan``.
     :param motifName: label stored in the ``name`` field of every feature.
     :param motif: forwarded unchanged to ``self.motif_scan``.
     :param background: forwarded unchanged to ``self.motif_scan``.
     :param threshold: forwarded unchanged to ``self.motif_scan``.
     :param chrmeta: container indexed by the sequence name of each hit; it is
         also handed to the output track as its chromosome metadata.
     :param output: path of the track to write. If None, a temporary SQL
         track path is obtained from ``self.temporary_path``.
     :returns: the path of the output track (not the track object).
     """
     # Number of features buffered before each write (batching speeds up
     # insertion into SQL tracks, and probably helps most other formats too).
     COLLECT_SIZE = 1000
     stream_fields = ['chr','start','end','score','name','strand']

     results = self.motif_scan(fasta, motif, background, threshold)

     if output is None:
         output = self.temporary_path(fname='motif_finder_results', ext='sql')
     track_output = track(output, fields=['start','end','score','name', 'strand'], chrmeta=chrmeta, info={'datatype':'features'})

     # Sample: "chr1|chr1:1-230207" -> ("chr1", "chr1", "1", "230207")
     parse_name = re.compile(r"^(.*)\|(.*):(.*)-(.*)$")

     def _flush(buffered):
         # Write one batch of buffered features to the output track.
         track_output.write(FeatureStream(buffered, fields=stream_fields))

     try:
         features = []
         for line in results.splitlines():
             if not line.strip():
                 # Blank lines carry no hit and would break the unpacking below.
                 continue
             # name: Name of the FASTA part
             # seq: Matched sequence
             # score: Score
             # pos: Starting position (1 -> first nucleotide)
             # strand: +/- -> Watson/Crick
             name, seq, score, pos, strand = line.split("\t")

             score = float(score)
             pos = int(pos) - 1  # convert to a 0-based offset
             length = len(seq)

             # Defaults used when the name is not in the assembly format:
             # the name is taken as-is and positions are relative to 0.
             regionFrom = 0
             regionTo = length

             fullName = motifName
             parsed = parse_name.match(name)
             if parsed is not None:
                 # Sample: ">chr1|chr1:1-230207"
                 name, _, regionFrom, regionTo = parsed.groups()
                 regionFrom = int(regionFrom)
                 regionTo = int(regionTo)

             # Generate a more explicit name
             # NOTE(review): hasattr() tests for an attribute, not a key; if the
             # chrmeta entries are plain dicts this branch never fires -- confirm
             # what type chrmeta[name] is before changing it.
             if chrmeta[name] is not None:
                 if hasattr(chrmeta[name], 'real_name') and chrmeta[name]['real_name'] is not None:
                     fullName = chrmeta[name]['real_name']+" - "+motifName

             # Most track formats don't handle the case where to < from:
             # swap the bounds and flip the strand accordingly.
             if regionTo < regionFrom:
                 strand = "+" if strand.strip() == "-" else "-"
                 regionFrom, regionTo = regionTo, regionFrom

             features.append((name, regionFrom+pos, regionFrom+pos+length, score, fullName, strand))

             if len(features) >= COLLECT_SIZE:
                 # Buffer full -> flush
                 _flush(features)
                 features = []

         if features:
             # Finished -> flush what is left
             _flush(features)
     finally:
         # Close the track even if parsing or writing failed.
         track_output.close()

     return output
예제 #24
0
    def __call__(self, **kw):
        """Score features against signal tracks and run pairwise DESeq tests in R.

        For every chromosome, each signal track is summed over the selected
        features (``score_by_feature(..., fn='sum')``). The scores are scaled by
        a per-signal normalization factor, rounded to integers and passed to
        R, where DESeq's ``nbinomTest`` is run for every pair of distinct
        signal names. Each result table is written to a text file and
        registered with ``self.new_file(..., 'differential_expression')``.

        Keyword arguments read from ``kw``:

        * ``feature_type`` -- 0: ``assembly.gene_track``; 1: neighborhoods
          around gene starts (see ``upstream``/``downstream``);
          2: ``assembly.exon_track``; 3: a custom track given by ``features``.
        * ``assembly`` -- assembly identifier; required unless
          ``feature_type`` is 3.
        * ``upstream``, ``downstream`` -- sizes used when ``feature_type`` is 1
          (default to ``prom_up_def`` / ``prom_down_def``).
        * ``features`` -- path to the features track (``feature_type`` 3).
        * ``signals`` -- list of paths to the signal tracks.

        :returns: 1 on success, 2 if ``feature_type`` is not one of 0-3.
        :raises ValueError: if no assembly is given and ``feature_type`` is
            not 3, or if there is no chromosome to process.
        """
        feature_type = int(kw.get('feature_type', 0))
        assembly_id = kw.get('assembly')
        chrmeta = "guess"
        if assembly_id:
            assembly = genrep.Assembly(assembly_id)
            chrmeta = assembly.chrmeta
            genes = assembly.gene_track
            exons = assembly.exon_track
        elif feature_type != 3:
            raise ValueError("Please specify an assembly")
        if feature_type == 0:
            features = genes
        elif feature_type == 1:
            prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                         'after_start': int(kw.get('downstream') or prom_down_def),
                         'on_strand': True}
            features = lambda c: neighborhood(genes(c), **prom_pars)
        elif feature_type == 2:
            features = exons
        elif feature_type == 3:
            assert os.path.exists(kw.get('features'))
            _t = track(kw.get('features'), chrmeta=chrmeta)
            chrmeta = _t.chrmeta
            features = _t.read
        else:
            return 2

        # Open every signal track and work out its normalization factor from
        # the track info: explicit 'normalization', else derived from
        # 'nreads' (and 'read_extension'), else 1.
        signals = []
        norm_factors = []
        for sig in kw.get('signals', []):
            assert os.path.exists(sig), "File not found: %s." % sig
            _t = track(sig, chrmeta=chrmeta)
            if 'normalization' in _t.info:
                _nf = float(_t.info['normalization'])
            elif 'nreads' in _t.info:
                _nf = float(_t.info['nreads']) * 1e-7 / float(_t.info.get('read_extension', 1))
            else:
                _nf = 1
            signals.append(_t)
            norm_factors.append(_nf)
        if len(signals) > 1:
            _f = ["score" + str(i) for i in range(len(signals))]
        else:
            _f = ["score"]

        de_list = []
        stream_fields = None
        for chrom in chrmeta:
            sread = [sig.read(chrom) for sig in signals]
            mread = score_by_feature(sread, features(chrom), fn='sum')
            de_list.extend(mread)
            stream_fields = mread.fields
        if stream_fields is None:
            # The loop never ran, so the layout of the scored features is unknown.
            raise ValueError("No chromosome to process: chromosome metadata is empty.")
        name_idx = stream_fields.index("name")

        # Turn all scores into integers. The score columns are the last
        # len(_f) items of each row, one per signal, so the normalization
        # factor is picked by column index (not by row index).
        nscores = len(_f)
        de_matrix = numpy.asarray(
            [[int(s * norm_factors[k] + .5) for k, s in enumerate(x[-nscores:])]
             for x in de_list], dtype=float)
        rownames = numpy.asarray([x[name_idx] for x in de_list])
        colnames = numpy.asarray([os.path.splitext(os.path.basename(s.path))[0]
                                  for s in signals])
        del de_list
        output = self.temporary_path(fname='DE')

        robjects.r.assign('Mdata', numpy2ri.numpy2ri(de_matrix))
        robjects.r.assign('row_names', numpy2ri.numpy2ri(rownames))
        robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
        robjects.r("""
        Mdata <- as.data.frame(Mdata,row.names=row_names)
        conds <- unlist(strsplit(col_names,".",fixed=T))
        colnames(Mdata) <- conds
        groups <- unique(conds)
        couples <- combn(groups,2)

        # Still need to check that replicates are not identical - lfproc would fail
        if (any(table(conds)>1)){ method = 'normal' # if replicates
        } else { method = 'blind' }

        library(DESeq)
        cds <- newCountDataSet(Mdata, conds)
        cds <- estimateSizeFactors(cds)
        cds <- estimateVarianceFunctions(cds,method='blind')
        """)

        # One nbinomTest per unordered pair of distinct signal names.
        groups = list(set(colnames))
        couples = itertools.combinations(groups, 2)
        for c in couples:
            out = output + '_' + c[0] + '-' + c[1] + '.txt'
            print(out)
            r_cmd = """
            res <- nbinomTest(cds, '%s', '%s')
            res <- res[order(res[,8]),]
            write.table(res, '%s', row.names=F)
            """ % (c[0], c[1], out)
            robjects.r(r_cmd)
            self.new_file(out, 'differential_expression')
        return 1