Example #1
0
    def test_cobble(self): # more tests below
        # Each scenario consumes its own freshly built stream of three
        # overlapping chr1 features: A on strand 1, B and C on strand -1.
        def _overlapping():
            return fstream([('chr1',10,20,'A',1),('chr1',12,22,'B',-1),('chr1',15,25,'C',-1)],
                           fields = ['chr','start','end','name','strand'])

        # Default call: names are joined with '|' on every overlapping piece,
        # and pieces mixing both strands are expected with strand 0.
        pieces = list(cobble(_overlapping()))
        self.assertEqual(pieces, [('chr1',10,12,'A',1),
                                  ('chr1',12,15,'A|B',0),
                                  ('chr1',15,20,'A|B|C',0),
                                  ('chr1',20,22,'B|C',-1),
                                  ('chr1',22,25,'C',-1)])

        # stranded = True
        # A (strand 1) is expected untouched; only B and C are cut.
        pieces = list(cobble(_overlapping(),stranded=True))
        self.assertEqual(pieces, [('chr1',10,20,'A',1),
                                  ('chr1',12,15,'B',-1),
                                  ('chr1',15,22,'B|C',-1),
                                  ('chr1',22,25,'C',-1)])

        # scored = True
        # Same layout with an extra 'score' field and a longer feature C.
        scored_input = fstream([('chr1',10,20,'A',1,50.0),('chr1',12,22,'B',-1,100.0),('chr1',15,65,'C',-1,20.0)],
                               fields = ['chr','start','end','name','strand','score'])
        pieces = list(cobble(scored_input,scored=True))
        self.assertEqual(pieces, [('chr1',10,12,'A',1, 10.0),
                                  ('chr1',12,15,'A|B',0, 45.0),
                                  ('chr1',15,20,'A|B|C',0, 77.0),
                                  ('chr1',20,22,'B|C',-1, 20.8),
                                  ('chr1',22,65,'C',-1, 17.2)])
Example #2
0
    def test_cobble(self):  # more tests below
        # The three overlapping input features shared by the first two
        # scenarios (tuples are immutable, so they can safely be reused).
        feat_a = ('chr1', 10, 20, 'A', 1)
        feat_b = ('chr1', 12, 22, 'B', -1)
        feat_c = ('chr1', 15, 25, 'C', -1)
        strand_fields = ('chr', 'start', 'end', 'name', 'strand')

        # Default options: every overlap boundary starts a new piece.
        want = [
            ('chr1', 10, 12, 'A', 1),
            ('chr1', 12, 15, 'A|B', 0),
            ('chr1', 15, 20, 'A|B|C', 0),
            ('chr1', 20, 22, 'B|C', -1),
            ('chr1', 22, 25, 'C', -1),
        ]
        source = fstream([feat_a, feat_b, feat_c], fields=list(strand_fields))
        got = list(cobble(source))
        self.assertEqual(got, want)

        # stranded = True
        want = [
            ('chr1', 10, 20, 'A', 1),
            ('chr1', 12, 15, 'B', -1),
            ('chr1', 15, 22, 'B|C', -1),
            ('chr1', 22, 25, 'C', -1),
        ]
        source = fstream([feat_a, feat_b, feat_c], fields=list(strand_fields))
        got = list(cobble(source, stranded=True))
        self.assertEqual(got, want)

        # scored = True
        want = [
            ('chr1', 10, 12, 'A', 1, 10.0),
            ('chr1', 12, 15, 'A|B', 0, 45.0),
            ('chr1', 15, 20, 'A|B|C', 0, 77.0),
            ('chr1', 20, 22, 'B|C', -1, 20.8),
            ('chr1', 22, 65, 'C', -1, 17.2),
        ]
        source = fstream(
            [
                ('chr1', 10, 20, 'A', 1, 50.0),
                ('chr1', 12, 22, 'B', -1, 100.0),
                ('chr1', 15, 65, 'C', -1, 20.0),
            ],
            fields=list(strand_fields) + ['score'])
        got = list(cobble(source, scored=True))
        self.assertEqual(got, want)
Example #3
0
def combine(trackList, fn, win_size=1000, aggregate=None):
    """
    Applies a custom function to a list of tracks, such as union, intersection,
    etc., and return a single result track. The input streams need to be ordered
    w.r.t 'chr', 'start' and 'end'. To be applied chromosome by chromosome.

    Only fields of the first track are kept. Values for a common field are
    merged by default according to `common.strand_merge`,`common.no_merge` and `common.generic_merge`,
    respectively for strand, chromosome and all others.

    :param trackList: list of FeatureStream objects.
    :param fn: boolean function to apply, such as bbcflib.gfminer.stream.union.
        A string is also accepted and is evaluated to obtain the function.
    :param win_size: (int) window size, in bp.
    :param aggregate: (dict) for each field name given as a key, its value is the function
        to apply to the vector containing all trackList's values for this field in order
        to merge them. E.g. ``{'score': lambda x: sum(x)/len(x)}`` will return the average of
        all *trackList*'s scores in the output. The dictionary passed in is not modified;
        ``None`` (the default) means "no custom merge function".
    :rtype: FeatureStream
    """
    # Work on a private copy: the 'strand'/'chr' defaults added below must
    # neither leak into the caller's dictionary nor persist between calls.
    aggregate = dict(aggregate) if aggregate else {}
    aggregate.setdefault('strand',common.strand_merge)
    aggregate.setdefault('chr',common.no_merge)
    _f = ['start','end']
    if all('chr' in t.fields for t in trackList):
        _f += ['chr']
    # NOTE(security): a string `fn` is passed to eval() so that callers can
    # write combine(...,fn='intersection'); never forward untrusted input here.
    if isinstance(fn,str): fn = eval(fn)
    trackList = [common.cobble(common.reorder(t,fields=_f)) for t in trackList]
    return common.fusion(FeatureStream(_combine(trackList,fn,win_size,aggregate),
                                       fields=trackList[0].fields))
Example #4
0
def fimo(motifs,fasta,qval=True):
    """
    Run the external `fimo` program and convert its hits to the BED track 'fimo.bed'.

    Files are created in the current working directory: 'fimo.txt' (hits sorted
    with the external `sort` program) and 'fimo.bed'. The 'fimo_out' directory
    is removed at the end if it exists.

    :param motifs: path to the motifs file, passed to `fimo` as first argument.
    :param fasta: path to the fasta file, passed to `fimo` as second argument.
    :param qval: (bool) if True, run with ``--thresh 0.01 --qv-thresh``,
        otherwise with ``--thresh 0.000001``.
    :raises OSError: if the `fimo` or `sort` executable cannot be started.
        As before, a non-zero exit status of either program is ignored.
    """
    import subprocess # local import: only needed here

    # Run Fimo
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    # Argument lists (no shell) keep file names containing spaces or shell
    # metacharacters intact.
    cmd = ["fimo"] + options.split() + [str(motifs), str(fasta)]
    print("Running >> %s" % " ".join(cmd))
    subprocess.call(cmd)
    with open("fimo.txt","wb") as sorted_hits:
        subprocess.call(["sort","-k2,2n","-k3,3n","-k4,4n","fimo_out/fimo.txt"],
                        stdout=sorted_hits)

    # Bed output
    t = track('fimo.txt', fields=["name","chr","start","end","strand","score","p-value","q-value","sequence"])
    # The field names are reassigned right away: the 6th column becomes 'a'
    # and the 7th one is the column selected below as 'score'.
    t.fields = ["name","chr","start","end","strand","a","score","q","sequence"]
    s = t.read()
    s = select(s,['chr','start','end','name','score','strand'])
    # Keep the second '|'-separated part of the sequence identifier as 'chr'.
    s = apply(s,'chr',lambda x:x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    # Drop duplicated names; sorting makes the written order reproducible.
    s = apply(s,'name',lambda x:'|'.join(sorted(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname,fields=s.fields)
    bed.make_header(name="TSS_motifs", description="Motifs +-XKb around TSS", mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"): shutil.rmtree("fimo_out")
Example #5
0
def fimo(motifs, fasta, qval=True):
    """
    Run the external `fimo` program and convert its hits to the BED track 'fimo.bed'.

    Files are created in the current working directory: 'fimo.txt' (hits sorted
    with the external `sort` program) and 'fimo.bed'. The 'fimo_out' directory
    is removed at the end if it exists.

    :param motifs: path to the motifs file, passed to `fimo` as first argument.
    :param fasta: path to the fasta file, passed to `fimo` as second argument.
    :param qval: (bool) if True, run with ``--thresh 0.01 --qv-thresh``,
        otherwise with ``--thresh 0.000001``.
    :raises OSError: if the `fimo` or `sort` executable cannot be started.
        As before, a non-zero exit status of either program is ignored.
    """
    import subprocess  # local import: only needed here

    # Run Fimo
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    # Argument lists (no shell) keep file names containing spaces or shell
    # metacharacters intact.
    cmd = ["fimo"] + options.split() + [str(motifs), str(fasta)]
    print("Running >> %s" % " ".join(cmd))
    subprocess.call(cmd)
    sort_cmd = ["sort", "-k2,2n", "-k3,3n", "-k4,4n", "fimo_out/fimo.txt"]
    with open("fimo.txt", "wb") as sorted_hits:
        subprocess.call(sort_cmd, stdout=sorted_hits)

    # Bed output
    t = track('fimo.txt',
              fields=[
                  "name", "chr", "start", "end", "strand", "score", "p-value",
                  "q-value", "sequence"
              ])
    # The field names are reassigned right away: the 6th column becomes 'a'
    # and the 7th one is the column selected below as 'score'.
    t.fields = [
        "name", "chr", "start", "end", "strand", "a", "score", "q", "sequence"
    ]
    s = t.read()
    s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand'])
    # Keep the second '|'-separated part of the sequence identifier as 'chr'.
    s = apply(s, 'chr', lambda x: x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    # Drop duplicated names; sorting makes the written order reproducible.
    s = apply(s, 'name', lambda x: '|'.join(sorted(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname, fields=s.fields)
    bed.make_header(name="TSS_motifs",
                    description="Motifs +-XKb around TSS",
                    mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"):
        shutil.rmtree("fimo_out")
Example #6
0
    def __call__(self, **kw):
        """
        Count how columns of a table, or genomic tracks, overlap; write a text
        summary and, for at most 4 samples, a Venn diagram.

        Keyword arguments read from *kw*:

        * ``input_type``: "Table" (used when missing or empty) or "Tracks".
        * Table mode: ``table`` (file opened as a 'txt' track with header),
          ``id_columns`` (comma-separated, 1-based column numbers) and
          ``filters`` (comma-separated conditions, one per selected column).
          A condition is the right-hand side of a test on the column value,
          e.g. ``>2 AND <5``; tests are combined with " AND " / " OR ".
          Columns without a condition always pass.
        * Tracks mode: ``files`` (one file name or a list/tuple of them) and
          ``type`` ('score' to accumulate scores instead of covered lengths).
        * ``output``: format of the diagram, 'pdf' when missing or empty.

        Samples are labelled 'A', 'B', ... in input order. For each group of
        labels the summary reports the amount seen in exactly that group
        ("Coverage") and in at least that group ("Cumulative coverage").

        :returns: the value of ``self.display_time()``; None when the tracks
            have no coverage (only a short summary file is written then).
        :raises ValueError: if ``input_type`` is neither "Table" nor "Tracks".
        :raises AssertionError: if one of the track files does not exist.
        """

        def _parse_logic(string):
            # Strip everything but word characters, digits, comparison
            # operators, dots and spaces, then build a template in which each
            # "%f" stands for the tested value.
            s = re.sub(r'[^\w\d!=><\. ]', '', string)
            s = re.sub(r' OR ', ')or(%f ', s)
            s = re.sub(r' AND ', ')and(%f ', s)
            return "(%f "+s+")"

        def _run_test(row, indx, cond):
            num = float(row[col_ind[indx]])
            # Clamp the value (e.g. infinities) so that "%f" renders a plain
            # number; sys.maxint only exists on Python 2.
            max_int = getattr(sys, 'maxint', sys.maxsize)
            num = max(-max_int,min(max_int,num))
            # One value per "%f" placeholder of *this* condition.
            num = (num,)*cond.count("%f")
            # NOTE(security): the sanitized filter string is evaluated as a
            # Python expression; filters should come from trusted users only.
            return eval(cond % (num))

        def _add_label(s,x):
            # Append the track label as an extra 'track_name' field.
            _f = s.fields+['track_name']
            return FeatureStream((y+(x,) for y in s), fields=_f)

        venn_options = {} # tune it here
        tracks = []
        intype = kw.get("input_type") or "Table"
        if intype == "Table":
            s_cols = kw.get('id_columns','')
            s_filters = kw.get('filters','')
            infile = track(kw.get('table',''),format='txt',header=True)
            col_ind = [int(i)-1 for i in s_cols.split(",")]
            legend = [infile.fields[i] if i<len(infile.fields) else str(i) for i in col_ind]
            conds = [_parse_logic(x) for x in s_filters.split(",")]
            tlabels = [chr(k+65) for k in range(len(col_ind))]
            conds += ["1"]*(len(col_ind)-len(conds))
            combn = [tuple(sorted(x)) for k in range(len(tlabels))
                     for x in combinations(tlabels,k+1)]
            c1 = dict(("|".join(c),0) for c in combn)
            c2 = dict(("|".join(c),0) for c in combn)
            indx = dict((c,[tlabels.index(x) for x in c]) for c in combn)
            for row in infile:
                tests = [_run_test(row,i,cond) for i,cond in enumerate(conds)]
                # Exclusive count: one hit per row, for the exact set of
                # columns that pass. Rows passing no filter belong to no group.
                passed = "|".join([tlabels[n] for n,t in enumerate(tests) if t])
                if passed:
                    c1[passed] += 1
                # Cumulative count: every group whose columns all pass.
                for c in combn:
                    c2["|".join(c)] += all([tests[i] for i in indx[c]])
            nsamples = len(col_ind)
            # Same 'A|B' keys as c1/c2, used for the summary below.
            combn = ['|'.join(x) for x in combn]
        elif intype == "Tracks":
            #filenames = kw['TrMulti']['files']
            filenames = kw['files']
            if not isinstance(filenames,(list,tuple)): filenames = [filenames]
            for f in filenames: assert os.path.exists(f), "File not found: %s ." % f
            tracks = [track(f,chrmeta='guess') for f in filenames]
            nsamples = len(tracks)
            tlabels = [chr(k+65) for k in range(len(tracks))]
            combn = [combinations(tlabels,k+1) for k in range(len(tlabels))]
            combn = ['|'.join(sorted(y)) for x in combn for y in x]
            c1 = dict(zip(combn,[0]*len(combn)))
            c2 = dict(zip(combn,[0]*len(combn)))
            total_cov = 0.0
            _scored = (kw.get('type') == 'score')
            chromset = set(c for t in tracks for c in t.chrmeta)
            for chrom in chromset:
                streams = [_add_label(t.read(chrom),tlabels[n]) for n,t in enumerate(tracks)]
                s = cobble(concatenate(streams),scored=_scored)
                name_idx = s.fields.index('track_name')
                start_idx = s.fields.index('start')
                end_idx = s.fields.index('end')
                if _scored: score_idx = s.fields.index('score')
                for x in s:
                    length = x[end_idx]-x[start_idx]
                    total_cov += length
                    sub = sorted(set(x[name_idx].split('|'))) # avoid 'A|A'
                    cb = [combinations(sub,k) for k in range(1,len(sub)+1)]
                    cb = ['|'.join(sorted(y)) for group in cb for y in group]
                    # Scores or lengths, depending on the requested type.
                    amount = x[score_idx] if _scored else length
                    c1['|'.join(sub)] += amount
                    for key in cb: c2[key] += amount
            if total_cov < 1:
                output = self.temporary_path(fname='venn_summary.txt')
                # Text mode: the content written is a plain string.
                with open(output,'w') as summary:
                    summary.write("Empty content (no coverage) on %s." %(",".join(chromset)))
                self.new_file(output, 'venn_summary')
                return
            legend = [t.name for t in tracks]
            if _scored:
                for c in combn:
                    c2[c] = round(c2[c])
            else:
                # Express lengths as a percentage of the total coverage.
                for c in combn:
                    c2[c] = round((100*c2[c])/total_cov)
                    c1[c] = (100*c1[c])/total_cov
        else:
            raise ValueError("Input type '%s' not supported." %intype)


        if nsamples <= 4:
            out_format = kw.get('output') or 'pdf'
            output = self.temporary_path(fname='venn_diagram.'+out_format)
            venn(c2,legend=legend,options=venn_options,output=output,format=out_format)
            self.new_file(output, 'venn_diagram')

        # Text summary
        output = self.temporary_path(fname='venn_summary.txt')
        with open(output,'w') as summary:
            summary.write("%s\t%s\t%s\n" % ("Group","Coverage", "Cumulative coverage"))
            record = "%s\t%.2f\t%d\n"
            for c in sorted(combn, key=lambda x:(len(x),x)):
                summary.write(record%(c,c1[c],c2[c]))
        self.new_file(output, 'venn_summary')
        return self.display_time()
Example #7
0
 def commonTest(self, X, R):
     """Cobble the rows *X* (fields chr, start, end, score) and check the result equals *R*."""
     T = list(cobble(fstream(X, fields=['chr', 'start', 'end', 'score'])))
     # Debug output; the call form is valid on both Python 2 and Python 3.
     print(T)
     self.assertEqual(T, R)
Example #8
0
    def __call__(self, **kw):
        """
        Count how columns of a table, or genomic tracks, overlap; write a text
        summary and, for at most 4 samples, a Venn diagram.

        Keyword arguments read from *kw*:

        * ``input_type``: "Table" (used when missing or empty) or "Tracks".
        * Table mode: ``table`` (file opened as a 'txt' track with header),
          ``id_columns`` (comma-separated, 1-based column numbers) and
          ``filters`` (comma-separated conditions, one per selected column).
          A condition is the right-hand side of a test on the column value,
          e.g. ``>2 AND <5``; tests are combined with " AND " / " OR ".
          Columns without a condition always pass.
        * Tracks mode: ``TrMulti`` (a mapping whose 'files' entry is one file
          name or a list/tuple of them) and ``type`` ('score' to accumulate
          scores instead of covered lengths).
        * ``format``: format of the diagram, 'pdf' when missing or empty.

        Samples are labelled 'A', 'B', ... in input order. For each group of
        labels the summary reports the amount seen in exactly that group
        ("Coverage") and in at least that group ("Cumulative coverage").

        :returns: the value of ``self.display_time()``; None when the tracks
            have no coverage (only a short summary file is written then).
        :raises ValueError: if ``input_type`` is neither "Table" nor "Tracks".
        :raises AssertionError: if one of the track files does not exist.
        """

        def _parse_logic(string):
            # Strip everything but word characters, digits, comparison
            # operators, dots and spaces, then build a template in which each
            # "%f" stands for the tested value.
            s = re.sub(r'[^\w\d!=><\. ]', '', string)
            s = re.sub(r' OR ', ')or(%f ', s)
            s = re.sub(r' AND ', ')and(%f ', s)
            return "(%f " + s + ")"

        def _run_test(row, indx, cond):
            num = float(row[col_ind[indx]])
            # Clamp the value (e.g. infinities) so that "%f" renders a plain
            # number; sys.maxint only exists on Python 2.
            max_int = getattr(sys, 'maxint', sys.maxsize)
            num = max(-max_int, min(max_int, num))
            # One value per "%f" placeholder of *this* condition.
            num = (num, ) * cond.count("%f")
            # NOTE(security): the sanitized filter string is evaluated as a
            # Python expression; filters should come from trusted users only.
            return eval(cond % (num))

        def _add_label(s, x):
            # Append the track label as an extra 'track_name' field.
            _f = s.fields + ['track_name']
            return FeatureStream((y + (x, ) for y in s), fields=_f)

        venn_options = {}  # tune it here
        tracks = []
        intype = kw.get("input_type") or "Table"
        if intype == "Table":
            s_cols = kw.get('id_columns', '')
            s_filters = kw.get('filters', '')
            infile = track(kw.get('table', ''), format='txt', header=True)
            col_ind = [int(i) - 1 for i in s_cols.split(",")]
            legend = [
                infile.fields[i] if i < len(infile.fields) else str(i)
                for i in col_ind
            ]
            conds = [_parse_logic(x) for x in s_filters.split(",")]
            tlabels = [chr(k + 65) for k in range(len(col_ind))]
            conds += ["1"] * (len(col_ind) - len(conds))
            combn = [
                tuple(sorted(x)) for k in range(len(tlabels))
                for x in combinations(tlabels, k + 1)
            ]
            c1 = dict(("|".join(c), 0) for c in combn)
            c2 = dict(("|".join(c), 0) for c in combn)
            indx = dict((c, [tlabels.index(x) for x in c]) for c in combn)
            for row in infile:
                tests = [
                    _run_test(row, i, cond) for i, cond in enumerate(conds)
                ]
                # Exclusive count: one hit per row, for the exact set of
                # columns that pass. Rows passing no filter belong to no group.
                passed = "|".join(
                    [tlabels[n] for n, t in enumerate(tests) if t])
                if passed:
                    c1[passed] += 1
                # Cumulative count: every group whose columns all pass.
                for c in combn:
                    c2["|".join(c)] += all([tests[i] for i in indx[c]])
            nsamples = len(col_ind)
            # Same 'A|B' keys as c1/c2, used for the summary below.
            combn = ['|'.join(x) for x in combn]
        elif intype == "Tracks":
            filenames = kw['TrMulti']['files']
            if not isinstance(filenames, (list, tuple)):
                filenames = [filenames]
            for f in filenames:
                assert os.path.exists(f), "File not found: %s ." % f
            tracks = [track(f, chrmeta='guess') for f in filenames]
            nsamples = len(tracks)
            tlabels = [chr(k + 65) for k in range(len(tracks))]
            combn = [combinations(tlabels, k + 1) for k in range(len(tlabels))]
            combn = ['|'.join(sorted(y)) for x in combn for y in x]
            c1 = dict(zip(combn, [0] * len(combn)))
            c2 = dict(zip(combn, [0] * len(combn)))
            total_cov = 0.0
            _scored = (kw.get('type') == 'score')
            chromset = set(c for t in tracks for c in t.chrmeta)
            for chrom in chromset:
                streams = [
                    _add_label(t.read(chrom), tlabels[n])
                    for n, t in enumerate(tracks)
                ]
                s = cobble(concatenate(streams), scored=_scored)
                name_idx = s.fields.index('track_name')
                start_idx = s.fields.index('start')
                end_idx = s.fields.index('end')
                if _scored:
                    score_idx = s.fields.index('score')
                for x in s:
                    length = x[end_idx] - x[start_idx]
                    total_cov += length
                    sub = sorted(set(x[name_idx].split('|')))  # avoid 'A|A'
                    cb = [combinations(sub, k) for k in range(1, len(sub) + 1)]
                    cb = ['|'.join(sorted(y)) for group in cb for y in group]
                    # Scores or lengths, depending on the requested type.
                    amount = x[score_idx] if _scored else length
                    c1['|'.join(sub)] += amount
                    for key in cb:
                        c2[key] += amount
            if total_cov < 1:
                output = self.temporary_path(fname='venn_summary.txt')
                # Text mode: the content written is a plain string.
                with open(output, 'w') as summary:
                    summary.write("Empty content (no coverage) on %s." %
                                  (",".join(chromset)))
                self.new_file(output, 'venn_summary')
                return
            legend = [t.name for t in tracks]
            if _scored:
                for c in combn:
                    c2[c] = round(c2[c])
            else:
                # Express lengths as a percentage of the total coverage.
                for c in combn:
                    c2[c] = round((100 * c2[c]) / total_cov)
                    c1[c] = (100 * c1[c]) / total_cov
        else:
            raise ValueError("Input type '%s' not supported." % intype)

        if nsamples <= 4:
            out_format = kw.get('format') or 'pdf'
            output = self.temporary_path(fname='venn_diagram.' + out_format)
            venn(c2,
                 legend=legend,
                 options=venn_options,
                 output=output,
                 format=out_format)
            self.new_file(output, 'venn_diagram')

        # Text summary
        output = self.temporary_path(fname='venn_summary.txt')
        with open(output, 'w') as summary:
            summary.write("%s\t%s\t%s\n" %
                          ("Group", "Coverage", "Cumulative coverage"))
            record = "%s\t%.2f\t%d\n"
            for c in sorted(combn, key=lambda x: (len(x), x)):
                summary.write(record % (c, c1[c], c2[c]))
        self.new_file(output, 'venn_summary')
        return self.display_time()
Example #9
0
 def commonTest(self,X,R):
     """Cobble the rows *X* (fields chr, start, end, score) and check the result equals *R*."""
     T = list(cobble(fstream(X,fields=['chr','start','end','score'])))
     # Debug output; the call form is valid on both Python 2 and Python 3.
     print(T)
     self.assertEqual(T,R)