Ejemplo n.º 1
0
 def setTranslatedForName(**kwargs):
     '''
     Encode every k-mer of one library as a row of numbers.

     Reads the optional ``libname`` keyword, fetches the k-mer mapping for it
     via ``getKMERsForName`` and translates each k-mer letter by letter with
     the module-level ``translation`` table (letters missing from the table
     are encoded as 4).

     Returns a 3-tuple ``(idxed_mers, translated, occurrences)``:
       * ``idxed_mers``  -- dict mapping row index -> k-mer key
       * ``translated``  -- array of shape ``(number of k-mers, k)`` holding
                            the per-letter codes of each k-mer
       * ``occurrences`` -- array of the values stored for each k-mer, in the
                            same row order

     NOTE(review): the values of the mapping are presumably occurrence
     counts, and every k-mer is assumed to have exactly ``k`` letters (the
     row assignment fails otherwise) -- confirm against getKMERsForName.
     '''
     # ``k`` and ``translation`` are module-level names that are only read
     # here, so no ``global`` declaration is needed.  The loop variables below
     # deliberately avoid the name ``k``: with ``global k`` in effect, a
     # Python 2 list comprehension iterating with ``k`` overwrote the
     # module-level k-mer length with the last k-mer key.
     libname = kwargs.get('libname')
     o = getKMERsForName(**mem.sr(kwargs, libname=libname))

     # Freeze the key order once so rows, indices and occurrences line up.
     mers = list(o.keys())
     idxed_mers = dict(enumerate(mers))

     translated = zeros((len(mers), k))
     occurrences = array([o[mer] for mer in mers])
     for row, mer in enumerate(mers):
         translated[row] = [translation.get(letter, 4) for letter in mer]
     return idxed_mers, translated, occurrences
Ejemplo n.º 2
0
def run(alltweets, freebase_type = 'band', **kwargs):
    '''
    Cross-check terms found in a collection of tweets against freebase
    aliases.

    The tweets are tokenized and counted; every counted key of at least five
    characters is kept as a candidate term.  The aliases for freebase_type
    are then fetched with fbu.fetch_type (provided by freebase.py) and the
    lower-cased aliases that appear among the candidate terms are collected.

    NOTE(review): the function currently ends by raising a bare Exception
    unconditionally, so the collected matches are never returned to the
    caller.  This looks like an unfinished stub or a debugging stop --
    confirm the intended return value before relying on this function.
    '''

    term_counts = count(tokenize(alltweets))

    # Only terms of five or more characters are eligible for matching.
    long_terms = set()
    for term in term_counts.keys():
        if len(term) >= 5:
            long_terms.add(term)

    aliases = fbu.fetch_type(freebase_type, **mem.sr(kwargs))

    # Aliases are compared (and recorded) in lower case.
    matched = []
    for alias in aliases:
        lowered = alias.lower()
        if lowered in long_terms:
            matched.append(lowered)

    raise Exception()
Ejemplo n.º 3
0
    def setAllGenes(**kwargs):
        '''
        Collect, for chromosomes 1-19 and X, every gene together with the
        peaks that lie near its start position.

        For each chromosome the genes come from getTrackChrGenes (called with
        the keyword arguments produced by mem.sr(kwargs, num=num)) and the
        peaks from the 'chrN' entry of getPeaks().  A gene's start position
        is g['start'] when g['strand'] == 1 and g['end'] otherwise.  A peak
        is a hit when either of its edges is less than 2000 positions from
        that start, or when the peak spans the start.

        Returns a dict keyed by chromosome name ('chr1' ... 'chr19', 'chrX').
        Each value is a dict keyed by gene name whose entries hold:
          'dnase_peaks' -- list of {'peak_info', 'peak_stranded_offset'}
                           dicts, ordered by the smaller stranded offset
          'name', 'gene_info', 'start', 'end', 'strand' -- taken from the gene

        Genes sharing a name on one chromosome overwrite each other; the last
        one wins.

        NOTE(review): g['strand'] is presumably +1 / -1 so that multiplying
        by it orients offsets along the gene -- confirm against the track
        data.
        '''
        allPeaks = getPeaks()
        all_results = {}

        # Peaks with an edge closer than this to the gene start count as hits.
        max_distance = 2000

        #if you were running for a larger dataset you might want to
        #break this loop after a single iteration and just choose a chromosome
        for num in list(range(1, 20)) + ['X']:
            chrom = 'chr{0}'.format(num)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('Parsing Chromosome: chr{0}'.format(num))
            genes_dict = {}
            all_results[chrom] = genes_dict

            #get the genes on a chromosome
            chrgenes = getTrackChrGenes(**mem.sr(kwargs, num = num))
            #get the peaks on a chromosome
            peaks = allPeaks[chrom]

            for i, g in enumerate(chrgenes):
                name = g['name']
                strand = g['strand']
                startpos = g['start'] if strand == 1 else g['end']
                hits = []

                #list features near this gene.
                for p in peaks:
                    stranded_offset = array([strand * (p['start'] - startpos),
                                             strand * (p['end'] - startpos)])
                    near_start = np.min(abs(stranded_offset)) < max_distance
                    # The peak spans the gene start when its two offsets have
                    # opposite signs.  Comparing min and max against zero is
                    # equivalent to testing the sign of their product, but
                    # does not depend on the product fitting in the array's
                    # integer type.
                    spans_start = (stranded_offset.min() < 0
                                   < stranded_offset.max())
                    if near_start or spans_start:
                        stranded_offset.sort()
                        hits.append({'peak_info':p,
                                     'peak_stranded_offset':stranded_offset})

                #store some extra information in the dictionary that we'll output
                hits = sorted(hits, key = lambda x: x['peak_stranded_offset'][0])
                genes_dict[name] = {
                    'dnase_peaks':hits,
                    'name':name,
                    'gene_info':g,
                    'start':g['start'],
                    'end':g['end'],
                    'strand':strand
                    }

                # Progress report every 100 genes.
                if i % 100 == 0:
                    print('Gene {0}: {1}, {2} hits'.format(i, g['name'], len(hits)))

        return all_results