def setTranslatedForName(**kwargs):
    '''
    Encode the k-mers fetched for a library as rows of a numeric matrix.

    The k-mers come from getKMERsForName, called with the keyword
    arguments produced by mem.sr(kwargs, libname=libname).  The result is
    indexed as a mapping whose keys are the k-mer strings (presumably
    mapping each k-mer to its occurrence count -- confirm against
    getKMERsForName).

    Reads the module-level names `k` (row width of the matrix) and
    `translation` (mapping from a k-mer character to its numeric code).
    Neither is modified.

    kwargs:
      libname  looked up with kwargs.get and forwarded to mem.sr; all
               other keyword arguments are passed through mem.sr as well.

    returns a 3-tuple:
      idxed_mers   dict mapping row index -> k-mer
      translated   array of shape (number of k-mers, k); each row holds
                   the translated characters of one k-mer, with 4 used
                   for any character missing from `translation`
      occurrences  array of the value stored for each k-mer, in row order
    '''
    libname = kwargs.get('libname')
    o = getKMERsForName(**mem.sr(kwargs, libname=libname))

    # Take the keys once so that all three outputs share one row order.
    mers = list(o.keys())
    idxed_mers = dict(enumerate(mers))
    occurrences = array([o[mer] for mer in mers])

    # The loop variables deliberately avoid the name `k`: in Python 2 a
    # list-comprehension variable leaks into the enclosing scope, and with
    # `global k` declared the previous code overwrote the module-level
    # k-mer length with the last k-mer string on every call.
    translated = zeros((len(mers), k))
    for i, mer in enumerate(mers):
        translated[i] = [translation.get(letter, 4) for letter in mer]

    return idxed_mers, translated, occurrences
def run(alltweets, freebase_type = 'band', **kwargs):
    '''
    Cross-check the tokens found in a set of tweets against aliases
    fetched from freebase.

    alltweets is tokenized and counted; the keys of the count table that
    are at least five characters long form the candidate set.  Aliases
    for freebase_type are fetched through fbu.fetch_type (the remaining
    keyword arguments are forwarded via mem.sr) and every alias whose
    lower-cased form is among the candidates is collected.

    NOTE(review): the function currently ends in an unconditional
    `raise Exception()`, so the collected matches never reach the caller.
    '''
    token_counts = count(tokenize(alltweets))
    long_keys = set(token for token in token_counts.keys()
                    if len(token) >= 5)
    freebase_aliases = fbu.fetch_type(freebase_type, **mem.sr(kwargs))

    # Lower-case each alias and keep the ones that also occur as a
    # long token in the tweets.
    matched = []
    for alias in freebase_aliases:
        lowered = alias.lower()
        if lowered in long_keys:
            matched.append(lowered)

    # NOTE(review): looks like a leftover debugging stop; `matched` is
    # discarded.  Confirm the intended return value before removing it.
    raise Exception()
def _gene_peak_hits(gene, peaks, window=2000):
    '''
    List the peaks that lie near, or span, the reference position of a gene.

    gene    dict with 'start', 'end' and 'strand' entries.  'strand' is
            used as a multiplier and compared against 1 (presumably it is
            +1 or -1 -- confirm against getTrackChrGenes).
    peaks   iterable of dicts with 'start' and 'end' entries.
    window  a peak is kept when either of its ends is closer than this
            to the reference position.

    returns a list of {'peak_info': peak, 'peak_stranded_offset': offsets}
    dicts, where offsets is the ascending two-element array of the peak's
    strand-multiplied start/end offsets.  The list is ordered by the
    smaller offset.
    '''
    strand = gene['strand']
    # Reference position: 'start' when strand == 1, otherwise 'end'.
    startpos = gene['start'] if strand == 1 else gene['end']

    hits = []
    for p in peaks:
        offsets = array([strand * (p['start'] - startpos),
                         strand * (p['end'] - startpos)])
        offsets.sort()
        is_near = np.min(abs(offsets)) < window
        # The peak spans the reference position when the sorted offsets
        # have opposite signs.  Comparing signs avoids multiplying two
        # genome-scale offsets, which can wrap around where NumPy's
        # default integer is 32 bits wide.
        straddles = offsets[0] < 0 < offsets[1]
        if is_near or straddles:
            hits.append({'peak_info': p,
                         'peak_stranded_offset': offsets})

    return sorted(hits, key=lambda x: x['peak_stranded_offset'][0])


def setAllGenes(**kwargs):
    '''
    Associate every gene on chr1..chr19 and chrX with its nearby peaks.

    Peaks come from getPeaks() and are looked up per chromosome name.
    Genes come from getTrackChrGenes, called once per chromosome with the
    keyword arguments produced by mem.sr(kwargs, num=num).  Progress is
    printed per chromosome and for every 100th gene.

    returns a dict keyed by chromosome name ('chr1', ..., 'chrX').  Each
    value is a dict keyed by gene name holding
      'dnase_peaks'  hits from _gene_peak_hits (peaks within 2000 of, or
                     spanning, the gene's reference position)
      'name', 'gene_info', 'start', 'end', 'strand'  copied from the gene
    A later gene with the same name on a chromosome replaces the earlier
    entry.
    '''
    allPeaks = getPeaks()
    all_results = {}

    # If you were running for a larger dataset you might want to break
    # this loop after a single iteration and just choose a chromosome.
    # list(range(...)) keeps the concatenation valid on Python 3 as well.
    for num in list(range(1, 20)) + ['X']:
        chrom = 'chr{0}'.format(num)
        print('Parsing Chromosome: chr{0}'.format(num))

        genes_dict = {}
        all_results[chrom] = genes_dict

        # Genes and peaks on this chromosome.
        chrgenes = getTrackChrGenes(**mem.sr(kwargs, num=num))
        peaks = allPeaks[chrom]

        for i, g in enumerate(chrgenes):
            name = g['name']
            hits = _gene_peak_hits(g, peaks)

            # Store some extra information alongside the hits.
            genes_dict[name] = {
                'dnase_peaks': hits,
                'name': name,
                'gene_info': g,
                'start': g['start'],
                'end': g['end'],
                'strand': g['strand'],
            }

            if i % 100 == 0:
                print('Gene {0}: {1}, {2} hits'.format(i, g['name'],
                                                       len(hits)))

    return all_results