def getKMERsForName(libname='mouse', tissue_term=None, **kwargs):
    '''
    Calls "parse" on the fasta file referred to as libname, going through
    mem.getOrSet.

    Optionally, specify a tissue term that will serve as a regex filter on
    the FASTA record descriptors. Specifying a term such as "brain" will do
    a case insensitive search on records in the library to return only
    kmers from records matching "term".

    kwargs
      libname:     library name (defaults to 'mouse')
      tissue_term: optional descriptor filter; when given it is also folded
                   into the name handed to mem as '<libname>_tissue=<term>'
      restored:    optional value; when present it is forwarded to
                   mem.getOrSet as ``update``
      any other keyword is forwarded untouched through mem.rc

    returns whatever mem.getOrSet returns, in both the plain and the
    'restored' case.
    '''

    def setKMERsForName(**kwargs):
        # Worker handed to mem.getOrSet; it forwards every keyword to parse.
        return parse(**kwargs)

    if tissue_term is None:
        name = libname
    else:
        name = '{0}_tissue={1}'.format(libname, tissue_term)

    overrides = {'libname': libname, 'tissue_term': tissue_term, 'name': name}
    if 'restored' in kwargs:
        # NOTE(review): ``update`` presumably makes mem store the supplied
        # value instead of recomputing it -- confirm against mem.getOrSet.
        overrides['update'] = kwargs['restored']

    # The original dropped the result on the 'restored' path and implicitly
    # returned None; returning it keeps both paths consistent.
    return mem.getOrSet(setKMERsForName, **mem.rc(kwargs, **overrides))
def getTrackChrGenes(**kwargs):
    '''
    Fetch every gene record found on one chromosome of a bed track,
    going through mem.getOrSet.

    kwargs
      num:   chromosome number (defaults to 1)
      fname: bedfile path (uses the global bedfile as the default)

    returns a list holding one dict per record, mapping the record's keys
    to its data values.
    '''

    def setTrackChrGenes(**kwargs):
        # Worker handed to mem.getOrSet: load the track, read the requested
        # chromosome and flatten each record into a plain dict.
        bed_path = kwargs.get('fname', bedfile)
        chrom_label = 'chr{0}'.format(kwargs.get('num', 1))
        records = track.load(bed_path).read(chrom_label)
        genes = []
        for rec in records:
            genes.append(dict(zip(rec.keys(), rec.data)))
        return genes

    # Name handed to mem: '<fname or basename of default bedfile>_<num>'.
    cache_name = '{0}_{1}'.format(
        kwargs.get('fname', os.path.basename(bedfile)),
        kwargs.get('num', 1))
    merged = mem.rc(kwargs, onfail='compute', name=cache_name)
    return mem.getOrSet(setTrackChrGenes, **merged)
def plotPeaks(num=1):
    '''
    Plot how peak midpoints are distributed around promoter midpoints on
    one chromosome.

    For every promoter returned by getTrackChrPromoters and every peak on
    'chr<num>' returned by getPeaks, the signed distance between the two
    midpoints is collected when it is strictly smaller than 5000 in
    magnitude. The distances are binned into 20 bins spanning
    [-5000, 5000] and the counts are drawn on figure 1.

    num: chromosome number (defaults to 1)

    returns None; the function only draws.
    '''
    import cb.utils.plots as myplots

    def setHist(**kwargs):
        peaks = getPeaks()['chr{0}'.format(num)]
        proms = getTrackChrPromoters(num=num)

        # Peak midpoints do not depend on the promoter: compute them once.
        peak_mids = [(p['start'] + p['end']) / 2 for p in peaks]

        deltas = []
        for v in proms.values():
            mid = (v[0] + v[1]) / 2
            deltas.extend(pmid - mid for pmid in peak_mids
                          if abs(pmid - mid) < 5000)

        # A single histogram over all distances yields the same counts as
        # summing one histogram per promoter, and it still defines the bin
        # edges when there are no promoters at all (the original raised
        # an unbound-name error in that case).
        hits, bin_offsets = histogram(deltas, 20, [-5000, 5000])
        all_hits = zeros(20)
        all_hits += hits
        return bin_offsets, all_hits

    # NOTE(review): no ``name`` is passed here, unlike the other getOrSet
    # callers in this file -- confirm that results for different ``num``
    # values cannot collide inside mem.
    bin_offsets, hits = mem.getOrSet(setHist, num=num)

    f = myplots.fignum(1)
    ax = f.add_subplot(111)
    ax.set_xlabel('distance from promoter')
    ax.set_ylabel('counts')
    # histogram returns 21 edges for 20 bins; plot against the left edges.
    ax.plot(bin_offsets[:-1], hits)
def getTrackChrPromoters(**kwargs):
    '''
    Get all of the forward promoters from a bed file on a given chromosome.

    kwargs
      num:   chromosome number (defaults to 1)
      fname: bedfile path (uses global bedfile as the default)

    returns a dict mapping each forward-strand (strand == 1) gene name to
    the two-element list [start - 2000, start - 100].
    '''

    def setTrackChrPromoters(**kwargs):
        fname = kwargs.get('fname', bedfile)
        num = kwargs.get('num', 1)
        chromosome_data = track.load(fname).read('chr{0}'.format(num))

        fwd_promoters = {}
        for r in chromosome_data:
            e = dict(zip(r.keys(), r.data))
            if e['strand'] == 1:
                # When a name repeats, the last record wins, as before.
                fwd_promoters[e['name']] = [e['start'] - 2000,
                                            e['start'] - 100]
        return fwd_promoters

    name = '{0}_{1}'.format(kwargs.get('fname', os.path.basename(bedfile)),
                            kwargs.get('num', 1))
    # Forward the caller's kwargs the same way getTrackChrGenes does.
    # The original passed only onfail/name, so the worker never received
    # ``num``/``fname`` from this call and read its defaults, while the
    # name above already reflected the requested chromosome.
    return mem.getOrSet(setTrackChrPromoters,
                        **mem.rc(kwargs, onfail='compute', name=name))
def mapAllGenes(**kwargs):
    '''
    For chromosomes 1-19 and X, attach to every gene the peaks lying near
    its start position.

    A peak is attached to a gene when either end of the peak lies strictly
    within 2000 of the gene's strand-aware start position (``start`` when
    strand == 1, otherwise ``end``), or when the peak spans that position.

    kwargs are forwarded to getTrackChrGenes (with ``num`` set per
    chromosome) and to mem.getOrSet.

    returns a dict keyed by 'chr<num>'; each value is a dict keyed by gene
    name whose entries hold:
      dnase_peaks: list of {'peak_info', 'peak_stranded_offset'} dicts,
                   sorted by the smaller stranded offset
      name, gene_info, start, end, strand: copied from the gene record
    '''

    def setAllGenes(**kwargs):
        allPeaks = getPeaks()
        all_results = {}

        # if you were running for a larger dataset you might want to
        # break this loop after a single iteration and just choose a
        # chromosome
        for num in list(range(1, 20)) + ['X']:
            chrom = 'chr{0}'.format(num)
            print('Parsing Chromosome: chr{0}'.format(num))
            genes_dict = {}
            all_results[chrom] = genes_dict

            # get the genes on a chromosome
            chrgenes = getTrackChrGenes(**mem.sr(kwargs, num=num))
            # get the peaks on a chromosome
            peaks = allPeaks[chrom]

            for i, g in enumerate(chrgenes):
                name = g['name']
                strand = g['strand']
                startpos = g['start'] if strand == 1 else g['end']

                # list features near this gene.
                hits = []
                for p in peaks:
                    off_a = strand * (p['start'] - startpos)
                    off_b = strand * (p['end'] - startpos)
                    if off_a <= off_b:
                        lo, hi = off_a, off_b
                    else:
                        lo, hi = off_b, off_a
                    # "Peak spans the start" is tested by comparing signs
                    # rather than multiplying the two offsets: the product
                    # of two genomic distances can overflow where the
                    # default integer is 32 bits wide and flip sign.
                    if abs(lo) < 2000 or abs(hi) < 2000 or lo < 0 < hi:
                        hits.append({'peak_info': p,
                                     'peak_stranded_offset': array([lo, hi])})

                # store some extra information in the dictionary that
                # we'll output
                hits.sort(key=lambda x: x['peak_stranded_offset'][0])
                genes_dict[name] = {
                    'dnase_peaks': hits,
                    'name': name,
                    'gene_info': g,
                    'start': g['start'],
                    'end': g['end'],
                    'strand': strand,
                }
                if i % 100 == 0:
                    print('Gene {0}: {1}, {2} hits'.format(
                        i, g['name'], len(hits)))
        return all_results

    return mem.getOrSet(setAllGenes, **kwargs)
def getPeaks():
    '''
    Get all of the peaks from the narrowpeak file (global ``peakfile``) on
    all chromosomes.

    kwargs
      none

    returns a dict mapping the first column of each line (the chromosome)
    to a list of peak dicts in file order. Each peak dict holds the
    remaining columns under the keys 'start', 'end', 'name', 'score',
    'strand', 'signalValue', 'pValue', 'qValue' and 'peak'; 'start', 'end'
    and 'peak' are converted to int, every other field is kept as the raw
    string.
    '''

    def setPeaks(**kwargs):
        cols = ['chrom', 'start', 'end', 'name', 'score', 'strand',
                'signalValue', 'pValue', 'qValue', 'peak']
        # note, peak is a zero based offset from start
        peaks = {}
        with open(peakfile) as pf:
            # Stream the file instead of loading every line up front.
            for l in pf:
                if not l.strip():
                    # Blank lines (e.g. a trailing one) carry no record and
                    # would otherwise fail on the missing 'start' field.
                    continue
                grps = l.split('\t')
                hit = dict(zip(cols[1:], grps[1:]))
                for field in ('start', 'end', 'peak'):
                    hit[field] = int(hit[field])
                peaks.setdefault(grps[0], []).append(hit)
        return peaks

    return mem.getOrSet(setPeaks, onfail='compute')
def getTranslatedForName(libname, **kwargs):
    '''
    Translate kMERs to a numerical array for downstream analysis.

    libname: library name, forwarded to getKMERsForName
    kwargs
      tissue_term: optional; when given, the name handed to mem becomes
                   '<libname>_tissue=<term>'
      any other keyword is forwarded through mem.rc / mem.sr

    Reads the module-level ``k`` (number of columns per k-mer) and
    ``translation`` (letter -> number mapping).

    returns a 3-tuple (idxed_mers, translated, occurrences):
      idxed_mers:  dict mapping row index -> k-mer key
      translated:  array of shape (number of k-mers, k); row i holds the
                   translation of each letter of k-mer i, with 4 used for
                   letters missing from ``translation``
      occurrences: array whose entry i is the value stored for k-mer i in
                   the getKMERsForName result
    '''

    def setTranslatedForName(**kwargs):
        # ``k`` and ``translation`` are only read, so no ``global``
        # declaration is needed. The original declared ``global k`` and
        # then reused ``k`` as a list-comprehension variable, which under
        # Python 2 rebinds the module-level ``k`` to a k-mer string.
        libname = kwargs.get('libname')
        o = getKMERsForName(**mem.sr(kwargs, libname=libname))

        mers = list(o.keys())
        translated = zeros((len(mers), k))
        idxed_mers = dict(enumerate(mers))
        occurrences = array([o[mer] for mer in mers])
        for i, mer in enumerate(mers):
            translated[i] = [translation.get(l, 4) for l in mer]
        return idxed_mers, translated, occurrences

    tissue_term = kwargs.get('tissue_term', None)
    if tissue_term is None:
        name = libname
    else:
        name = '{0}_tissue={1}'.format(libname, tissue_term)
    return mem.getOrSet(setTranslatedForName,
                        **mem.rc(kwargs, libname=libname, name=name))
def getBandCollectionAliases(**kwargs):
    '''
    Gather the aliases of every band in a named band collection, together
    with the aliases of the bands' members, going through mem.getOrSet.

    kwargs
      name: key into the global band_collectionnames mapping (required;
            a missing key raises KeyError)
      any other keyword is forwarded through mem.rc

    returns a flat list of alias values; for each matching band the member
    aliases come first, followed by the band's own aliases.
    '''

    def setBandCollectionAliases(name=None, **kwargs):
        assert name is not None
        freebase = discovery.build('freebase', 'v1',
                                   developerKey=DEVELOPER_KEY)
        collected = []
        for band_name in band_collectionnames[name]:
            # One MQL read per band name: musical groups whose name
            # matches, with their aliases and their members' aliases.
            query = [{
                "name~=": "{0}".format(band_name),
                "type": "/music/musical_group",
                "/common/topic/alias": [{"value": None}],
                "/music/musical_group/member": [{
                    "member": {
                        "/common/topic/alias": [{"value": None}]
                    }
                }],
            }]
            raw = freebase.mqlread(query=json.dumps(query)).execute()
            for band in json.loads(raw)['result']:
                from_members = [
                    alias['value']
                    for entry in band["/music/musical_group/member"]
                    for alias in entry['member']["/common/topic/alias"]]
                from_band = [alias['value']
                             for alias in band["/common/topic/alias"]]
                collected.extend(from_members)
                collected.extend(from_band)
        return collected

    collection_key = kwargs['name']
    merged = mem.rc(kwargs, name=collection_key)
    return mem.getOrSet(setBandCollectionAliases, **merged)
def getKMERsForName(libname):
    '''
    Return the result of parse(libname), going through mem.getOrSet with
    the library name used both as the ``libname`` keyword and as the
    ``name`` handed to mem.
    '''

    def setKMERsForName(**kwargs):
        # The lookup is kept from the original: it raises KeyError when
        # the forwarded keywords lack 'libname'. Its value is not used;
        # parse receives the enclosing function's argument.
        kwargs["libname"]
        return parse(libname)

    cache_args = {"libname": libname, "name": libname}
    return mem.getOrSet(setKMERsForName, **cache_args)