def run0(spec_ct = 8, **kwargs):
    # Cached entry point: run sliding-window analyses over the pvt1
    # alignment, keeping the `spec_ct` species that best match the
    # reference row of the alignment.
    def setLocusResults(**kwargs):
        spec_ct = kwargs.get('spec_ct')
        # Genomic coordinates of the locus -- currently unused in this body.
        bases = (128693265,129266680)
        a0 = fetch_num_ali()
        names = fetch_alinames()
        # First alignment row serves as the reference sequence.
        ref = a0[0]
        # Per-species count of positions that are real bases (code < 4)
        # AND agree with the reference row; summed along axis 1.
        ali_counts = sum(less(a0,4) * equal(a0, a0[0,:]) ,1)
        # Rank species by match count (descending) and keep the best spec_ct.
        names_all = [names[i] for i in argsort(ali_counts)[::-1]]
        names = names_all[:spec_ct]
        a0 = a0[argsort(ali_counts)[::-1]][:spec_ct]
        ali_counts = sorted(ali_counts)[::-1][:spec_ct]
        wl = 150       # window length in alignment columns
        n_runs = 500   # runs per window
        locii = {}
        results = {}
        # Windows advance by half a window length; repeat for 3- and
        # 8-species subsets.
        for n_specs in [3, 8]:
            locii[n_specs], results[n_specs] = run_windows(a0,ref, n_specs = n_specs, n_runs = n_runs, win_len = wl, win_ofs = wl/2, spec_names = names)
        return locii, results
    return mem.getOrSet(setLocusResults, **mem.rc(kwargs, on_fail = 'compute', spec_ct = spec_ct))
def term_network(name = 'bdtnp', nterms = -1, **kwargs):
    '''
    Build an undirected gene graph where two genes are connected whenever
    they co-occur in a controlled-vocabulary term group.

    kwargs:
      name:   gene-list source, 'bdtnp' or 'kn' (defaults to 'bdtnp')
      nterms: number of top term groups to use; defaults to -1 (all)
    '''
    def set_term_network(**kwargs):
        nterms = kwargs.get('nterms')
        name = kwargs.get('name')
        if name == 'bdtnp':
            gene_list = nio.getBDTNP().keys()
        elif name == 'kn':
            gene_list = graphs['kn'].nodes()
        else:
            # Previously an unknown name fell through to a NameError on
            # gene_list; fail with a clear message instead.
            raise ValueError('unknown gene-list name: {0!r}'.format(name))
        grps = term_groups(**mem.sr(kwargs, name = name))
        network = nx.Graph()
        network.add_nodes_from(gene_list)
        for g in grps:
            # All pairs of genes within the group. NOTE(review): this also
            # produces (g, g) self-loop edges -- confirm they are intended.
            edgelist = [[g1[0], g2[0]] for g1 in g[1] for g2 in g[1]]
            network.add_edges_from(edgelist)
        return network
    return mem.getOrSet(set_term_network,
                        **mem.rc(kwargs, on_fail = 'compute',
                                 register = '{0}_{1}'.format(name, nterms),
                                 nterms = nterms, name = name))
def consensus_graph(name = 'none', graphs =(), **kwargs):
    '''
    Get a graph having nodes consisting of the union of the nodes in all
    graphs and having edges consisting of the intersection of edges in
    all graphs.
    '''
    def get_cons_graph(**kwargs):
        graphs = kwargs.get('graphs')
        # isinstance replaces the exact type(...) comparison so DiGraph
        # subclasses are accepted as well.
        if not isinstance(graphs[0], nx.DiGraph):
            raise Exception('For now, this method is only compatible with digraph')
        all_nodes = set.union(*[set(g.nodes()) for g in graphs])
        ##NOTE, THIS SYNTAX IS DESIGNED FOR DIRECTED GRAPHS
        ##FOR UNDIRECTED, IT WILL FAIL TO COUNT BIDIRECTIONAL EDGES
        all_edges = set.intersection(*[set(g.edges()) for g in graphs])
        cons = nx.DiGraph()
        cons.add_nodes_from(all_nodes)
        cons.add_edges_from(all_edges)
        return cons
    return mem.getOrSet(get_cons_graph,
                        **mem.rc(kwargs, on_fail = 'compute', register = name,
                                 graphs = graphs, name = name))
def term_groups(name = 'bdtnp', nterms = -1, **kwargs):
    '''
    Group a gene list by the controlled-vocabulary terms applied to each
    gene and return the groups sorted by gene count, largest first.

    kwargs:
      nterms: number of top groups to return; defaults to -1 (all)
    '''
    def set_term_groups(**kwargs):
        nterms = kwargs.get('nterms')
        # NOTE: 'name' is read from the enclosing scope here (unlike
        # term_network, which passes it through kwargs); it is still baked
        # into the cache register key below.
        if name == 'bdtnp':
            gene_list = nio.getBDTNP().keys()
        elif name == 'kn':
            gene_list = graphs['kn'].nodes()
        else:
            raise ValueError('unknown gene-list name: {0!r}'.format(name))
        #GET ALL CONTROLLED VOCAB TERMS APPLYING TO A GIVEN GENE LIST
        terms = [(gname, gt) for gname in gene_list for gt in gene_terms(gname)]
        term_groups_tmp = [(k, list(g))
                           for k, g in it.groupby(sorted(terms, key = lambda x: x[1]),
                                                  key = lambda x: x[1])]
        #SORT THE TERM GROUPS BY GENE COUNT AND ONLY TAKE TOP N
        if nterms == -1:
            nterms = len(term_groups_tmp)
        term_groups = sorted(term_groups_tmp, key = lambda x: len(x[1]))[::-1][:nterms]
        return term_groups
    return mem.getOrSet(set_term_groups,
                        **mem.rc(kwargs, on_fail = 'compute',
                                 register = '{0}_{1}'.format(name, nterms),
                                 nterms = nterms))
def get_motifs(**kwargs):
    '''
    Run motif-match over the promoter sequences for the module-level
    promoter_type and collect per-sequence motif hits.

    Returns {seq_name: [{'motif', 'start', 'end', 'strand', 'score'}, ...]}.
    '''
    def set_motifs(**kwargs):
        mfpath = cfg.dataPath('motifs/all_vert_motifs.txt')
        fpath = cfg.dataPath('CRE/{0}_for_motifs.txt'.format(promoter_type))
        cmd = 'motif-match -n 1 -m {0} -V 1'.format(mfpath)
        # (removed an unused 'xargs echo' command string left over here)
        prc = spc.Popen(cmd, shell = True, stdin = spc.PIPE, stdout = spc.PIPE)
        mlines = prc.communicate(input = open(fpath).read())[0].splitlines()
        seqs = {}
        for o in mlines:
            # output columns: motif, seq name, start, end, strand, ?, score
            o = o.split(' ')
            seqs.setdefault(o[1], []).append(
                {'motif': o[0],
                 'start': int(o[2]),
                 'end': int(o[3]),
                 'strand': o[4],
                 'score': float(o[6])})
        return seqs
    return mem.getOrSet(set_motifs, **mem.rc(kwargs, on_fail = 'fail',
                                             register = promoter_type))
def get_results(**kwargs):
    # Cached reshaping of clustering results into three cross-indexed
    # views: by module, by gene, and by transcription factor.
    def set_results(**kwargs):
        # assumes kwargs['tsrt'] is a list of (tissue, ...) tuples -- TODO confirm
        cells = fetch_cluster_results([t[0] for t in kwargs.get('tsrt')])
        # each cell looks like (tissue, [{'module': ..., 'gene': ...}, ...])
        mod_list = list( set([m['module'] for c in cells for m in c[1]] ) )
        mods =dict([(mod, [{'tissue':c[0], 'gene':m['gene']} for c in cells for m in c[1] if m['module'] == mod] ) for mod in mod_list])
        # module keys appear to be iterables of TFs; chain them to get
        # the full TF set -- verify against fetch_cluster_results
        tf_list = set(it.chain(*mods.keys()))
        gene_list = set([elt['gene'] for v in mods.values() for elt in v ])
        tfs = dict([(tf, [{'tissue':elt['tissue'], 'module':k, 'gene':elt['gene']} for k, v in mods.iteritems() if tf in k for elt in v ]) for tf in tf_list])
        genes = dict([(g, [{'tissue':elt['tissue'], 'module':k} for k, v in mods.iteritems() for elt in v if elt ['gene'] == g]) for g in gene_list])
        return mods, genes, tfs
    return mem.getOrSet(set_results, **mem.rc(kwargs, on_fail = 'compute'))
def tiling_peaks(**kwargs):
    '''
    Parse modencode tiling-array peak calls (gzipped GFF3-like files)
    into {filename: [row dict, ...]} with numeric start/end/score.
    '''
    def set_tiling_peaks(**kwargs):
        root = cfg.dataPath('modencode/wormtile/computed-peaks_gff3')
        files = [os.path.join(root, f) for f in os.listdir(root)]
        cols = ['chr', 'meth', 'type', 'start', 'end', 'score',
                'blank', 'blank2', 'annotations']
        out = {}
        for f in files:
            if not f.endswith('gz'):
                continue
            fopen = gzip.open(f)
            try:
                # drop GFF comment lines
                data = [l for l in fopen.readlines() if not l[0] == '#']
            finally:
                fopen.close()   # handle was leaked previously
            out[os.path.basename(f)] = \
                [dict(zip(cols, l.strip().split('\t'))) for l in data]
        # convert coordinate/score columns to numbers in place
        for k, v in out.iteritems():
            for d in v:
                d['start'] = int(d['start'])
                d['end'] = int(d['end'])
                d['score'] = float(d['score'])
        return out
    return mem.getOrSet(set_tiling_peaks,
                        **mem.rc(kwargs, hardcopy = True, name = 'default'))
def get_assay_gprops(**kwargs):
    # Cached per-TF, per-assay gene proximity stats: for each chip peak,
    # find the nearest and second-nearest flanking gene considering both
    # strands, and record a signed distance to the gene's TSS-side edge.
    kwargs['atype'] = kwargs.get('atype', default_atype)
    def set_assay_gprops(**kwargs):
        chips = get_assay_info(**mem.sr(kwargs))
        genes = parse_genes()
        tf_stats = {}
        for k, v in chips.iteritems():
            tf_stats[k] = {}
            for k2,v2 in v.iteritems():
                # progress report: experiments processed so far
                print 'n_exps = {0}'.\
                    format(np.sum([len(v) for v in tf_stats.values()]))
                tf_stats[k][k2] = {}
                cs = [e['chr'] for e in v2]
                # forward-strand genes up-/downstream of each peak;
                # distances measured from peak mean to the gene start
                f_gups=[genes[cs[i]][e['fup_gene']] for i,e in enumerate(v2)]
                f_gdowns=[genes[cs[i]][e['fdown_gene']] for i,e in enumerate(v2)]
                fup_deltas = [e['mean'] - f_gups[i].location.start.position for i,e in enumerate(v2)]
                fdown_deltas=[e['mean'] - f_gdowns[i].location.start.position for i,e in enumerate(v2)]
                # reverse-strand genes: distances measured to the gene end
                r_gups=[genes[cs[i]][e['rup_gene']] for i,e in enumerate(v2)]
                r_gdowns=[genes[cs[i]][e['rdown_gene']] for i,e in enumerate(v2)]
                rup_deltas = [e['mean'] - r_gups[i].location.end.position for i,e in enumerate(v2)]
                rdown_deltas=[e['mean'] - r_gdowns[i].location.end.position for i,e in enumerate(v2)]
                # candidate columns: 0=fdown, 1=fup, 2=rdown, 3=rup
                deltas = array([fdown_deltas, fup_deltas, rdown_deltas, rup_deltas]).T
                # 'closest' is superseded by csrt below and never read
                closest = argmin(np.abs(deltas),1)
                csrt = argsort(np.abs(deltas),1)
                primaries = []
                secondaries = []
                for i,c in enumerate(csrt):
                    # nearest (j==0) and second-nearest (j==1) candidates
                    for j,e in enumerate(c[:2]):
                        arr = primaries if j == 0 else secondaries
                        d = {}
                        if e == 0 : d['gene'] = f_gdowns[i]
                        elif e==1 : d['gene'] = f_gups[i]
                        elif e==2 : d['gene'] = r_gdowns[i]
                        elif e==3 : d['gene'] = r_gups[i]
                        # sign flipped for forward-strand columns,
                        # presumably to orient distances consistently
                        # relative to transcription -- TODO confirm
                        d['dist'] = deltas[i,e] * (-1 if e < 2 else 1)
                        arr.append(d)
                tf_stats[k][k2]['primaries'] = primaries
                tf_stats[k][k2]['secondaries'] = secondaries
        return tf_stats
    return mem.getOrSet(set_assay_gprops, **mem.rc(kwargs, name = kwargs['atype'], hardcopy = True))
def tf_chip_peaks(**kwargs):
    '''
    Parse TF chip peak files under data/wormchip into
    {filename: [row dict, ...]} with numeric start/end/score/qValue.
    '''
    def setTf_Chip_Peaks(**kwargs):
        root = cfg.dataPath('wormchip')
        files = [os.path.join(root, f) for f in os.listdir(root)]
        cols = ['chr', 'meth', 'type', 'start', 'end', 'score',
                'blank', 'blank2', 'qValue']
        out = {}
        for f in files:
            fopen = open(f)
            try:
                # drop comment lines
                data = [l for l in fopen.readlines() if not l[0] == '#']
            finally:
                fopen.close()   # handle was leaked previously
            out[os.path.basename(f)] = \
                [dict(zip(cols, l.strip().split('\t'))) for l in data]
            # (removed an unused per-file 'vlens' length list)
        for k, v in out.iteritems():
            for d in v:
                d['start'] = int(d['start'])
                d['end'] = int(d['end'])
                d['score'] = float(d['score'])
                # qValue column is formatted as 'qValue=NN'
                d['qValue'] = float(d['qValue'].split('=')[1])
        return out
    return mem.getOrSet(setTf_Chip_Peaks, **mem.rc(kwargs, hardcopy = True))
def get_simple_description(**kwargs):
    # Cached flattening of assay gene-proximity data into per-TF parallel
    # lists: gene names, gene records, signed distances, and chip scores.
    kwargs['atype'] = kwargs.get('atype', default_atype)
    def set_simple_description(**kwargs):
        props = get_assay_gprops(**mem.sr(kwargs))
        chips = get_assay_info(**mem.sr(kwargs))
        simple = {}
        for tf, assays in props.iteritems():
            simple[tf] = {}
            # fix the assay key order once so the four lists below stay
            # index-aligned with one another
            assay_keys = assays.keys()
            # extract the gene id from the second db_xref qualifier
            # (skipping a 9-character prefix) -- verify against parse_genes
            idfun = lambda g: g.qualifiers['db_xref'][1][9:]
            simple[tf]['gnames'] = list(it.chain(*[ [idfun(e['gene']) for e in assays[k]['primaries']] for k in assay_keys ]))
            simple[tf]['genes'] = list(it.chain(*[ [e['gene'] for e in assays[k]['primaries']] for k in assay_keys ]))
            simple[tf]['dists'] = array(list(it.chain(*[ [e['dist'] for e in assays[k]['primaries']] for k in assay_keys ])))
            # scores come from the raw chip entries, in the same key order
            simple[tf]['scores'] = array(list(it.chain(*[ [e['score'] for e in chips[tf][k]] for k in assay_keys ])))
        return simple
    return mem.getOrSet(set_simple_description, **mem.rc(kwargs, name = kwargs['atype']))
def get_mean_induction(**kwargs):
    '''
    Cached ratio of the mean of the first random-value column to the mean
    of the second, over all mutants for the module-level promoter_type.
    '''
    def set_mind(**kwargs):
        # Only the random-value matrix is needed; ignore seqs and keys.
        _, rnd_vals, _ = get_mutants()
        numerator = mean(rnd_vals[:, 0])
        denominator = mean(rnd_vals[:, 1])
        return numerator / denominator
    call_kwargs = mem.rc(kwargs, register = promoter_type, on_fail = 'compute')
    return mem.getOrSet(set_mind, **call_kwargs)
def get_easy0(**kwargs):
    """an easy inference using the arbitrary distance cutoff of 3000 bases
    and grabbing the n highest scoring edges globally"""
    def set_easy0(**kwargs):
        atype = kwargs.get("atype")
        simple = wp.get_simple_thr(atype=atype, dthr=1500, dsign=-1, sthr=1e-4)
        # (removed a stray bare `raise Exception()` here -- a debugging
        # leftover that made every call fail before computing anything)
        score_soft_cut = -136
        score_hard_cut = -90
        sids = wu.symbol_ids()
        prop_tuples = []
        for tf, props in simple.iteritems():
            # for now, remove tfs that are not mappable
            if not tf in sids.keys():
                continue
            ssrt = argsort(props["scores"])
            lscores = log10(props["scores"][ssrt])
            easy = nonzero(less(lscores, score_soft_cut))[0]
            medium = nonzero(greater(lscores, score_soft_cut)
                             * less(lscores, score_hard_cut))[0]
            # NOTE(review): easy/medium index into the SORTED score array,
            # but props["genes"][g] indexes the unsorted gene list --
            # confirm whether props["genes"][ssrt[g]] was intended.
            # medium edges get a weight interpolated between the two cuts;
            # easy edges get full weight 1.
            prop_tuples.append(
                [(tf, props["genes"][g],
                  -(score_hard_cut - lscores[g]) / (score_soft_cut - score_hard_cut))
                 for g in medium])
            prop_tuples.append([(tf, props["genes"][g], 1) for g in easy])
        edgelist = array(list(it.chain(*prop_tuples)))
        # map TF symbols to ids and genes to their db_xref gene id
        edges = [(sids[e[0]], e[1].qualifiers["db_xref"][1][9:], e[2])
                 for e in edgelist]
        return edges
    return mem.getOrSet(set_easy0, **mem.rc(kwargs))
def chromosome_offsets(**kwargs):
    # Cached cumulative base offset of each C. elegans chromosome, in the
    # order given by chromosome_names(); lengths are taken from the first
    # feature of each chromosome's genbank record.
    def set_chromosome_offsets(**kwargs):
        lens =[]
        names = chromosome_names()
        for name in names:
            root = cfg.dataPath('/data/genomes/Caenorhabditis_elegans')
            fdir = os.path.join(root,name)
            # Find a genbank (.gb) file anywhere under the chromosome dir.
            # NOTE(review): 'break' exits only the inner files loop, and
            # 'fopen' is left unbound if no '.gb' file exists -- confirm
            # the directory layout guarantees exactly one match.
            for r, d, files in os.walk(fdir):
                for f in files:
                    if '.gb' in f:
                        fopen = open(os.path.join(r,f))
                        break
            gb = list(sio.parse(fopen, 'genbank'))[0]
            fopen.close()
            lens.append( gb.features[0].location.end.position)
        # Offset of chromosome i = sum of lengths of chromosomes before it.
        offsets = {}
        cur_ofs = 0
        for i, l in enumerate(lens):
            offsets[names[i]] = cur_ofs
            cur_ofs += l
        return offsets
    return mem.getOrSet(set_chromosome_offsets, **mem.rc(kwargs, hardcopy = True))
def getTrackChrGenes(**kwargs):
    '''
    Get all of the genes from a bed file on a given chromosome.

    kwargs:
      num:   chromosome number
      fname: bedfile path

    returns a list of attributes for every gene.
    '''
    def setTrackChrGenes(**kwargs):
        fname = kwargs.get('fname', mousefile)
        num = kwargs.get('num', 1)
        t = track.load(fname)
        chromosome_data = t.read('chr{0}'.format(num))
        rows = [dict(zip(r.keys(), r.data)) for r in iter(chromosome_data)]
        return rows
    # 'on_fail' matches the spelling every other mem.rc call in this file
    # uses (was 'onfail', which mem.rc would not recognize).
    return mem.getOrSet(setTrackChrGenes,
                        **mem.rc(kwargs, on_fail = 'compute',
                                 name = '{0}_{1}'.format(
                                     kwargs.get('fname', os.path.basename(mousefile)),
                                     kwargs.get('num', 1))))
def plotPeaks(num = 1):
    # Histogram peak-to-promoter midpoint distances (within +/- 5kb) for
    # chromosome `num`, then plot the binned counts.
    import cb.utils.plots as myplots
    def setHist(**kwargs):
        peaks = getPeaks()['chr{0}'.format(num)]
        proms = getTrackChrPromoters(num = num)
        all_hits = zeros(20)
        for k,v in proms.iteritems():
            # midpoint of the promoter interval [start, end]
            mid =(v[0] + v[1]) / 2
            deltas = []
            for p in peaks:
                pmid = (p['start'] + p['end'])/2
                if abs(pmid - mid) < 5000:
                    deltas.append(pmid - mid)
            # 20 bins spanning +/- 5kb around the promoter midpoint
            hits, bin_offsets = histogram(deltas, 20, [-5000,5000])
            all_hits += hits;
        return bin_offsets, all_hits;
    bin_offsets, hits = mem.getOrSet(setHist, num = num)
    f = myplots.fignum(1)
    ax = f.add_subplot(111)
    ax.set_xlabel('distance from promoter')
    #ax.set_xticks(bin_offsets)
    #ax.set_xticklabels(['{0}'.format(e) for e in bin_offsets])
    ax.set_ylabel('counts')
    # bin_offsets holds bin edges (one more element than hits)
    ax.plot(bin_offsets[:-1],hits)
def getTrackChrPromoters(**kwargs):
    '''
    Get all of the forward promoters from a bed file on a given chromosome.

    kwargs:
      num:   chromosome number
      fname: bedfile path

    returns a dict mapping each forward-strand gene name to its promoter
    coordinates, taken as [start - 2000, start - 100].
    '''
    def setTrackChrPromoters(**kwargs):
        fname = kwargs.get('fname', mousefile)
        num = kwargs.get('num', 1)
        t = track.load(fname)
        chromosome_data = t.read('chr{0}'.format(num))
        rows = [dict(zip(r.keys(), r.data)) for r in iter(chromosome_data)]
        fwd_genes = [e for e in rows if e['strand'] == 1]
        fwd_starts = dict([(e['name'], e['start']) for e in fwd_genes])
        fwd_promoters = dict([(k, [v - 2000, v - 100])
                              for k, v in fwd_starts.iteritems()])
        return fwd_promoters
    # 'on_fail' matches the spelling used elsewhere in this file
    # (was 'onfail'); note this call passes options directly rather than
    # through mem.rc, like fetch_alinames.
    return mem.getOrSet(setTrackChrPromoters, on_fail = 'compute',
                        name = '{0}_{1}'.format(
                            kwargs.get('fname', os.path.basename(mousefile)),
                            kwargs.get('num', 1)))
def get_tss(**kwargs):
    # Cached per-chromosome transcription start sites: forward-strand
    # genes keyed by start position and reverse-strand genes by end
    # position, each sorted by coordinate with a parallel gene-index list.
    def load_tss(**kwargs):
        cnames = chromosome_names()
        genes = parse_genes()
        out = {}
        for name in cnames:
            crgenes= genes[name]
            gstrands =array([g.strand for g in crgenes])
            # indices of forward (+) and reverse (-) strand genes
            fwd = nonzero(greater(gstrands, 0))[0]
            rev = nonzero(less(gstrands,0))[0]
            gstarts = array([g.location.start.position for g in crgenes])
            gends =array( [g.location.end.position for g in crgenes])
            fstarts = gstarts[fwd]
            rends = gends[rev]
            # sort (gene index, coordinate) pairs by coordinate
            fstart_sorted = sorted([(fwd[i], s) for i, s in enumerate(fstarts)], key = lambda x: x[1])
            fend_sorted = sorted([(rev[i], r) for i, r in enumerate(rends)], key = lambda x: x[1])
            out[name] = {'fwd_genes': [e[0] for e in fstart_sorted],
                         'fwd_tss':[e[1] for e in fstart_sorted],
                         'rev_genes': [e[0] for e in fend_sorted],
                         'rev_tss':[e[1] for e in fend_sorted]}
            #note that gstarts begin sorted
            #gends on the other hand... do not.
        return out
    return mem.getOrSet(load_tss, **mem.rc(kwargs,hardcopy =True))
def site_mut_inds(**kwargs):
    '''
    Cached per-position mutant index sets: one set of triple-mutation
    indices for every position of the consensus sequence.
    '''
    def set_site_muts(**kwargs):
        n_positions = len(get_cons())
        # One set of mutant indices per consensus position.
        return [set(get_trip_muts(pos)) for pos in range(n_positions)]
    return mem.getOrSet(set_site_muts,
                        **mem.rc(kwargs, register = promoter_type,
                                 on_fail = 'compute'))
def get_num_seqs(**kwargs):
    '''
    Cached numeric encoding of the mutant sequences: each letter is mapped
    through nt_ids() and the result is returned as a 2-D array.
    '''
    def set_num_seqs(**kwargs):
        letter_codes = nt_ids()
        seqs, _, _ = get_mutants()
        encoded = []
        for seq in seqs:
            encoded.append([letter_codes[letter] for letter in seq])
        return array(encoded)
    return mem.getOrSet(set_num_seqs,
                        **mem.rc(kwargs, register = promoter_type,
                                 on_fail = 'compute'))
def get_synapse_dict(**kwargs):
    '''
    Cached synapse rows bucketed by their first column (the presynaptic
    neuron name): {neuron: [row, ...]}.
    '''
    def set_synapse_dict(**kwargs):
        rows = get_rows()
        by_source = {}
        # Accumulating over the sorted rows reproduces the grouped layout
        # exactly: keys in first-column order, rows in sorted order.
        for row in sorted(rows, key = lambda r: r[0]):
            by_source.setdefault(row[0], []).append(row)
        return by_source
    return mem.getOrSet(set_synapse_dict, **mem.rc(kwargs))
def get_map_rows(**kwargs):
    '''
    Parse wormbase/loci_all.txt (CSV; first line is the header) into a
    list of {column: value} dicts.
    '''
    def set_map_rows(**kwargs):
        mapfile = cfg.dataPath("wormbase/loci_all.txt")
        fopen = open(mapfile)
        try:
            lines = fopen.readlines()
        finally:
            fopen.close()   # handle was left open previously
        cols = [e.strip() for e in lines[0].strip().split(",")]
        # NOTE(review): lines[1:-1] drops the final line -- presumably a
        # trailing blank/footer; confirm the last data row is not lost.
        rows = [dict(zip(cols, [e.strip() for e in l.strip().split(",")]))
                for l in lines[1:-1]]
        return rows
    return mem.getOrSet(set_map_rows, **mem.rc(kwargs))
def get_cons(**kwargs):
    # Cached consensus sequence of the mutant library for the module-level
    # promoter_type.
    def consensus_seq(seqs):
        # Per-column majority vote: group identical letters in the column,
        # take the letter of the largest group.
        return [ sorted([(k,list(g)) for k, g in it.groupby(sorted(c)) ], key = lambda x: len(x[1]))[-1][0] for c in seqs.T]
    def set_cons(**kwargs):
        seqs, seqs_rndvals, keys = get_mutants(**mem.sr(kwargs))
        # Subsample every 100th sequence to keep the vote cheap.
        cons = consensus_seq(seqs[::100])
        return cons
    cons = mem.getOrSet(set_cons, **mem.rc(kwargs, register = promoter_type, on_fail = 'compute'))
    return cons
def fetch_alinames():
    '''
    Cached list of sequence record ids from the first alignment in the
    pvt1 fasta file.
    '''
    def setFetchAliNames(**kwargs):
        alignment_path = cfg.dataPath('pvt1/pvt1.fa')
        alignments = aio.parse(open(alignment_path), 'fasta')
        first_alignment = next(alignments)
        return [record.id for record in first_alignment]
    return mem.getOrSet(setFetchAliNames, on_fail = 'compute')
def get_rows(**kwargs):
    '''
    Read the 2006 C. elegans NeuronConnect spreadsheet and return its data
    rows (header row skipped) as lists of cell values.
    '''
    def set_rows(**kwargs):
        root = cfg.dataPath('wormbrain/2006')
        connect_file = os.path.join(root, 'NeuronConnect.xls')
        # (removed an unused path to NeuronFixedPoints.xls computed here)
        cwb = xlrd.open_workbook(connect_file)
        sh = cwb.sheets()[0]
        # row 0 is the header row
        rows = [[e.value for e in sh.row(i)] for i in range(1, sh.nrows)]
        return rows
    return mem.getOrSet(set_rows, **mem.rc(kwargs))
def get_graph(**kwargs):
    '''
    Cached directed graph of neuron connections whose connection type
    (column 2) is in the allowed edge set; edge weight is column 3.
    '''
    def set_graph(**kwargs):
        edge_set = get_edge_set()
        rows = get_rows(**mem.sr(kwargs))
        # Bucket allowed connections by presynaptic neuron (column 0).
        grouped = {}
        for row in sorted(rows, key = lambda r: r[0]):
            if row[2] in edge_set:
                grouped.setdefault(row[0], []).append(row)
        g = nx.DiGraph()
        for source, cxns in grouped.iteritems():
            weighted = [(c[0], c[1], c[3]) for c in cxns]
            g.add_weighted_edges_from(weighted)
        return g
    return mem.getOrSet(set_graph, **mem.rc(kwargs))
def fetch_num_ali():
    '''
    Cached numeric form of the first pvt1 alignment: every base of every
    record is mapped through nt_dict and packed into a byte array.
    '''
    def setFetchNumAli(**kwargs):
        alignment_path = cfg.dataPath('pvt1/pvt1.fa')
        alignments = aio.parse(open(alignment_path), 'fasta')
        first_alignment = next(alignments)
        numeric_rows = [[nt_dict[base] for base in record.seq.upper()]
                        for record in first_alignment]
        return array(numeric_rows, byte)
    return mem.getOrSet(setFetchNumAli, on_fail = 'compute')
def get_motif_dicts(pad = 2, **kwargs):
    '''
    Bucket mutant sequences by which (padded) motif masks they disturb
    relative to the consensus.

    pad: padding passed to cre_masks (default 2).

    Returns {tuple_of_mask_indices: [sequence indices]}.
    '''
    def set_motif_dicts(**kwargs):
        masks = cre_masks(kwargs.get('pad'))
        out = {}
        cons = [nt_ids()[let] for let in get_cons()]
        for j, seq in enumerate(get_num_seqs()):
            # Indices of every mask whose region differs from consensus.
            key = tuple([i for i, mask in enumerate(masks)
                         if sum(not_equal(seq, cons) * mask) != 0])
            # 'not in' replaces the deprecated dict.has_key call.
            if key not in out:
                out[key] = []
            out[key].append(j)
        return out
    return mem.getOrSet(set_motif_dicts,
                        **mem.rc(kwargs, pad = pad,
                                 register = '{0}_{1}'.format(promoter_type, pad),
                                 on_fail = 'compute'))
def last_5(**kwargs):
    # Cached twitter search results for each hashtag over a range of days.
    # Day numbers 1..39 are mapped onto (month, day) pairs: month 9 holds
    # days 2..30, then the count rolls into month 10 -- TODO confirm the
    # intended calendar arithmetic.
    dnums = range(1, 40)
    delts = [(int(9 + floor(d / 30)), 1 + int(d % 30)) for d in dnums]
    days = ["2011-{0}-{1}".format(*delt) for delt in delts]
    def set_l5(**kwargs):
        days = kwargs.get("days")
        all_results = {}
        for h in hashtags:
            all_results[h] = []
            # one 'since:' query per (hashtag, day), 100 results per page
            search = ["#{0} since:{1}".format(h, d) for d in days]
            for s in search:
                all_results[h].append(tweepy.api.search(s, rpp=100))
        return all_results
    # cache name is the (truncated) day list
    return mem.getOrSet(set_l5, **mem.rc(kwargs, days=days, name=",".join(days)[:20]))
def get_array_imaps(**kwargs):
    '''
    Cached index maps for array-based views of the synapse data: the list
    of connection types and neuron names, plus name -> index dicts.
    '''
    def set_array_imaps(**kwargs):
        sdict = get_synapse_dict(**mem.sr(kwargs))
        # Collect every neuron name: sources (keys) and targets (col 1).
        names = set()
        for source, rows in sdict.iteritems():
            names.add(source)
            for row in rows:
                names.add(row[1])
        nnames = list(names)
        ctypes = [u'Rp', u'EJ', u'Sp', u'S', u'R', u'NMJ']
        ctypes_imap = dict((c, i) for i, c in enumerate(ctypes))
        nnames_imap = dict((n, i) for i, n in enumerate(nnames))
        return {'ctypes': ctypes,
                'ctypes_imap': ctypes_imap,
                'nnames': nnames,
                'nnames_imap': nnames_imap}
    return mem.getOrSet(set_array_imaps, **mem.rc(kwargs))
def get_simple_thr(**kwargs):
    '''
    Filter the flattened assay description by distance/score thresholds.

    kwargs:
      dthr:  maximum absolute distance (None disables the filter)
      sthr:  maximum score (None disables)
      dsign: required sign of dist (None disables)
      atype: assay type (defaults to default_atype)
    '''
    kwargs['dthr'] = kwargs.get('dthr', None)
    kwargs['sthr'] = kwargs.get('sthr', None)
    kwargs['dsign'] = kwargs.get('dsign', None)
    kwargs['atype'] = kwargs.get('atype', default_atype)
    def set_simple_thr(**kwargs):
        dthr = kwargs['dthr']
        dsign = kwargs['dsign']
        sthr = kwargs['sthr']
        simple = get_simple_description(**mem.sr(kwargs))
        out = {}
        for k, v in simple.iteritems():
            # multiplicative AND of every enabled criterion
            criteria = ones(len(v['scores']))
            if sthr is not None:
                criteria *= less(v['scores'], sthr)
            if dsign is not None:
                criteria *= greater(v['dists'] * dsign, 0)
            if dthr is not None:
                criteria *= less(abs(v['dists']), dthr)
            allowed = nonzero(criteria)[0]
            out[k] = {'genes': [v['genes'][i] for i in allowed],
                      'gnames': [v['gnames'][i] for i in allowed],
                      'dists': v['dists'][allowed],
                      'scores': v['scores'][allowed]}
        return out
    # cache name encodes atype plus the sorted threshold settings
    tkwargs = dict([(k, kwargs[k]) for k in ['dthr', 'sthr', 'dsign']])
    name = '{0}:'.format(kwargs['atype']) + \
        '_'.join(it.chain(*sorted([(str(k), str(v))
                                   for k, v in tkwargs.iteritems()],
                                  key = lambda x: x[0])))
    return mem.getOrSet(set_simple_thr, **mem.rc(kwargs, name = name))
def get_flows(**kwargs):
    # Cached wrapper around the min-cost flow computation.
    # NOTE(review): kwargs are forwarded directly, without the mem.rc
    # wrapping used elsewhere in this file -- confirm that is intentional.
    def set_flows(**kwargs):
        return mincost.flow_all()
    return mem.getOrSet(set_flows,**kwargs)