def init(**kwargs):
    '''
    Fetch the 16s tree of life through the ``mem`` cache.

    When the tree has to be built, ``sequences/16s.newick`` is parsed,
    every terminal and nonterminal node receives an empty metadata dict
    ``node.m`` and ``db_metadata`` is then run on the whole tree.

    Per the original notes, metadata (node.m) for terminal nodes includes
    (presumably filled in by db_metadata -- TODO confirm):
      taxnode -- ncbi taxon of the node
      gbacc   -- genbank accession number of the 16s for the node
      gbid    -- genbank id of the 16s for the node

    inputs:
      kwargs are forwarded through mem.rc; 'name' defaults to
      'default_tree'.  The original notes list ``reset [False]``.

    output:
      the value mem.getOrSet yields for setTree (the tree built below).
      NOTE(review): older notes advertised ``tree, halo = init()`` but
      only one object is produced by this code.
    '''
    print('testing...')

    def setTree(**kwargs):
        tree = Phylo.read(config.dataPath('sequences/16s.newick'), "newick")
        every_node = it.chain(tree.get_terminals(), tree.get_nonterminals())
        for node in every_node:
            node.m = {}
        db_metadata(tree)
        print("SETTING TREE!!!")
        return tree

    cache_args = mem.rc(kwargs,
                        name=kwargs.get('name', 'default_tree'),
                        on_fail='compute',
                        register='init')
    return mem.getOrSet(setTree, **cache_args)
def datafiles(**kwargs):
    '''
    Load the numeric table behind every entry of id_map().

    For each (key, info) pair returned by id_map, the file info['file'] is
    read and every line whose first character is a digit is split on
    whitespace and converted to floats.  The rows are stacked into an
    array.

    output:
      {key: array of floats, one row per kept line}, obtained through
      mem.getOrSet with on_fail = 'compute'.
    '''
    def set_datafiles(**kwargs):
        # Compile once rather than once per line.
        whitespace = re.compile(r'\s+')
        out = {}
        idmap = id_map(**mem.sr(kwargs))
        for k, v in idmap.items():
            # 'with' closes the handle; the original left every file open.
            with open(v['file']) as handle:
                out[k] = array([[float(e) for e in whitespace.split(l.strip())]
                                for l in handle
                                if l[0] in '0123456789'])
        return out

    return mem.getOrSet(set_datafiles, **mem.rc(kwargs, on_fail='compute'))
def getBNet(**kwargs):
    '''Get the saved network from the knowledge based network, redFly.

    The edge list is read from
    ``network/network_predmodel/inputnetworks/bRN.txt``; each regex match
    yields a (tf, target) pair.  An edge is kept only when the tf is a key
    of getTC() and the target is a key of getCL().  Every kept edge gets
    weight 1.0.

    output: tuple of dicts keyed by gene/tf names
      trgs: {gname:  {'color': array([n, 0, 0]), 'tfs': ['tfname', ...],
                      'weights': [1.0, ...]}, ...}
      tfs : {tfname: {'targets': ['gname', ...], 'weights': [1.0, ...]}, ...}
    '''
    def setBNet(**kwargs):
        fpath = config.dataPath('network/network_predmodel/inputnetworks/bRN.txt')
        TC = getTC(reset=mod(kwargs.get('reset', 0), 2))
        CL = getCL(reset=mod(kwargs.get('reset', 0), 2))
        # Read and close the file; the original leaked the handle.
        with open(fpath) as netfile:
            nwdata = netfile.read()

        # Accessors for the [tf, target, weight] triples built below.
        trgfun = lambda x: x[1]
        wtfun = lambda x: float(x[2])
        tffun = lambda x: x[0]

        r = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)', re.M)
        matches = list(re.finditer(r, nwdata))

        # Unsorted lists of tfs and targets
        targets = [m.group('target') for m in matches]
        tfs = [m.group('tf') for m in matches]
        weights = [1.0] * len(tfs)

        # Concat the data, ordered by tf name, for easier grouping
        cat = []
        for i in np.argsort(tfs):
            if tfs[i] in TC and targets[i] in CL:
                cat.append([tfs[i], targets[i], weights[i]])

        # Extract a dictionary with information for each target.
        trg_d = {}
        count = 0.0
        for k, g in it.groupby(sorted(cat, key=trgfun), key=trgfun):
            l = list(g)
            count += 1.0
            trg_d[k] = {'color': np.array([count, 0, 0]),
                        'tfs': [tffun(e) for e in l],
                        'weights': [wtfun(e) for e in l]}

        # Extract a dictionary with information for each TF
        # (cat is already ordered by tf, so groupby sees contiguous runs).
        tf_d = {}
        for k, g in it.groupby(cat, key=tffun):
            l = list(g)
            tf_d[k] = {'targets': [trgfun(e) for e in l],
                       'weights': [wtfun(e) for e in l]}
        return (trg_d, tf_d)

    return mem.getOrSet(setBNet, **mem.rc({}, on_fail='compute', **kwargs))
def getBDTNP(protein = False,misc = False, **kwargs):
    '''
    Return BDTNP columns keyed by flybase id (or the misc columns).

    The columns come from bdtnp.parser.read().  Every gene and misc column
    dict is given a 'vals' entry: a (len(rows), len(times)) array whose
    i-th column holds that column's data for the i-th time step, or zeros
    when the column has no data for that step.

    inputs:
      protein [False]  return protein columns instead of mRNA columns
      misc    [False]  return the misc columns dict as is (takes
                       precedence over protein)

    output:
      misc:     misc_cols, keyed as returned by the parser
      protein:  {fbid: column} for protein columns whose name minus its
                last character is a symbol in flybase/gene_map.tsv
      default:  {fbid: column} for mRNA columns whose name is a symbol in
                the map ('CG31670' is looked up as 'erm')
    '''
    def setBDTNP(protein=False, misc=False, **kwargs):
        gene_cols, misc_cols, rows, row_nns = bdtnp.parser.read()

        # Read the symbol -> flybase id map, closing the file afterwards
        # (the original never closed it).
        map_rows = []
        with open(config.dataPath('flybase/gene_map.tsv')) as mapfile:
            for l in mapfile:
                l = l.replace('\n', '')
                if l != '' and l[0] != '#':
                    map_rows.append(l.split('\t'))
        syms = [x[0] for x in map_rows]
        fbids = [x[1] for x in map_rows]

        # NOTE(review): times is a set, so the column order of 'vals' is
        # the set's iteration order, not necessarily sorted -- confirm
        # that downstream code does not assume chronological order.
        times = set(it.chain(*[x['steps'] for x in gene_cols.values()]))
        for g in list(gene_cols.values()) + list(misc_cols.values()):
            gene_rows = zeros((len(rows), len(times)))
            for i, t in enumerate(times):
                if t in g['steps']:
                    row = rows[:, g['idxs'][g['steps'].index(t)]]
                else:
                    row = zeros(len(rows))
                gene_rows[:, i] = row
            g['vals'] = gene_rows

        protein_cols = dict([(k, val) for k, val in gene_cols.items()
                             if val['info']['type'] == 'protein'])
        mrna_cols = dict([(k, val) for k, val in gene_cols.items()
                          if val['info']['type'] == 'mRNA'])

        #things that are wonky include:
        # 1) Protein data (where column names do not match flybase symbols)
        # 2) Weird elements such as Traf1 that are not present in the network anyway
        # 3) FBgn0031375 / CG31670 which is apparently absent from the map and I fix.
        mrna_idxs = [syms.index(k) if k in syms
                     else syms.index('erm') if k == 'CG31670'
                     else -1
                     for k in mrna_cols.keys()]
        mrna_fbids = [fbids[idx] if idx != -1 else '' for idx in mrna_idxs]

        # Protein column names carry one trailing character that is
        # dropped before the symbol lookup.
        protein_idxs = [syms.index(k[:-1]) if k[:-1] in syms else -1
                        for k in protein_cols.keys()]
        protein_fbids = [fbids[idx] if idx != -1 else '' for idx in protein_idxs]

        if misc:
            return misc_cols
        if protein:
            protein_vals = list(protein_cols.values())
            return dict([(protein_fbids[i], protein_vals[i])
                         for i, elt in enumerate(protein_idxs) if elt != -1])
        else:
            mrna_vals = list(mrna_cols.values())
            return dict([(mrna_fbids[i], mrna_vals[i])
                         for i, elt in enumerate(mrna_idxs) if elt != -1])

    # The register follows the same precedence as the body (misc first),
    # so a call with both flags set is not stored under 'protein' while
    # actually holding the misc columns.
    return mem.getOrSet(setBDTNP,
                        **mem.rc(kwargs,
                                 register='misc' if misc else
                                          'protein' if protein else 'mrna',
                                 protein=protein,
                                 misc=misc,
                                 on_fail='compute'))
def getBTOL(**kwargs):
    '''
    Return a BTOL object through the ``mem`` cache (register 'BTOL').

    When the object is built, its tree is initialized and saved if
    treeInitialized() reports that it is not set up yet.
    '''
    def setBTOL(**kwargs):
        btol = BTOL(**mem.sr(kwargs))
        if B_ready(btol):
            return btol
        print('Underlying tree structure apparently uninitialized: initializing\n...')
        btol.initTree()
        print('...\nDone\nSaving\n...')
        btol.saveTree()
        print('...\nDone')
        return btol

    def B_ready(btol):
        # Thin wrapper so the guard clause above reads naturally.
        return btol.treeInitialized()

    cache_args = mem.rc(kwargs, register='BTOL')
    return mem.getOrSet(setBTOL, **cache_args)
def get_seqs(dbname, **kwargs):
    '''
    Return every Sequence row of the database named ``dbname``.

    The query runs through mem.getOrSet with hardcopy = False,
    name = dbname and on_fail = 'compute'; dbname is also passed along as
    a keyword so the inner function can read it from its kwargs.
    '''
    def set_seqs(**kwargs):
        dbi = compbio.projects.cbdb.getName(kwargs['dbname'])
        return dbi.S.q(dbi.Sequence).all()

    return mem.getOrSet(set_seqs,
                        **mem.rc(kwargs,
                                 hardcopy=False,
                                 name=dbname,
                                 on_fail='compute',
                                 dbname=dbname))
def leafNodes(self,**kwargs):
    '''
    Return one taxonomy node (or None) per terminal of self.t.

    A terminal contributes ncbi.get_node(taxid, dbi) when its metadata
    dict ``m`` has a 'taxid' entry and None otherwise; dbi is the
    'taxdmp' database handle.  The list is obtained through mem.getOrSet
    (register 'leaf_nodes', hardcopy = False, on_fail = 'compute').
    '''
    def setLeafNodes(**kwargs):
        leaves = self.t.get_terminals()
        dbi = cbdb.getName('taxdmp')
        found = []
        for leaf in leaves:
            if 'taxid' in leaf.m:
                found.append(ncbi.get_node(leaf.m['taxid'], dbi))
            else:
                found.append(None)
        return found

    cache_args = mem.rc(kwargs,
                        hardcopy=False,
                        on_fail='compute',
                        register='leaf_nodes')
    return mem.getOrSet(setLeafNodes, **cache_args)
def recall_c2(**kwargs):
    '''
    A kludgy wrapper that stores the clustering results for later without
    touching c2 itself.

    c2 is called twice: first without a launcher, and then again with the
    value the first call returned as its first argument.  The second
    result is what gets stored under the name 'default_c2_settings'.
    '''
    def setC2(**kwargs):
        first_pass = c2(**mem.sr(kwargs))
        return c2(first_pass, **mem.sr(kwargs))

    cache_args = mem.rc(kwargs,
                        name='default_c2_settings',
                        on_fail='compute')
    return mem.getOrSet(setC2, **cache_args)
def get_taxnodes(dbname, **kwargs):
    '''
    Return one taxonomy node (or None) per sequence of database ``dbname``.

    Sequences come from get_seqs(dbname).  A sequence with a falsy
    source_taxon yields None; any other yields ncbi.get_node(source_taxon).
    The list is obtained through mem.getOrSet (register = dbname,
    hardcopy = False, on_fail = 'compute').
    '''
    def set_taxnodes(**kwargs):
        all_seqs = get_seqs(dbname, **mem.sr(kwargs))

        # First pass: collect the taxon of every sequence (None if unset).
        seq_taxa = []
        for seq in all_seqs:
            seq_taxa.append(seq.source_taxon or None)

        # Second pass: resolve each taxon to its node.
        alinodes = []
        for taxon in seq_taxa:
            if taxon is not None:
                alinodes.append(ncbi.get_node(taxon))
            else:
                alinodes.append(None)
        return alinodes

    cache_args = mem.rc(kwargs,
                        on_fail='compute',
                        hardcopy=False,
                        register=dbname)
    return mem.getOrSet(set_taxnodes, **cache_args)
def get_taxon_forsome(nodes,rank,set_name = 'default_setname',**kwargs):
    '''
    Return the taxon of the given rank for every node in ``nodes``.

    inputs:
      nodes     iterable of nodes; falsy entries map to None
      rank      rank handed to ncbi.get_taxon (concatenated with set_name
                to form the register, so it must be a string)
      set_name  ['default_setname'] prefix of the register

    output:
      list with ncbi.get_taxon(node, rank = rank) or None per node,
      obtained through mem.getOrSet (hardcopy = False, on_fail = 'compute').
    '''
    def set_taxon_forsome(nodes=None, rank=None, **kwargs):
        # Identity tests: '!= None' would compare elementwise if nodes
        # were ever an array.
        assert nodes is not None and rank is not None
        taxon = [ncbi.get_taxon(node, rank=rank) if node else None
                 for node in nodes]
        return taxon

    return mem.getOrSet(set_taxon_forsome,
                        **mem.rc(kwargs,
                                 nodes=nodes,
                                 rank=rank,
                                 on_fail='compute',
                                 hardcopy=False,
                                 register=set_name + rank))
def get_taxon_forall(aliname, rank = None, **kwargs):
    '''
    Return the taxon of the given rank for every node of ``aliname``.

    inputs:
      aliname  name passed to get_taxnodes; also the register prefix
      rank     rank handed to ncbi.get_taxon.  Required despite the None
               default (it is concatenated onto aliname for the register).

    output:
      list with ncbi.get_taxon(node, rank = rank) or None per node,
      obtained through mem.getOrSet (hardcopy = False, on_fail = 'compute').

    raises:
      TypeError when rank is None.
    '''
    # The register below is aliname + rank, so a missing rank used to die
    # with an opaque str/None concatenation TypeError before the inner
    # assert could run.  Same exception type, clearer message.
    if rank is None:
        raise TypeError('get_taxon_forall requires a rank')

    def setTaxon(aliname=None, rank=None, **kwargs):
        assert aliname is not None and rank is not None
        nodes = get_taxnodes(aliname, **mem.sr(kwargs))
        taxon = [ncbi.get_taxon(node, rank=rank) if node else None
                 for node in nodes]
        return taxon

    return mem.getOrSet(setTaxon,
                        **mem.rc(kwargs,
                                 aliname=aliname,
                                 rank=rank,
                                 on_fail='compute',
                                 hardcopy=False,
                                 register=aliname + rank))
def get_reinitz_data(**kwargs):
    '''
    Slice coordinates and values out of the datafiles() tables.

    inputs (all read from kwargs):
      ofs         [0]      which block of nuc_count rows to take
      plot_coords [False]  save a scatter of the two coordinate columns
      plot_vals   [False]  save a scatter of first coordinate vs value

    output:
      (coords, values), both dicts keyed like datafiles():
        coords[k] -- columns 1:3 of the selected rows
        values[k] -- last column of the selected rows
    '''
    ofs = kwargs.get('ofs', 0)
    do_plot_coords = kwargs.get('plot_coords', False)
    do_plot_vals = kwargs.get('plot_vals', False)

    idm = id_map()
    df = datafiles(**mem.rc(kwargs))

    #I'm not sure exactly how this dataset works but
    #each nuclei has a bunch of numbers that appear to be
    #monotonically increasing.
    #
    #I just take the first instance.
    nums = dict((k, v[:, 0]) for k, v in df.items())
    # The nucleus count is the number of distinct first-column entries of
    # whichever table comes third in the dict.
    nuc_count = len(set(list(nums.values())[2]))
    first_row = nuc_count * ofs
    last_row = nuc_count * (ofs + 1)
    values = dict((k, v[first_row:last_row, -1]) for k, v in df.items())
    coords = dict((k, v[first_row:last_row, 1:3]) for k, v in df.items())

    def scatter_all(y_of, fig_name):
        # One semi-transparent scatter per key, all on the same axes.
        f = myplots.fignum(1, (8, 8))
        ax = f.add_subplot(111)
        ct = mycolors.getct(len(values))
        for i, k in enumerate(values.keys()):
            ax.scatter(coords[k][:, 0][::1],
                       y_of(k),
                       10,
                       edgecolor='none',
                       alpha=.25,
                       c=ct[i],
                       label=k)
        f.savefig(myplots.figpath(fig_name))

    #to check the basic consistency of the data, enable the plot routines.
    #I suppose that I could do this for all of the nuclei occurences...
    #right now, only the first is used.
    if do_plot_coords:
        scatter_all(lambda k: coords[k][:, 1][::1],
                    'reinitz_exprdata_coords_nuc_offset={0}'.format(ofs))
    if do_plot_vals:
        scatter_all(lambda k: values[k][::1],
                    'reinitz_exprdata_ap_vals_nuc_offset={0}'.format(ofs))

    return coords, values
def id_map(**kwargs):
    '''
    Map an id from flybase/gene_association.fb to each reinitz data file.

    Every file in the byGenes directory names a gene (file name without
    extension, lowercased).  Gene names are reduced to their a-z
    characters and compared with the equally reduced 10th tab-separated
    field of each non-comment association line.  Among the matching
    lines, the second field is collected and the alphabetically first
    distinct value is used as the id.

    output:
      {id: {'file': path of the data file, 'name': lowercased gene name}},
      obtained through mem.getOrSet with on_fail = 'compute'.

    raises:
      IndexError when a data file's gene has no matching association line.
    '''
    def set_id_map(**kwargs):
        fname = cfg.dataPath('reinitz/28-7-2011-1-56-6-30-0/txt/byGenes')
        non_alpha = re.compile('[^a-z]')

        # List the directory once so names and paths are sure to line up.
        listing = os.listdir(fname)
        gname_orig = [os.path.splitext(f)[0].lower() for f in listing]
        gfiles = dict([(gname_orig[i], os.path.join(fname, f))
                       for i, f in enumerate(listing)])
        gname_map = dict([(non_alpha.sub('', g), g) for g in gname_orig])
        glines = dict([(k, []) for k in gname_map.keys()])

        # Only the association file is read; it is closed when done.
        lines_kept = {}
        with open(cfg.dataPath('flybase/gene_association.fb')) as gassoc:
            for i, line in enumerate(gassoc):
                if line[0] == '!':
                    continue
                g = non_alpha.sub('', line.lower().split('\t')[9].strip())
                # Direct lookup instead of scanning every gene per line.
                if g in glines:
                    glines[g].append((i, g))
                    lines_kept[i] = line

        ids = {}
        for k, v in glines.items():
            these_ids = [lines_kept[i].split('\t')[1].strip() for i, _ in v]
            if not these_ids:
                raise IndexError(
                    'no gene_association entry matches {0!r}'.format(k))
            #just hacking here... for sloppy paired I use the first id...
            #alas...
            ids[k] = sorted(set(these_ids))[0]

        return dict([(idval, {'file': gfiles[gname_map[k]],
                              'name': gname_map[k]})
                     for k, idval in ids.items()])

    return mem.getOrSet(set_id_map, **mem.rc(kwargs, on_fail='compute'))
def getSush(**kwargs):
    '''Get sushmita's regression weights and biases.

    Files in ``network/network_predmodel/regressionwts/fRN`` whose name
    contains 'bias' supply "<gene> <level>" lines; files whose name
    contains 'nw' supply "<gene> <tf> <level>" lines.  Lines that do not
    match the expected pattern (e.g. blank lines) are skipped.

    output:
      {gname: {'bias': level, 'tfs': [tfname, ...], 'weights': [level, ...]}}
      Levels are kept as the strings found in the files.  'tfs' and
      'weights' are present only for genes that appear in an 'nw' file.

    raises:
      KeyError when an 'nw' line names a gene with no bias entry.
    '''
    def setSush(**kwargs):
        path = config.dataPath('network/network_predmodel/regressionwts/fRN')
        listing = os.listdir(path)
        bias_files = [os.path.join(path, f) for f in listing if 'bias' in f]
        nw_files = [os.path.join(path, f) for f in listing if 'nw' in f]

        bias_re = re.compile(r'(?P<gname>\S+)\s+(?P<level>\S+)')
        weight_re = re.compile(r'(?P<gname>\S+)\s+(?P<tfname>\S+)\s+(?P<level>\S+)')

        genes = {}
        for b in bias_files:
            with open(b) as handle:
                for l in handle:
                    match = bias_re.search(l)
                    if match is None:
                        # Previously an AttributeError on None.
                        continue
                    genes[match.group('gname')] = dict(bias=match.group('level'))

        for n in nw_files:
            with open(n) as handle:
                for l in handle:
                    match = weight_re.search(l)
                    if match is None:
                        continue
                    g = genes[match.group('gname')]
                    # Append in place rather than rebuilding the list.
                    g.setdefault('tfs', []).append(match.group('tfname'))
                    g.setdefault('weights', []).append(match.group('level'))
        return genes

    return mem.getOrSet(setSush, **mem.rc(kwargs, hardcopy=True))
def getNet(**kwargs):
    '''Get the saved network from patrick's files.

    inputs (read from kwargs):
      net_name ['unsup']  'unsup' reads unsup_patrick.txt, 'logistic'
                          reads logistic_0.6.txt (both under
                          network/patrick/); also used as the register.

    Each regex match yields a (tf, target, weight) triple.  An edge is
    kept only when the tf is a key of getTC() and the target is a key of
    getCL().

    output: tuple of dicts keyed by gene/tf names
      trgs: {gname:  {'color': array([n, 0, 0]), 'tfs': ['tfname', ...],
                      'weights': [float, ...]}, ...}
      tfs : {tfname: {'targets': ['gname', ...], 'weights': [float, ...]}, ...}

    raises:
      Exception for any other net_name.
    '''
    def setNet(**kwargs):
        net_name = kwargs.get('net_name', 'unsup')
        if net_name == 'unsup':
            netfile = 'unsup_patrick.txt'
        elif net_name == 'logistic':
            netfile = 'logistic_0.6.txt'
        else:
            raise Exception('unknown net_name: {0!r}'.format(net_name))

        fpath = config.dataPath('network/patrick/{0}'.format(netfile))
        TC = getTC(reset=mod(kwargs.get('reset', 0), 2))
        CL = getCL(reset=mod(kwargs.get('reset', 0), 2))
        # Read and close the file; the original leaked the handle.
        with open(fpath) as handle:
            nwdata = handle.read()

        # Accessors for the [tf, target, weight] triples built below.
        trgfun = lambda x: x[1]
        wtfun = lambda x: float(x[2])
        tffun = lambda x: x[0]

        r = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)\s+(?P<weight>\S+)',
                       re.M)
        matches = list(re.finditer(r, nwdata))

        # Unsorted lists of tfs, targets and (string) weights
        targets = [m.group('target') for m in matches]
        tfs = [m.group('tf') for m in matches]
        weights = [m.group('weight') for m in matches]

        # Concat the data, ordered by tf name, for easier grouping
        cat = []
        for i in np.argsort(tfs):
            if tfs[i] in TC and targets[i] in CL:
                cat.append([tfs[i], targets[i], weights[i]])

        # Extract a dictionary with information for each target.
        trg_d = {}
        count = 0.0
        for k, g in it.groupby(sorted(cat, key=trgfun), key=trgfun):
            l = list(g)
            count += 1.0
            trg_d[k] = {'color': np.array([count, 0, 0]),
                        'tfs': [tffun(e) for e in l],
                        'weights': [wtfun(e) for e in l]}

        # Extract a dictionary with information for each TF
        # (cat is already ordered by tf, so groupby sees contiguous runs).
        tf_d = {}
        for k, g in it.groupby(cat, key=tffun):
            l = list(g)
            tf_d[k] = {'targets': [trgfun(e) for e in l],
                       'weights': [wtfun(e) for e in l]}
        return (trg_d, tf_d)

    return mem.getOrSet(setNet,
                        **mem.rc(kwargs,
                                 hardcopy=True,
                                 on_fail='compute',
                                 register=kwargs.get('net_name', 'unsup')))
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    '''
    Compare trees built from muscle and from structural alignments.

    Both alignment sets are obtained from setAlignments through
    mem.getOrSet (ali_type 'muscle' and 'struct').  For every pair of
    alignments a bionj tree is built from each, the pairwise terminal
    distances of both trees are tabulated, tree_similarity is run twice
    (k = number of sequences - 1, then k = 6) and a two panel figure of
    the embedded trees is saved as a postscript file.

    inputs:
      seqs, profiles  passed through to setAlignments
      run_id          used in cache registers, tree runs and file names
      reset [True]    passed through to mem.rc
    '''
    import networkx

    def cached_alignments(ali_type, register):
        return mem.getOrSet(setAlignments,
                            **mem.rc({},
                                     seqs=seqs,
                                     profiles=profiles,
                                     run_id=run_id,
                                     ali_type=ali_type,
                                     reset=reset,
                                     on_fail='compute',
                                     register=register))

    def leaf_distances(tree, maps):
        # Matrix of pairwise distances between the tree's terminals,
        # indexed by each terminal's position in the alignment.
        dists = zeros((len(maps), len(maps)))
        leaves = tree.get_terminals()
        for n1 in leaves:
            for n2 in leaves:
                dists[maps[n1.name], maps[n2.name]] = tree.distance(n1, n2)
        return dists

    print('computing alignments...')
    print(' ...using muscle')
    malis, mrefs, mpairs = cached_alignments(
        'muscle', 'tuali_musc_{0}'.format(run_id))
    print(' ...using cmalign.')
    salis, srefs, spairs = cached_alignments(
        'struct', 'tuali__struct_{0}'.format(run_id))
    print(' ...making trees.')

    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        mtree = phyml.tree(m, run_id, bionj=True)
        stree = phyml.tree(s, run_id, bionj=True)

        # Row/column of each sequence id, taken from the muscle alignment.
        maps = dict([(elt.id, i) for i, elt in enumerate(m)])
        mdists = leaf_distances(mtree, maps)
        sdists = leaf_distances(stree, maps)

        # Was len(sdists - 1): the misplaced parenthesis subtracted 1 from
        # every matrix entry and left k at the full count.
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k=len(sdists) - 1)
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k=6)

        f = myplots.fignum(4, (8, 10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        for t, sp, ttype in zip([mtree, stree],
                                [211, 212],
                                ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            Gi = networkx.convert_node_labels_to_integers(
                G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args='')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            # Only nodes named in the alignment are drawn with a size;
            # other nodes get size 0 and the color at index -1.
            networkx.draw(G, posn,
                          labels=dict([(n, '') for n in G.nodes()]),
                          node_size=[100 if n.name in maps.keys() else 0
                                     for n in G.nodes()],
                          width=1,
                          edge_color='black',
                          ax=a,
                          node_color=[ct[maps.get(n.name, -1)]
                                      for n in G.nodes()])

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                       [0, 1],
                       xycoords='axes fraction',
                       va='top',
                       xytext=[10, 0],
                       textcoords='offset pixels')
            a.annotate('Total branch length is {0}'.format(
                           t.total_branch_length()),
                       [1, 0],
                       xycoords='axes fraction',
                       ha='right',
                       xytext=[-10, 10],
                       textcoords='offset pixels')

        #phylo.draw_graphviz( mtree, label_func = lambda x: '',
        # node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] +\
        # [ct[0] for n in mtree.get_nonterminals()], axes = ax)

        datafile = cfg.dataPath(
            'figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(
                run_id, idx))
        f.savefig(datafile, dpi=200, format='ps')
def show_conservation(fidx = 0, reset = False):
    '''
    Save a PCA scatter of one family's structures, colored by a
    conservation metric.

    inputs:
      fidx  [0]      index into the module level list ``flist``
      reset [False]  passed through to mem.rc for both cached calls

    The family id is 'RF' followed by the zero padded family number.
    Family number 50 is treated as ftype 'riboswitch', anything else as
    'all'.  Nothing is returned; figures are written through ``figfile``.
    Relies on module level names flist, do_make_subopts, figsize and
    figfile.
    '''
    fnum = flist[fidx]
    rfid = 'RF{0:05}'.format(fnum)
    print rfid
    if fnum ==50:
        ftype = 'riboswitch'
    else:
        ftype = 'all'

    # The family data result is not used below; presumably the call is
    # kept so the data is computed/registered -- TODO confirm.
    out = mem.getOrSet(setFamData, **mem.rc({}, reset =reset, on_fail = 'compute', hardcopy = False, register = 'fdat'+rfid, ftype = ftype, rfid = rfid))
    mvals, tvals, structs = mem.getOrSet(setTree, **mem.rc({},reset = reset, on_fail = 'compute', hardcopy = True, register = 'st'+rfid, rfid = rfid, ftype = ftype))

    # idxs: structures to highlight; tidx: which entry of mvals/tvals to use.
    idxs, tidx = sutils.show_paired_v_energy(rfid,rfid,mvals,tvals,structs,ftype)
    all_pairs = structs['structs']
    all_energies = structs['energies']
    # Per-highlight subsets; none of these four is read again below.
    pints,eints, mints, tints = [structs['structs'][i] for i in idxs],\
        [ structs['energies'][i] for i in idxs],\
        [ mvals[tidx][i] for i in idxs],\
        [ tvals[tidx][i] for i in idxs]
    seq = structs['seq']

    # Optional grid of suboptimal structures.
    if do_make_subopts:
        subopts = rutils.suboptimals(seq, n = 400)
        verts = rutils.struct_verts(subopts, seq, rfid)
        f = myplots.fignum(4,figsize)
        rplots.grid_rnas(verts, dims = [40])
        f.savefig(figfile.format('{0}_grid_rnas'.\
                                     format(rfid)))

    aff = rutils.struct_affinity_matrix(all_pairs, len(seq))
    pca = rutils.project_structs(all_pairs, ptype ='pca', affinities = aff, n_comp = 3)

    for metric in ['n_comp']:# ['frac_silent','frac_paired','n_comp']:
        # One color per structure: the mean of the first value returned
        # by sutils.metric, rescaled to [0, 1] and used as the red channel.
        scolors = []
        for i in range(len(tvals[tidx])):
            m_silent, pidxs, frac_good = sutils.metric( mvals[tidx][i],tvals[tidx][i], mtype = metric)
            scolors.append(mean(m_silent))
        scolors = myplots.rescale(scolors, [0.,1.])[:,newaxis] * array([1.,0.,0.])

        f = myplots.fignum(4,figsize)
        ax = f.add_subplot(111)
        # Only the first two of the three projected components are plotted.
        xvals, yvals = pca[:,:2].T
        myplots.padded_limits(ax, xvals, yvals)
        ax.scatter(xvals,yvals,300,linewidth = 1, edgecolor = 'black', color = scolors)
        # Highlighted structures: black disc, slightly smaller white disc,
        # then the colored marker on top, giving a ringed look.
        ax.scatter(pca[idxs,0],pca[idxs,1], 2100 ,alpha = 1, color = 'black')
        ax.scatter(pca[idxs,0],pca[idxs,1], 2000 ,alpha = 1, color = 'white')
        ax.scatter(pca[idxs,0],pca[idxs,1], 400 ,alpha = 1, color = scolors[idxs], )
        ax.annotate('''Conservation metric: {0} Projected onto C=2 Principal 
Components'''.format(metric), [0,1],xycoords = 'axes fraction', va = 'top', xytext = [10,-10],textcoords='offset points')
        f.savefig(figfile.format('{0}_pca_{1}'.\
                                     format(rfid, metric)))
def modules(reset=False):
    '''
    Return the result of setModules through the ``mem`` cache.

    inputs:
      reset [False]  passed through to mem.rc

    The call uses hardcopy = True and on_fail = "compute".
    '''
    cache_args = mem.rc({}, reset=reset, hardcopy=True, on_fail="compute")
    return mem.getOrSet(setModules, **cache_args)
def eval_seq_group(gap_seqs, rfid, run_id, inp_run_id, reset = True, draw_alis = draw_all_easy, clade_alignment_method = clade_alignment_method, max_structs = 5):
    '''
    Count structure-aware mutations for a group of gapped sequences.

    The max_structs structures with the highest energies are taken from
    butils.load_data(inp_run_id, 'output').  Profiles and structural
    alignments are obtained through mem.getOrSet.  For every structure a
    tree is built, ancestors are reconstructed and
    tree_conservation.count_struct is run.

    inputs:
      gap_seqs     gapped sequences; ungapped copies are named N0000, ...
      rfid         used for cache registers and alignment run ids
      run_id       used for tree / ancestor run ids
      inp_run_id   run whose 'output' data supplies structs and energies
      reset [True] passed through to mem.rc
      draw_alis    when true, draw_cm_muscle_congruencies is run as well
      clade_alignment_method  only 'cm' is implemented
      max_structs [5]  number of structures kept

    output:
      {'seqs': gap_seqs,
       'structs': [{'ref', 'pairs', 'ali', 'muts', 'times', 'gaps',
                    'irresolvables'}, ...]}   one entry per structure

    raises:
      Exception when clade_alignment_method is not 'cm'.

    Relies on module level names clade_tree_method and
    clade_ancestor_method.
    '''
    rutils = utils
    data = butils.load_data(inp_run_id, 'output')

    # Keep the max_structs structures with the highest energies.
    s_inds = argsort(data['energies'])[::-1][:max_structs]
    structs = [data['structs'][i] for i in s_inds]
    refseq = data['seq']

    nq = len(gap_seqs)
    names = ['N{0:04}'.format(idx) for idx in range(nq)]
    seqs = [rutils.ungapped_seq(gap_seqs[i], names[i]) for i in range(nq)]

    profiles = mem.getOrSet(setProfiles,
                            **mem.rc({},
                                     seq=refseq,
                                     structs=structs,
                                     run_id=rfid,
                                     reset=reset,
                                     on_fail='compute',
                                     register='tuprof_{0}'.format(rfid)))

    if draw_alis:
        draw_cm_muscle_congruencies(seqs, profiles, run_id, reset=reset)

    if clade_alignment_method == 'cm':
        alis, refs, all_pairs = \
            mem.getOrSet(setAlignments,
                         **mem.rc({},
                                  seqs=seqs,
                                  profiles=profiles,
                                  run_id=rfid,
                                  ali_type='struct',
                                  reset=reset,
                                  on_fail='compute',
                                  register='tuali_struct_{0}'.format(rfid)))
    else:
        raise Exception('No methods besides cm are yet implemented')

    seq_group_data = {}
    seq_group_data['seqs'] = gap_seqs
    seq_group_data['structs'] = []

    for i, struct in enumerate(structs):
        struct_data = {}
        ali = alis[i]
        ref = refs[i]
        pairs = all_pairs[i]

        #NOTE THAT DUE TO AN AWKWARD SYNTAX DECISION,
        #I AM ALLOWING FOR THE POSSIBILITY THAT EACH
        #ALI ELT HAS DIFFERENT PAIRS.
        #
        #ALL OF MY ROUTINES SO FAR ONLY USE A SINGLE
        #PAIR SET AND SO I USE PAIRS[0] EXCLUSIVELY
        struct_data.update(ref=ref[0], pairs=pairs[0], ali=ali)

        rid = '{0}_{1}'.format(run_id, i)
        if clade_tree_method == 'bionj':
            tree = phyml.tree(ali, run_id=rid, bionj=True)
        else:
            tree = get_phase_tree(ali, pairs[0], run_id)

        # Attach each terminal's aligned sequence and a vector of ones.
        # The loop variable used to be called i as well, which clobbered
        # the structure index used in the run ids below.
        for leaf in tree.get_terminals():
            seq = [x for x in ali if x.id == leaf.name][0]
            leaf.m = {'seq': seq,
                      'probs': array([1] * len(seq))}

        if clade_ancestor_method == 'independent':
            ml_tree = get_ml_ancestor_tree(
                tree, ali, '{0}_paml{1}'.format(run_id, i))
        else:
            ml_tree = get_structure_ancestor_tree(
                tree, ali, '{0}_stree{1}'.format(run_id, i))

        muts, times, gaps, irresolvables = \
            tree_conservation.count_struct(ml_tree, pairs[0])
        struct_data.update(muts=muts,
                           times=times,
                           gaps=gaps,
                           irresolvables=irresolvables)
        seq_group_data['structs'].append(struct_data)

    return seq_group_data
def c2( launcher = None, ncluster =2000, host = 'tin', reset = 0, step = 10, exemp_time = 'all', doplot = False ,**kwargs):
    '''
    Cluster BDTNP cells by expression, in two phases.

    Phase 1 (launcher is None): build the similarity matrix of a seeded
    random sample of cells, create a bcl launcher with one job per
    self-similarity percentile, and return it.

    Phase 2 (launcher given): collect launcher.output(), then assign every
    cell to its most similar exemplar for each job and optionally plot.

    inputs:
      launcher   [None]   launcher from a phase 1 call
      ncluster   [2000]   number of sampled cells
      host       ['tin']  passed to bcl.launcher
      reset      [0]      passed through to mem.rc
      step       [10]     plot every step-th point
      exemp_time ['all']  'all', or a time index restricting the sample
      doplot     [False]  draw a grid of scatter plots in figure 0

    output:
      phase 1: the launcher
      phase 2: (all_members, ct_data) where all_members has one array of
               exemplar indices per job and ct_data is the (cell, time)
               index of every row of the flattened cell data

    raises:
      Exception when setC2 runs without a launcher.
    '''
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)

    # Keep genes that rank in the top 20 by both minimum and median
    # variance (variance taken along axis 1 of the stacked 'vals').
    vals = array([v['vals'] for v in mrnas.values()])
    gvars = var(vals, 1)
    gminvars = np.min(gvars, 1)
    gmedvars = median(gvars, 1)
    min20 = argsort(gminvars)[::-1][:20]
    med20 = argsort(gmedvars)[::-1][:20]
    int20 = set(min20).intersection(set(med20))
    xgenes = array(list(int20))

    cell_data = vals[xgenes].transpose(1, 2, 0)
    scd = shape(cell_data)

    # xycoords[a, b] = [a, b]; flattened alongside cell_data so that each
    # data row keeps its original pair of leading indices.
    xycoords = (arange(scd[0])[:, newaxis, newaxis] * [1, 0] +
                arange(scd[1])[newaxis, :, newaxis] * [0, 1])
    cell_data = reshape(cell_data,
                        (prod(shape(cell_data)[0:2]), shape(cell_data)[2]))
    xy_data = reshape(xycoords, (prod(scd[0:2]), 2))

    if exemp_time == 'all':
        inds = arange(len(cell_data))
    else:
        inds = arange(len(cell_data))[
            nonzero(equal(xy_data[:, 1], exemp_time))[0]]

    # Fixed seed so the sample is reproducible between the two phases.
    np.random.seed(1)
    np.random.shuffle(inds)
    rand_thousand = inds[0:ncluster]

    sim_data = cell_data[rand_thousand]
    t = [mean(sim_data, 0), std(sim_data, 0)]
    # NOTE(review): this writes 0 where the std is already 0, so it has
    # no effect; possibly 1 was intended -- confirm before changing.
    t[1][equal(t[1], 0)] = 0

    metric = 'neg_dist'
    sims = similarity(sim_data, transform=t, method=metric)
    name = 'll_{0}_{1}_{2}'.format(metric, ncluster, exemp_time)

    def setLauncher(**kwargs):
        sims = kwargs.get('sims')
        metric = kwargs.get('metric')
        name = kwargs.get('name')
        d_in = []
        percs = logspace(.1, 1.5, 8)
        for p in percs:
            d_in.append(dict(similarities=sims,
                             self_similarity=ss.scoreatpercentile(sims, p),
                             metric=metric))
        launcher = bcl.launcher(d_in, host=host, name=name)
        return launcher

    if launcher is None:
        output = mem.getOrSet(setLauncher,
                              **mem.rc(dict(sims=sims,
                                            metric=metric,
                                            name=name,
                                            hardcopy=True,
                                            reset=reset,
                                            hard_reset=False)))
        return output

    def setC2(launcher=launcher, **kwargs):
        if launcher is None:
            raise Exception('setC2 requires a launcher')
        return launcher.output()

    #It appears that the bsub process failed for the first output.
    #No big deal. Debug later.
    # 'hardcopy' was misspelled 'harcopy' here, so the option was never
    # passed under the name used everywhere else in this file.
    output = mem.getOrSet(setC2,
                          **mem.rc(dict(hardcopy=True,
                                        launcher=launcher,
                                        reset=reset,
                                        on_fail='compute',
                                        hard_reset=False,
                                        name='c2' + name)))

    all_inds = array([squeeze(o['inds']) for o in output[:]])

    # A tuple of index arrays selects element (row, col) pairs; the
    # original list-of-tuples from zip is not read that way by current
    # NumPy (and is an iterator on Python 3).
    cell_index = tuple(xy_data.T)
    xs = misc['x']['vals'][cell_index]
    ys = misc['y']['vals'][cell_index]
    zs = misc['z']['vals'][cell_index]

    colors = array(mycolors.getct(shape(all_inds)[1]))
    f = plt.figure(0)
    f.clear()

    all_tps = range(scd[1])
    nc = len(all_inds)
    nt = len(all_tps)

    all_members = []
    for i, inds in enumerate(all_inds):
        # Similarity of every cell to this job's distinct exemplars; each
        # cell joins the exemplar it is most similar to.
        exemplars = sim_data[list(set(list(inds)))]
        sim = similarity(cell_data, exemplars, transform=t, method=metric)
        closest = argmax(sim, 1)
        all_members.append(closest)

        if doplot:
            # One panel per (job, time point), tiled over the figure.
            for j, tp in enumerate(all_tps):
                ax = f.add_axes([float(j) / nt, float(i) / nc,
                                 1. / nt, 1. / nc])
                ax.set_yticks([])
                ax.set_xticks([])
                i_sub = nonzero(equal(xy_data[:, 1], j) * greater(ys, 0))[0]
                cs = colors[closest[i_sub]]
                x = xs[i_sub]
                z = zs[i_sub]
                plt.scatter(x[::step], z[::step], 40,
                            alpha=.75,
                            c=cs[::step],
                            edgecolor='none')

    ct_data = xy_data
    return all_members, ct_data