Example #1
0
def init(**kwargs):
  '''
  Read in the 16s tree of life and a random clade corresponding to the
  halobacteria.

  At each node, sets metadata from the databases that I have grabbed.
  Metadata (node.m) for terminal nodes includes:
    taxnode   -- ncbi taxon of the node
    gbacc     -- genbank accession number of the 16s for the node
    gbid      -- genbank id of the 16s for the node
    
  inputs:
    reset [False]

  output:
    tree  <biopython tree>, the entire 16s tree of life
    halo  <biopython clade>, a clade of the tree of life

  usage:
    tree, halo = init()
'''
  
  print 'testing...'
  def setTree(**kwargs):
    nwk = Phylo.read(config.dataPath('sequences/16s.newick'),"newick")
    for n in it.chain(nwk.get_terminals(),nwk.get_nonterminals()): n.m = {}
    db_metadata(nwk)
    print "SETTING TREE!!!"
    return nwk
  
  return mem.getOrSet(setTree,
                      **mem.rc( kwargs, 
                                name = kwargs.get('name', 'default_tree'),
                                on_fail = 'compute',
                                register = 'init'))
Example #2
0
def datafiles(**kwargs):
    '''
    Parse the numeric table behind every entry of id_map().

    For each (key, entry) of id_map(), the file at entry['file'] is read
    and every line whose first character is a digit is split on whitespace
    and converted to floats.  Lines starting with anything else are ignored.

    inputs:
      keywords are forwarded to id_map via mem.sr and to mem.getOrSet via
      mem.rc (with on_fail = 'compute')

    output:
      dict with the keys of id_map(); each value is array() applied to the
      list of parsed rows
    '''
    def set_datafiles(**kwargs):
        # Compiled once instead of once per line of every file.
        whitespace = re.compile(r'\s+')
        out = {}
        idmap = id_map(**mem.sr(kwargs))
        for k, v in idmap.iteritems():
            # The handle is closed as soon as the file has been parsed.
            with open(v['file']) as handle:
                rows = [[float(e) for e in whitespace.split(l.strip())]
                        for l in handle
                        if l[0] in '0123456789']
            out[k] = array(rows)
        return out
    return mem.getOrSet(set_datafiles, **mem.rc(kwargs,
                                                on_fail = 'compute'))
Example #3
0
File: io.py Project: bh0085/compbio
def getBNet(**kwargs):
  '''Get the saved network from the knowledge based network, redFly.

  Reads 'network/network_predmodel/inputnetworks/bRN.txt'; each line is
  expected to start with a TF name followed by a target name.  An edge is
  kept only when its TF is in getTC() and its target is in getCL().  The
  file carries no weights, so every edge gets weight 1.0.

  inputs:
    reset  [0]  forwarded modulo 2 to getTC and getCL

  output: tuple (trg_d, tf_d) of dicts keyed by gene/tf names

          trg_d: {gname:  {'color':   array([n, 0, 0]),
                           'tfs':     ['tfname', ...],
                           'weights': [1.0, ...]},
                  ...}
          tf_d : {tfname: {'targets': ['gname', ...],
                           'weights': [1.0, ...]},
                  ...}

          n is a running count (1.0, 2.0, ...) over targets in sorted
          order.'''
  def setBNet(**kwargs):
    fpath = config.dataPath('network/network_predmodel/inputnetworks/bRN.txt')
    TC = getTC( reset = mod(kwargs.get('reset',0),2))
    CL = getCL( reset = mod(kwargs.get('reset',0),2))
    with open(fpath) as nwfile:
      nwdata = nwfile.read()

    # Accessors for the [tf, target, weight] triples built below.
    tffun = lambda x: x[0]
    trgfun = lambda x: x[1]
    wtfun = lambda x: float(x[2])

    r = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)', re.M)
    matches = list(r.finditer(nwdata))
    # Unsorted, parallel lists of tfs and targets.
    targets = [m.group('target') for m in matches]
    tfs = [m.group('tf') for m in matches]
    weights = [1.0] * len(tfs)

    # Concatenate the data, ordered by tf, for easier grouping.
    cat = []
    for i in np.argsort(tfs):
      if tfs[i] in TC and targets[i] in CL:
        cat.append([tfs[i], targets[i], weights[i]])

    # One entry per target; groupby needs the list sorted by the same key.
    trg_d = {}
    count = 0.0
    for k, g in it.groupby(sorted(cat, key = trgfun), key = trgfun):
      l = list(g)
      count += 1.0
      trg_d[k] = {'color': np.array([count, 0, 0]),
                  'tfs': [tffun(e) for e in l],
                  'weights': [wtfun(e) for e in l]}

    # One entry per TF; cat is already ordered by tf.
    tf_d = {}
    for k, g in it.groupby(cat, key = tffun):
      l = list(g)
      tf_d[k] = {'targets': [trgfun(e) for e in l],
                 'weights': [wtfun(e) for e in l]}

    return (trg_d, tf_d)
  return mem.getOrSet(setBNet, **mem.rc({},on_fail = 'compute',**kwargs))
Example #4
0
File: io.py Project: bh0085/compbio
def getBDTNP(protein = False,misc = False, **kwargs):
  '''
  Fetch the BDTNP columns, re-keyed by the id found in flybase/gene_map.tsv.

  Every gene and misc column returned by bdtnp.parser.read() gets a 'vals'
  array of shape (len(rows), len(times)), where times is the set of all
  'steps' over the gene columns; time points a column lacks are left zero.

  inputs:
    misc     [False]  return the misc columns unchanged in key
    protein  [False]  return the columns whose info type is 'protein'
                      (looked up in the map with the last character of
                      the column name dropped); otherwise the 'mRNA'
                      columns are returned
    other keywords are forwarded through mem.rc

  output:
    misc_cols, or a dict {mapped id: column} restricted to the columns
    whose name was found in the map.  'CG31670' is looked up as 'erm'.
  '''
  def setBDTNP( protein = False, misc = False, **kwargs):
    gene_cols, misc_cols, rows, row_nns = bdtnp.parser.read()

    # Tab separated map; blank lines and '#' comments are skipped.
    # Field 0 is used as the symbol, field 1 as the id.
    map_rows = []
    with open(config.dataPath('flybase/gene_map.tsv')) as mapfile:
      for l in mapfile:
        l = l.replace('\n','')
        if l != '' and l[0] != '#':
          map_rows.append(l.split('\t'))
    syms = [x[0] for x in map_rows]
    fbids = [x[1] for x in map_rows]

    # NOTE(review): times is a set, so the column order of 'vals' follows
    # set iteration order rather than an explicit sort -- confirm intended.
    times = set(it.chain(*[x['steps'] for x in gene_cols.values()]))
    for g in gene_cols.values() + misc_cols.values():
      gene_rows = zeros((len(rows), len(times)))
      for i, t in enumerate(times):
        if t in g['steps']:
          row = rows[:, g['idxs'][g['steps'].index(t)]]
        else:
          row = zeros(len(rows))
        gene_rows[:, i] = row
      g['vals'] = gene_rows

    protein_cols = dict([(k, val) for k, val in gene_cols.iteritems()
                         if val['info']['type'] == 'protein'])
    mrna_cols = dict([(k, val) for k, val in gene_cols.iteritems()
                      if val['info']['type'] == 'mRNA'])

    #things that are wonky include:
    # 1) Protein data (where column names do not match flybase symbols)
    # 2) Weird elements such as Traf1 that are not present in the network anyway
    # 3) FBgn0031375 / CG31670 which is apparently absent from the map and I fix.
    mrna_idxs = [syms.index(k) if k in syms else
                 syms.index('erm') if k == 'CG31670' else -1
                 for k in mrna_cols.keys()]
    mrna_fbids = [fbids[idx] if idx != -1 else '' for idx in mrna_idxs]

    protein_idxs = [syms.index(k[:-1]) if k[:-1] in syms else -1
                    for k in protein_cols.keys()]
    protein_fbids = [fbids[idx] if idx != -1 else '' for idx in protein_idxs]

    if misc:
      return misc_cols
    if protein:
      return dict([(protein_fbids[i], protein_cols.values()[i])
                   for i, elt in enumerate(protein_idxs) if elt != -1])
    else:
      return dict([(mrna_fbids[i], mrna_cols.values()[i])
                   for i, elt in enumerate(mrna_idxs) if elt != -1])

  return mem.getOrSet(setBDTNP,
                      **mem.rc(kwargs,
                               register ='protein' if protein else \
                                 'misc' if misc else 'mrna',
                               protein = protein,
                               misc = misc,
                               on_fail = 'compute'))
Example #5
0
def getBTOL(**kwargs):
  def setBTOL(**kwargs):
    B = BTOL(**mem.sr(kwargs))
    if not B.treeInitialized():
      print 'Underlying tree structure apparently uninitialized: initializing\n...'
      B.initTree()
      print '...\nDone\nSaving\n...'
      B.saveTree()
      print '...\nDone'
    return B
  return mem.getOrSet(setBTOL, **mem.rc(kwargs, register = 'BTOL'))
Example #6
0
def get_seqs(dbname, **kwargs):
  '''
  Return every Sequence row of the cbdb database called dbname.

  The query runs inside mem.getOrSet with hardcopy = False, the cache name
  set to dbname and on_fail = 'compute'.  dbname reaches the inner
  function through the keyword arguments built by mem.rc.
  '''
  def set_seqs(**kwargs):
    database = compbio.projects.cbdb.getName(kwargs['dbname'])
    return database.S.q(database.Sequence).all()

  call_args = mem.rc(kwargs,
                     hardcopy = False,
                     name = dbname,
                     on_fail = 'compute',
                     dbname = dbname)
  return mem.getOrSet(set_seqs, **call_args)
Example #7
0
 def leafNodes(self,**kwargs):
   '''
   Return one taxonomy node (or None) per terminal of self.t.

   For every terminal whose metadata dict (leaf.m) holds a 'taxid', the
   node is fetched with ncbi.get_node from the 'taxdmp' database; leaves
   without a 'taxid' yield None.  The list is memoized by mem.getOrSet
   under register 'leaf_nodes' with hardcopy = False.
   '''
   def setLeafNodes(**kwargs):
     leaves = self.t.get_terminals()
     taxdb = cbdb.getName('taxdmp')
     found = []
     for leaf in leaves:
       if 'taxid' in leaf.m.keys():
         found.append(ncbi.get_node(leaf.m['taxid'], taxdb))
       else:
         found.append(None)
     return found

   call_args = mem.rc(kwargs,
                      hardcopy = False,
                      on_fail = 'compute',
                      register = 'leaf_nodes')
   return mem.getOrSet(setLeafNodes, **call_args)
Example #8
0
def recall_c2(**kwargs):
  '''
  Memoizing wrapper around c2, so that clustering results can be stored
  without touching c2 itself.

  c2 is called twice with mem.sr(kwargs): the result of the first call is
  passed as the first positional argument of the second, and the second
  call's result is what gets cached.  The cache name is always
  'default_c2_settings'.
  '''
  def setC2(**kwargs):
    first_pass = c2(**mem.sr(kwargs))
    return c2(first_pass, **mem.sr(kwargs))

  call_args = mem.rc(kwargs,
                     name = 'default_c2_settings',
                     on_fail = 'compute')
  return mem.getOrSet(setC2, **call_args)
Example #9
0
def get_taxnodes(dbname, **kwargs):
  '''
  Return one taxonomy node (or None) per sequence of database dbname.

  Sequences come from get_seqs(dbname).  A sequence whose source_taxon is
  falsy yields None; every other source_taxon is passed to ncbi.get_node.
  The list is memoized with hardcopy = False under register dbname.
  '''
  def set_taxnodes(**kwargs):
    all_seqs = get_seqs(dbname, **mem.sr(kwargs))

    # First pass: collapse missing/falsy taxa to None.
    seq_taxa = []
    for seq in all_seqs:
      if seq.source_taxon:
        seq_taxa.append(seq.source_taxon)
      else:
        seq_taxa.append(None)

    # Second pass: resolve the remaining taxa to nodes.
    alinodes = []
    for taxon in seq_taxa:
      if taxon != None:
        alinodes.append(ncbi.get_node(taxon))
      else:
        alinodes.append(None)
    return alinodes

  call_args = mem.rc(kwargs,
                     on_fail = 'compute',
                     hardcopy = False,
                     register = dbname)
  return mem.getOrSet(set_taxnodes, **call_args)
Example #10
0
def get_taxon_forsome(nodes,rank,set_name = 'default_setname',**kwargs):
  '''
  Return ncbi.get_taxon(node, rank = rank) for each node, None for falsy
  nodes.

  inputs:
    nodes     iterable of nodes; falsy entries map to None
    rank      rank handed to ncbi.get_taxon, also appended to set_name to
              form the register key
    set_name  ['default_setname'] prefix of the register key

  The list is memoized by mem.getOrSet with hardcopy = False.
  '''
  def set_taxon_forsome(nodes = None, rank = None,**kwargs):
    assert nodes != None and rank != None
    taxa = []
    for node in nodes:
      if node:
        taxa.append(ncbi.get_taxon(node, rank = rank))
      else:
        taxa.append(None)
    return taxa

  call_args = mem.rc(kwargs,
                     nodes = nodes,
                     rank = rank,
                     on_fail = 'compute',
                     hardcopy = False,
                     register = set_name + rank)
  return mem.getOrSet(set_taxon_forsome, **call_args)
Example #11
0
def get_taxon_forall(aliname,
                     rank = None, 
                     **kwargs):
  '''
  Return the taxon at the given rank for every node of get_taxnodes(aliname).

  Falsy nodes map to None.  Although rank defaults to None, it is
  concatenated onto aliname to build the register key and is asserted
  non-None inside, so a rank must always be supplied.

  The list is memoized by mem.getOrSet with hardcopy = False.
  '''
  def setTaxon(aliname = None, rank = None,**kwargs):
    assert aliname != None and rank != None
    nodes = get_taxnodes(aliname, **mem.sr(kwargs))
    taxa = []
    for node in nodes:
      if node:
        taxa.append(ncbi.get_taxon(node, rank=rank))
      else:
        taxa.append(None)
    return taxa

  call_args = mem.rc(kwargs,
                     aliname = aliname,
                     rank = rank,
                     on_fail = 'compute',
                     hardcopy = False,
                     register = aliname + rank)
  return mem.getOrSet(setTaxon, **call_args)
Example #12
0
def get_reinitz_data(**kwargs):
    '''
    Slice one block of nuclei out of every table returned by datafiles().

    The block length (nuc_count) is the number of distinct values in the
    first column of the third table of datafiles(); block number ofs is
    rows [nuc_count * ofs, nuc_count * (ofs + 1)).

    inputs (all read from kwargs):
      ofs          [0]      which block of nuclei to take
      plot_coords  [False]  save a scatter of the two coordinate columns
      plot_vals    [False]  save a scatter of first coordinate vs value

    output:
      coords  dict, columns 1:3 of the selected rows of each table
      values  dict, last column of the selected rows of each table
    '''
    ofs = kwargs.get('ofs',0)
    do_plot_coords = kwargs.get('plot_coords',False)
    do_plot_vals = kwargs.get('plot_vals',False)

    idm = id_map()
    df = datafiles(**mem.rc(kwargs))

    # Each nucleus carries a run of numbers that appear to increase
    # monotonically; only the first column is used to count nuclei.
    nums = dict([(k, v[:,0]) for k, v in df.iteritems()])
    nuc_count = len(set(nums.values()[2]))

    first_row = nuc_count * ofs
    last_row = nuc_count * (ofs + 1)
    values = dict([(k, v[first_row:last_row, -1])
                   for k, v in df.iteritems()])
    coords = dict([(k, v[first_row:last_row, 1:3])
                   for k, v in df.iteritems()])

    def scatter_all(yvals_for, stem):
        # One colour per dataset, all drawn on a single axis of figure 1.
        fig = myplots.fignum(1,(8,8))
        axes = fig.add_subplot(111)
        palette = mycolors.getct(len(values))
        for i, k in enumerate(values.keys()):
            axes.scatter(coords[k][:,0][::1], yvals_for(k), 10,
                         edgecolor = 'none', alpha = .25, c = palette[i],
                         label = k)
        fig.savefig(myplots.figpath(stem.format(ofs)))

    # The plots are a basic consistency check of the data; only the
    # selected block of nuclei is shown.
    if do_plot_coords:
        scatter_all(lambda k: coords[k][:,1][::1],
                    'reinitz_exprdata_coords_nuc_offset={0}')
    if do_plot_vals:
        scatter_all(lambda k: values[k][::1],
                    'reinitz_exprdata_ap_vals_nuc_offset={0}')

    return coords, values
Example #13
0
def id_map(**kwargs):
    '''
    Map ids from flybase/gene_association.fb to the per-gene reinitz files.

    File names in 'reinitz/28-7-2011-1-56-6-30-0/txt/byGenes' are
    lower-cased, stripped of their extension and then of every character
    outside a-z.  Each non-comment line of gene_association.fb (comments
    start with '!') is matched on its tab field 9, normalized the same
    way; tab field 1 of a matching line is collected as a candidate id.
    When a gene collects several ids the first in sorted order is used.

    output:
      dict {id: {'file': path of the gene file,
                 'name': lower-cased file name without extension}}

    raises:
      IndexError when a gene file has no matching line in the association
      file (there is then no id to pick), or when a non-comment line has
      fewer than ten tab separated fields.
    '''
    def set_id_map(**kwargs):
        fname = cfg.dataPath('reinitz/28-7-2011-1-56-6-30-0/txt/byGenes')

        # List the directory once so names and paths cannot disagree.
        listing = os.listdir(fname)
        gname_orig = [os.path.splitext(f)[0].lower() for f in listing]
        gfiles = dict([(gname_orig[i], os.path.join(fname, f))
                       for i, f in enumerate(listing)])

        non_alpha = re.compile('[^a-z]')
        gname_map = dict([(non_alpha.sub('', g), g) for g in gname_orig])
        gnames = gname_map.keys()

        # Candidate ids collected per normalized gene name.
        glines = dict([(k.lower(), []) for k in gnames])

        with open(cfg.dataPath('flybase/gene_association.fb')) as gassoc:
            for line in gassoc:
                if line[0] == '!': continue
                fields = line.split('\t')
                symbol = non_alpha.sub('', fields[9].lower().strip())
                if symbol in glines:
                    glines[symbol].append(fields[1].strip())

        ids = {}
        for k, these_ids in glines.iteritems():
            #just hacking here... for sloppy paired I use the first id...
            ids[k] = tuple(sorted(set(these_ids)))[0]

        return dict([(idval, {'file': gfiles[gname_map[k]],
                              'name': gname_map[k]})
                     for k, idval in ids.iteritems()])
    return mem.getOrSet(set_id_map,**mem.rc(kwargs,on_fail = 'compute'))
Example #14
0
File: io.py Project: bh0085/compbio
def getSush(**kwargs):
  '''Get sushmita's regression weights and biases.

  Scans 'network/network_predmodel/regressionwts/fRN': files with 'bias'
  in their name supply "<gene> <level>" lines, files with 'nw' in their
  name supply "<gene> <tf> <level>" lines.  Lines that do not match the
  expected pattern (e.g. blank lines) are skipped.

  output:
    dict {gene: {'bias': level, 'tfs': [tf, ...], 'weights': [level, ...]}}
    Levels are kept as the strings read from the files.  'tfs' and
    'weights' are only present for genes that appear in an 'nw' file.

  raises:
    KeyError when an 'nw' file names a gene that has no bias entry.
  '''
  def setSush(**kwargs):
    path = config.dataPath('network/network_predmodel/regressionwts/fRN')
    listing = os.listdir(path)
    bias_files = [os.path.join(path, f) for f in listing if 'bias' in f]
    nw_files = [os.path.join(path, f) for f in listing if 'nw' in f]

    bias_re = re.compile(r'(?P<gname>\S+)\s+(?P<level>\S+)')
    weight_re = re.compile(r'(?P<gname>\S+)\s+(?P<tfname>\S+)\s+(?P<level>\S+)')
    genes = {}
    for b in bias_files:
      with open(b) as bias_file:
        for l in bias_file:
          match = bias_re.search(l)
          if match is None:
            continue
          genes[match.group('gname')] = dict(bias = match.group('level'))
    for n in nw_files:
      with open(n) as nw_file:
        for l in nw_file:
          match = weight_re.search(l)
          if match is None:
            continue
          g = genes[match.group('gname')]
          g.setdefault('tfs', []).append(match.group('tfname'))
          g.setdefault('weights', []).append(match.group('level'))
    return genes
  return mem.getOrSet(setSush, **mem.rc(kwargs,
                                        hardcopy = True))
Example #15
0
File: io.py Project: bh0085/compbio
def getNet(**kwargs):
  '''Get the saved network from patrick's files.

  Each line of the network file is expected to hold a TF name, a target
  name and a weight.  An edge is kept only when its TF is in getTC() and
  its target is in getCL().

  inputs:
    net_name  ['unsup']  'unsup'    -> network/patrick/unsup_patrick.txt
                         'logistic' -> network/patrick/logistic_0.6.txt
                         also used as the register key
    reset     [0]        forwarded modulo 2 to getTC and getCL

  output: tuple (trg_d, tf_d) of dicts keyed by gene/tf names

          trg_d: {gname:  {'color':   array([n, 0, 0]),
                           'tfs':     ['tfname', ...],
                           'weights': [float, ...]},
                  ...}
          tf_d : {tfname: {'targets': ['gname', ...],
                           'weights': [float, ...]},
                  ...}

          n is a running count (1.0, 2.0, ...) over targets in sorted
          order.

  raises:
    Exception for any other net_name.'''
  def setNet(**kwargs):
    net_name = kwargs.get('net_name', 'unsup')
    if net_name == 'unsup':
      netfile = 'unsup_patrick.txt'
    elif net_name == 'logistic':
      netfile = 'logistic_0.6.txt'
    else:
      raise Exception('unknown net_name: {0!r}'.format(net_name))

    fpath = config.dataPath('network/patrick/{0}'.format(netfile))
    TC = getTC( reset = mod(kwargs.get('reset',0),2))
    CL = getCL( reset = mod(kwargs.get('reset',0),2))
    with open(fpath) as nwfile:
      nwdata = nwfile.read()

    # Accessors for the [tf, target, weight] triples built below.
    tffun = lambda x: x[0]
    trgfun = lambda x: x[1]
    wtfun = lambda x: float(x[2])

    r = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)\s+(?P<weight>\S+)',
                   re.M)
    matches = list(r.finditer(nwdata))
    # Unsorted, parallel lists of tfs, targets and (string) weights.
    targets = [m.group('target') for m in matches]
    tfs = [m.group('tf') for m in matches]
    weights = [m.group('weight') for m in matches]

    # Concatenate the data, ordered by tf, for easier grouping.
    cat = []
    for i in np.argsort(tfs):
      if tfs[i] in TC and targets[i] in CL:
        cat.append([tfs[i], targets[i], weights[i]])

    # One entry per target; groupby needs the list sorted by the same key.
    trg_d = {}
    count = 0.0
    for k, g in it.groupby(sorted(cat, key = trgfun), key = trgfun):
      l = list(g)
      count += 1.0
      trg_d[k] = {'color': np.array([count, 0, 0]),
                  'tfs': [tffun(e) for e in l],
                  'weights': [wtfun(e) for e in l]}

    # One entry per TF; cat is already ordered by tf.
    tf_d = {}
    for k, g in it.groupby(cat, key = tffun):
      l = list(g)
      tf_d[k] = {'targets': [trgfun(e) for e in l],
                 'weights': [wtfun(e) for e in l]}

    return (trg_d, tf_d)
  return mem.getOrSet(setNet,  **mem.rc(kwargs,
                                        hardcopy = True,
                                        on_fail = 'compute',
                                        register = kwargs.get('net_name',
                                                              'unsup')))
Example #16
0
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    print 'computing alignments...'
    print '  ...using muscle'
    malis, mrefs, mpairs =\
            mem.getOrSet(setAlignments, 
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles, 
                                  run_id = run_id, ali_type = 'muscle',
                                  reset = reset,
                                  on_fail = 'compute', 
                                  register = 'tuali_musc_{0}'.format(run_id))) 
    print '  ...using cmalign.'
    salis, srefs, spairs  =\
        mem.getOrSet(setAlignments, 
                     **mem.rc({},
                              seqs = seqs, profiles = profiles, 
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute', 
                              register = 'tuali__struct_{0}'.format(run_id)))
 
    print '  ...making trees.'
    
    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        mtree  = phyml.tree(m,run_id, bionj = True)
        stree  = phyml.tree(s,run_id, bionj = True)
        
        maps = dict([(elt.id,i) for i, elt in enumerate(m)])
        mdists = zeros((len(maps),len(maps)))
        sdists = zeros((len(maps),len(maps)))
        for n1 in mtree.get_terminals():
            for n2 in mtree.get_terminals():
                mdists[maps[n1.name],maps[n2.name]] = \
                    mtree.distance(n1,n2)
        
        for n1 in stree.get_terminals():
            for n2 in stree.get_terminals():
                sdists[maps[n1.name],maps[n2.name]] = \
                    stree.distance(n1,n2)
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = len(sdists - 1))
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        import networkx

        for t, sp, ttype in zip([mtree, stree], [211,212], ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)


            networkx.draw(G, posn, labels = dict([(n, '') for n in G.nodes()]),
                      node_size = [100 if  n.name in maps.keys() else 0 for n in G.nodes()],
                      width = 1, edge_color = 'black',
                      ax = a,
                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] )
        

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,0],textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')            

        #phylo.draw_graphviz(  mtree,  label_func = lambda x: '', 
        #                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] +\
        #                          [ct[0] for n in mtree.get_nonterminals()], axes = ax)

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
Example #17
0
def show_conservation(fidx = 0, reset = False):
    fnum = flist[fidx]
    rfid = 'RF{0:05}'.format(fnum)
    print rfid
    if fnum ==50: ftype = 'riboswitch'
    else: ftype = 'all'
    
    
    out = mem.getOrSet(setFamData,
                              **mem.rc({}, reset =reset,
                                       on_fail = 'compute',
                                       hardcopy = False,
                                       register = 'fdat'+rfid,
                                       ftype = ftype,
                                       rfid = rfid))

    
    mvals, tvals, structs = mem.getOrSet(setTree,
                                         **mem.rc({},reset = reset,
                                                  on_fail = 'compute',
                                                  hardcopy = True,
                                                  register = 'st'+rfid,
                                                  rfid = rfid,
                                                  ftype = ftype))
    
    idxs, tidx  = sutils.show_paired_v_energy(rfid,rfid,mvals,tvals,structs,ftype)
    
    all_pairs = structs['structs']
    all_energies = structs['energies']
    
    pints,eints, mints, tints = [structs['structs'][i] for i in idxs],\
        [ structs['energies'][i] for i in idxs],\
        [ mvals[tidx][i] for i in idxs],\
        [ tvals[tidx][i] for i in idxs]
    seq = structs['seq']
    
    if do_make_subopts:
        subopts = rutils.suboptimals(seq, n = 400)
        verts = rutils.struct_verts(subopts, seq, rfid)
        f = myplots.fignum(4,figsize)
        rplots.grid_rnas(verts, dims = [40])
        f.savefig(figfile.format('{0}_grid_rnas'.\
                                     format(rfid)))

    
                



    aff = rutils.struct_affinity_matrix(all_pairs, len(seq))
    pca = rutils.project_structs(all_pairs,
                          ptype ='pca',
                          affinities = aff,
                          n_comp = 3) 

    for metric in ['n_comp']:# ['frac_silent','frac_paired','n_comp']:
      scolors = []
      for i in range(len(tvals[tidx])):
          m_silent, pidxs, frac_good = sutils.metric(
              mvals[tidx][i],tvals[tidx][i],
              mtype = metric)
          
          scolors.append(mean(m_silent))
      scolors = myplots.rescale(scolors, [0.,1.])[:,newaxis] * array([1.,0.,0.])
      
      
      f = myplots.fignum(4,figsize)
      ax = f.add_subplot(111)
      xvals, yvals = pca[:,:2].T
      myplots.padded_limits(ax, xvals, yvals)
      
      ax.scatter(xvals,yvals,300,linewidth = 1,
                 edgecolor = 'black', color = scolors)

      ax.scatter(pca[idxs,0],pca[idxs,1], 2100 ,alpha = 1, 
                 color = 'black')
      ax.scatter(pca[idxs,0],pca[idxs,1], 2000 ,alpha = 1, 
                 color = 'white')
      ax.scatter(pca[idxs,0],pca[idxs,1], 400 ,alpha = 1, 
                 color = scolors[idxs],
                 )


      ax.annotate('''Conservation metric: {0}
Projected onto C=2 Principal Components'''.format(metric),
                  [0,1],xycoords = 'axes fraction', va = 'top',
                  xytext = [10,-10],textcoords='offset points')
      
      f.savefig(figfile.format('{0}_pca_{1}'.\
                                 format(rfid, metric)))
Example #18
0
def modules(reset=False):
    '''
    Return the result of setModules through mem.getOrSet.

    reset is forwarded to mem.rc; the result is hard-copied and is
    computed on failure.
    '''
    call_args = mem.rc({}, reset=reset, hardcopy=True, on_fail="compute")
    return mem.getOrSet(setModules, **call_args)
Example #19
0
def eval_seq_group(gap_seqs, rfid, run_id, inp_run_id, reset = True,
                   draw_alis = draw_all_easy,
                   clade_alignment_method = clade_alignment_method,
                   max_structs = 5):
    '''
    Count mutations per structure for one group of gapped sequences.

    The max_structs structures with the highest energies are taken from
    butils.load_data(inp_run_id, 'output').  Profiles and 'struct'
    alignments are fetched through mem.getOrSet; for each structure a tree
    is built (bionj or phase, per the module setting clade_tree_method),
    ancestors are reconstructed (per clade_ancestor_method) and
    tree_conservation.count_struct tallies the result.

    inputs:
      gap_seqs                gapped sequences of the group
      rfid                    family id, used for profile/alignment keys
      run_id                  prefix of the per-structure run ids
      inp_run_id              run whose 'output' data holds the structures
      reset  [True]           forwarded to mem.rc
      draw_alis               when true, also run
                              draw_cm_muscle_congruencies
      clade_alignment_method  only 'cm' is implemented
      max_structs  [5]        number of structures to evaluate

    output:
      dict with 'seqs' (gap_seqs) and 'structs', a list holding for each
      structure: ref, pairs, ali, muts, times, gaps, irresolvables.

    raises:
      Exception when clade_alignment_method is not 'cm'.
    '''
    rutils = utils
    data = butils.load_data(inp_run_id, 'output')
    structs = data['structs']
    energies = data['energies']
    # Indices of the max_structs largest energies, largest first.
    esrt = argsort(energies)[::-1]
    s_inds = esrt[:max_structs]
    structs, energies = [structs[i] for i in s_inds], [energies[i] for i in s_inds]

    refseq = data['seq']

    nq = len(gap_seqs)

    names = ['N{1:04}'.format(rfid, idx) for idx in range(nq)]
    seqs = [rutils.ungapped_seq(gap_seqs[i], names[i]) for i in range(nq)]

    profiles = mem.getOrSet(setProfiles,
                            **mem.rc({},
                                     seq = refseq, structs = structs, run_id = rfid,
                                     reset = reset,
                                     on_fail = 'compute',
                                     register = 'tuprof_{0}'.format(rfid)))

    if draw_alis:
        draw_cm_muscle_congruencies(seqs, profiles,
                                    run_id, reset = reset)

    if clade_alignment_method == 'cm':
        alis, refs, all_pairs  =\
            mem.getOrSet(setAlignments,
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles,
                                  run_id = rfid, ali_type = 'struct',
                                  reset = reset,
                                  on_fail = 'compute',
                                  register = 'tuali_struct_{0}'.format(rfid)))
    else:
        raise Exception('No methods besides cm are yet implemented')

    seq_group_data = {}
    seq_group_data['seqs'] = gap_seqs
    seq_group_data['structs'] = []
    for i, struct in enumerate(structs):
        struct_data = {}
        ali = alis[i]
        ref = refs[i]
        pairs = all_pairs[i]

        # Each alignment element may in principle carry its own pair set;
        # every routine so far uses a single one, hence pairs[0].
        struct_data.update(ref = ref[0],
                           pairs = pairs[0],
                           ali = ali)

        rid = '{0}_{1}'.format(run_id, i)

        if clade_tree_method == 'bionj':
            tree = phyml.tree(ali, run_id = rid, bionj = True)
        else: tree = get_phase_tree(ali, pairs[0], run_id)

        # No index is bound here: an inner "i" used to overwrite the
        # structure index and leak into the run ids built below.
        for ct in tree.get_terminals():
            seq = filter(lambda x: x.id == ct.name, ali)[0]
            ct.m = {'seq':seq,
                    'probs':array([1 for j in range(len(seq))])}

        if clade_ancestor_method == 'independent':
            ml_tree = get_ml_ancestor_tree(tree, ali,
                                           '{0}_paml{1}'.format(run_id, i))
        else:
            ml_tree = get_structure_ancestor_tree(\
                tree, ali,'{0}_stree{1}'.format(run_id, i))

        muts, times, gaps, irresolvables = tree_conservation.count_struct(ml_tree, pairs[0])

        struct_data.update(muts = muts, times = times,
                           gaps = gaps, irresolvables = irresolvables)
        seq_group_data['structs'].append(struct_data)

    return seq_group_data
Example #20
0
def c2( launcher = None, ncluster =2000, host = 'tin', 
        reset = 0, step = 10, exemp_time = 'all',
        doplot = False ,**kwargs):
  '''
  Two-stage clustering of BDTNP cells on a small set of variable genes.

  The genes used are those that are in the top 20 by minimum and also in
  the top 20 by median (over time points) of their variance across cells.
  Every (cell, time) pair becomes one data point with one value per
  selected gene.

  stage 1 (launcher is None):
    a seeded random sample of up to ncluster points is taken, their
    similarity matrix is computed, and a bcl launcher is built with one
    job per self-similarity percentile.  The launcher is returned.

  stage 2 (launcher given):
    launcher.output() is read and, for every job, each data point is
    assigned to its most similar exemplar.

  inputs:
    launcher    [None]   result of a stage 1 call
    ncluster    [2000]   number of sampled points
    host        ['tin']  host handed to bcl.launcher
    reset       [0]      forwarded to mem.rc
    step        [10]     plotting stride, used only when doplot is set
    exemp_time  ['all']  restrict the sample to one time index
    doplot      [False]  draw one panel per (job, time point)

  output:
    stage 1: the launcher
    stage 2: (all_members, ct_data) -- per job the index of the closest
             exemplar for every data point, and the (cell, time) index
             pair of every data point
  '''
  mrnas = nio.getBDTNP()
  misc = nio.getBDTNP(misc = True)

  vals = array([v['vals'] for v in mrnas.values()])
  gvars = var(vals, 1)
  gminvars = np.min(gvars,1)
  gmedvars = median(gvars,1)

  min20 = argsort(gminvars)[::-1][:20]
  med20 = argsort(gmedvars)[::-1][:20]

  int20 = set(min20).intersection(set(med20))
  xgenes = array(list(int20))

  # (genes, cells, times) -> (cells, times, genes), then flattened so that
  # every (cell, time) pair is one row.
  cell_data = vals[xgenes].transpose(1,2,0)
  scd = shape(cell_data)
  xycoords = (arange(scd[0])[:,newaxis,newaxis]*[1,0] +\
                arange(scd[1])[newaxis,:,newaxis]*[0,1])
  cell_data = reshape(cell_data, (prod(shape(cell_data)[0:2]), shape(cell_data)[2] ))
  xy_data = reshape(xycoords, (prod(scd[0:2]),2 ))

  if exemp_time == 'all':
    inds = arange(len(cell_data))
  else:
    inds = arange(len(cell_data))[nonzero(equal(xy_data[:,1],exemp_time))[0]]

  # Fixed seed: both stages must draw the same sample.
  np.random.seed(1)
  np.random.shuffle(inds)
  rand_thousand = inds[0:ncluster]

  sim_data = cell_data[rand_thousand]
  t = [ mean(sim_data, 0), std(sim_data,0)]
  # NOTE(review): this writes 0 where the std is already 0, i.e. it is a
  # no-op; a nonzero replacement may have been intended -- confirm.
  t[1][equal(t[1],0)] = 0
  metric = 'neg_dist'
  sims = similarity(sim_data, transform = t, method = metric)

  name = 'll_{0}_{1}_{2}'.format(metric,ncluster,exemp_time)
  def setLauncher(**kwargs):
    sims= kwargs.get('sims')
    metric = kwargs.get('metric')
    name = kwargs.get('name')
    d_in = []
    percs = logspace(.1,1.5,8)
    for p in percs:
      d_in.append(dict(similarities = sims,
                       self_similarity = ss.scoreatpercentile(sims, p),
                       metric = metric
                       ))

    launcher = bcl.launcher(d_in, host = host, name = name)
    return launcher
  if launcher is None:
    output = mem.getOrSet(setLauncher,
                          **mem.rc(dict(sims = sims, metric = metric,
                                        name = name,
                                        hardcopy = True,
                                        reset = reset,
                                        hard_reset = False,)))
    return output

  def setC2(launcher = launcher, **kwargs):
    if launcher is None:
      raise Exception()
    return launcher.output()
    #It appears that the bsub process failed for the first output.
    #No big deal. Debug later.

  # 'hardcopy' was misspelled 'harcopy', so the flag never took effect.
  output = mem.getOrSet(setC2,
                        **mem.rc(dict(hardcopy = True,
                                      launcher = launcher,
                                      reset = reset,
                                      on_fail = 'compute',
                                      hard_reset = False,
                                      name =  'c2'+ name )))
  all_inds = array([  squeeze(o['inds']) for o in output[:] ])

  xs = misc['x']['vals'][zip(*xy_data)]
  ys = misc['y']['vals'][zip(*xy_data)]
  zs = misc['z']['vals'][zip(*xy_data)]

  colors =array( mycolors.getct(shape(all_inds)[1]) )
  f = plt.figure(0)
  f.clear()

  all_tps = range(scd[1])
  nc = len(all_inds)
  nt = len(all_tps)

  all_members = []
  for i, inds in enumerate(all_inds):
    # Similarity of every data point to this job's distinct exemplars.
    exemplars = sim_data[list(set(list(inds)))]
    sim = similarity(cell_data,
                     exemplars,
                     transform = t,
                     method = metric)
    closest = argmax(sim, 1)
    all_members.append(closest)

    if doplot:
      for j, tp in enumerate(all_tps):
        ax = f.add_axes( [float(j)/nt,float(i) /nc,1./nt, 1. /nc] )
        ax.set_yticks([])
        ax.set_xticks([])
        i_sub = nonzero(equal(xy_data[:,1], j) * greater(ys,0))[0]
        cs = colors[closest[i_sub]]
        x = xs[i_sub]
        z = zs[i_sub]
        plt.scatter(x[::step],z[::step], 40,alpha = .75, c = cs[::step], edgecolor = 'none')

  ct_data = xy_data
  return all_members, ct_data