예제 #1
0
파일: rfam.py 프로젝트: bh0085/projects
def split_family_seqs():
    '''Split rfam/Rfam.seed into one alignment file and one metadata file per family.

    For every stockholm alignment in the seed file this writes
      <rfam/family_alis>/<AC>.fa        the alignment in fasta format
      <rfam/family_metas>/<AC>.pickle   a pickled dict of the '#' lines

    The metadata dict is keyed by characters 5-6 of each '#' line (for a
    '#=GF XX ...' line that is the two letter code XX); the text from
    column 8 onward is concatenated per key.  <AC> is the stripped 'AC'
    entry of that dict.  Nothing is returned.
    '''
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')

    with open(cfg.dataPath('rfam/Rfam.seed')) as fopen:
        alis = aio.parse(fopen, 'stockholm')
        while True:
            # Scan the leading '#' lines of the next record by hand, then
            # rewind so that the parser reads the same record from its start.
            infos = {}
            start = fopen.tell()
            while True:
                l = fopen.readline()
                if l == '':
                    break
                if l[0] == '#':
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey, '') + l[8:]
                elif l.strip() != '':
                    # first non-blank, non-'#' line: the header is over
                    break
            fopen.seek(start)

            # An exhausted parser raises StopIteration from .next(); asking
            # for a default lets the loop end through the test below instead
            # of through an uncaught exception.
            ali = next(alis, None)
            if not ali:
                break

            rfname = infos['AC'].strip()
            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                with open(os.path.join(meta_dir, rfname + '.pickle'), 'w') as metafile:
                    aio.write(ali, alifile, 'fasta')
                    pickle.dump(infos, metafile)
예제 #2
0
def split_family_seqs():
    '''Write every family of rfam/Rfam.seed to its own fasta and pickle file.

    outputs (on disk, nothing is returned):
      rfam/family_alis/<AC>.fa        fasta version of the stockholm alignment
      rfam/family_metas/<AC>.pickle   pickled dict built from the record's
                                      '#' lines: key = characters 5-6 of the
                                      line, value = concatenation of the text
                                      from column 8 onward

    <AC> is the stripped value stored under the 'AC' key.
    '''
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')

    with open(cfg.dataPath('rfam/Rfam.seed')) as fopen:
        alis = aio.parse(fopen, 'stockholm')
        while True:
            infos = {}
            # Remember where this record starts: the annotation lines are
            # read manually first and the handle is then rewound for the
            # alignment parser.
            start = fopen.tell()
            while True:
                l = fopen.readline()
                if l == '':
                    break
                if l[0] == '#':
                    ukey = str(l[5:7])
                    infos[ukey] = infos.get(ukey, '') + l[8:]
                elif l.strip() != '':
                    break
            fopen.seek(start)

            # .next() on a finished parser raises StopIteration, which used
            # to escape from this function; a None default makes the
            # end-of-file check below effective.
            ali = next(alis, None)
            if not ali:
                break

            rfname = infos['AC'].strip()
            ali_path = os.path.join(alis_dir, rfname + '.fa')
            meta_path = os.path.join(meta_dir, rfname + '.pickle')
            with open(ali_path, 'w') as alifile:
                with open(meta_path, 'w') as metafile:
                    aio.write(ali, alifile, 'fasta')
                    pickle.dump(infos, metafile)
예제 #3
0
파일: __init__.py 프로젝트: bh0085/compbio
  def setNet(**kwargs):
    '''Load the informativeness grid and the experiment descriptions of a network.

    keywords:
      method:  prefix of the informativeness file name (default 'tree')
      num:     network number used in both file names (default 1)

    returns:
      grid:         zeros((ntf, nexp)) filled with grid[tf, exp] = weight for
                    every (weight, tf, exp) row of the data file; exp is
                    shifted down by one before it is used as a column index
      description:  dict mapping every column title of the features file,
                    plus 'Exp_Index', to the list of that column's values
    '''
    method = kwargs.get('method', 'tree')
    num = kwargs.get('num', 1)

    description_path = cfg.dataPath('::daniel/net%s_chip_features.tsv') % num
    data_path = cfg.dataPath('::daniel/informativeness/%s%s.txt') % (method, num)
    split_re = re.compile(r'\s')

    with open(description_path) as desc_open:
      description_cols = split_re.split(desc_open.readline().strip()) + ['Exp_Index']
      description_vals = [split_re.split(l.strip()) for l in desc_open.readlines()]
    # the extra 'Exp_Index' column is the row number of each experiment
    for idx, d in enumerate(description_vals):
      d.append(idx)

    with open(data_path) as data_open:
      weight, tf, exp = zip(*[array(split_re.split(l.strip()), float)
                              for l in data_open.readlines()])
    exp = [e - 1 for e in exp]

    description = {}
    for i, col in enumerate(description_cols):
      description[col] = [d[i] for d in description_vals]

    # tf and exp were parsed as floats; array shapes and indices must be
    # integers, so convert explicitly instead of relying on numpy to do it.
    ntf = int(np.max(tf)) + 1
    # every description column has one entry per experiment row
    nexp = len(description_vals)

    grid = zeros((ntf, nexp))
    for w, t, e in zip(weight, tf, exp):
      grid[int(t), int(e)] = float(w)

    return grid, description
예제 #4
0
파일: utils.py 프로젝트: bh0085/projects
def rna_draw(seq, struct, name, out_type = 'svg'):
    '''Draw a structure with RNAplot and read the drawing back.

    inputs:
      seq:       sequence line piped to RNAplot
      struct:    structure line piped to RNAplot after seq
      name:      base name of the output file under rnafold/
      out_type:  'svg' (default) or 'png'

    outputs:
      'svg':  the first element returned by svg.get_polys() for the
              parsed svg file
      'png':  the value returned by matplotlib's read_png() for the
              converted image

    raises:
      Exception for any other out_type.
    '''
    lines = '{0}\n{1}\n'.format(seq, struct)

    if out_type == 'png':
        outfile = cfg.dataPath('rnafold/{0}.png'.format(name))
        rprc = spc.Popen('RNAplot -o svg; convert rna.svg {0}'.format(outfile), shell = True,
                         stdin = spc.PIPE, stdout = spc.PIPE)
        rprc.communicate(input = lines)

        from matplotlib._png import read_png
        # This branch used to fall through to 'return arr' without ever
        # binding arr; hand back the image that was just loaded instead.
        return read_png(outfile)

    if out_type == 'svg':
        outfile = cfg.dataPath('rnafold/{0}.svg'.format(name))

        # RNAplot is run inside a scratch directory named after the drawing;
        # the result is moved to outfile and the directory is removed.
        tempdir = 'tmp_{0}'.format(name)
        rprc = spc.Popen('mkdir {1}; cd {1}; RNAplot -o svg; mv rna.svg {0}; cd ..; rm -r {1};'.format(outfile, tempdir), shell = True,
                         stdin = spc.PIPE, stdout = spc.PIPE)
        rprc.communicate(input = lines)

        with open(outfile) as svg_file:
            struct_svg = svg_file.read()
        data = xparse.parse(struct_svg)
        return svg.get_polys(data)[0]

    raise Exception('unsupported out_type: {0!r}'.format(out_type))
예제 #5
0
def get_fam(rfid):
    '''Get a family including tree and sequence information
from an Rfam data dump stored in data/rfam

inputs:
  rfid:   rfam family id.

outputs:
  ali:    the first alignment parsed from rfam/family_alis/<rfid>.fa
  tree:   the first tree parsed from rfam/Rfam.seed_tree/<rfid>.seed_tree
  info:   the object unpickled from rfam/family_metas/<rfid>.pickle

'''
    meta_path = cfg.dataPath('rfam/family_metas/{0}.pickle'.format(rfid))
    ali_path = cfg.dataPath('rfam/family_alis/{0}.fa'.format(rfid))

    # Both files are opened before either is read, alignment first and
    # metadata second, and are closed as soon as they have been consumed.
    with open(meta_path) as fmeta:
        with open(ali_path) as fali:
            ali = next(aio.parse(fali, 'fasta'))
        info = pickle.load(fmeta)

    fname = cfg.dataPath('rfam/Rfam.seed_tree/{0}.seed_tree'.format(rfid))
    with open(fname) as ftree:
        tree = next(nio.parse(ftree))
    return ali, tree, info
예제 #6
0
def names():
    '''Return the gene and TF name entries of the first mcmc result file.

    Entries of batch/tmp whose name contains "mcmc" are collected in
    os.listdir order and the first one is loaded with sio.loadmat.

    outputs:
      gnames:   data["gene_names"]
      tfnames:  data["tf_names"]
    '''
    mcmc_files = []
    for entry in os.listdir(cfg.dataPath("batch/tmp")):
        if "mcmc" in entry:
            mcmc_files.append(entry)

    mcmc_paths = []
    for fname in mcmc_files:
        mcmc_paths.append(os.path.join(cfg.dataPath("batch/tmp"), fname))

    mat = sio.loadmat(mcmc_paths[0])
    return mat["gene_names"], mat["tf_names"]
예제 #7
0
파일: heatmaps.py 프로젝트: bh0085/compbio
def load(net = 2, num = 1676,
         min_module_size = 10,
         min_go_size = 5,
         max_go_modules = 2,
         prb_threshold = [1e-6,.01]):
    '''Load a heatplot matrix, filter its rows and columns and rescale it.

    inputs:
      net, num:         select daniel/heatplots/net<net>_top<num>_heatplot_matrix.txt
      min_module_size:  a community line must have more tab separated
                        fields than this to count as a big module
      min_go_size:      a term must have a larger count than this in the
                        goterm counts file to be kept
      max_go_modules:   columns whose sum over the rescaled rows is not
                        strictly between 0 and this value are dropped
      prb_threshold:    pair of values; after the -log10 transform the
                        matrix is clipped to the -log10 of these two values
                        (the list is only read, never modified)

    outputs:
      arr:       the filtered matrix, shifted and scaled by its min and max
      col_tits:  array of the column titles that were kept
      ids:       array of the ids (first matrix column) of the rows kept
    '''
    fopen = open(cfg.dataPath('daniel/heatplots/net{0}_top{1}_heatplot_matrix.txt'.format(net,num)),'r')
    # the first line holds the column titles
    l0 = fopen.readline()

    arr = array([[float(elt) for elt in l.split('\t')] for  l in  fopen.xreadlines() if l.strip() != ''])
    # first column: row ids (kept 2-d because of the slice); rest: values
    ids = arr[:,:1]
    arr = arr[:,1:]

    # replace exact zeros by the smallest nonzero entry so log10 is finite
    arr[equal(arr,0)] = np.min(arr[not_equal(arr,0)])
    arr = -1 * log10(arr)


    # NOTE(review): the communities file name hard codes 'top1676' and
    # ignores the num argument -- confirm this is intended.
    clines = open(cfg.dataPath('daniel/heatplots/net{0}_top1676_communities.txt'.\
                                    format(net))).readlines()
    # number of tab separated fields on each community line
    n_per_modules = [len(c.split('\t')) for c in clines]
    glines = open(cfg.dataPath('daniel/heatplots/net{0}_goterm_counts.txt'.\
                                    format(net))).readlines()
    # each non-blank line is '<term>\t<count>'
    n_per_go = dict([c.split('\t') for c in glines if c.strip() != ''])
    for k, v in n_per_go.iteritems(): n_per_go[k] = int(v.strip())

    # indices (0-based line numbers) of communities above the size cutoff
    big_mods = nonzero(greater(n_per_modules,min_module_size))[0]
    big_gos = set([ k for k, v in n_per_go.iteritems() if v > min_go_size])
    
    col_tits = [s.strip()  for s in l0.split('\t')[1:] if s.strip() != '']

    #FOR SOME WEIRD REASON, ONE OF THE COLUMNS THAT SHOULD BE A GO NAME IS 
    #CALLED 'V3'. AS V3 IS NOT PRESENT IN THE GO DESCRIPTIONS LIST,
    #I LEAVE IT OUT.

    acols = array([ idx for idx, elt in enumerate(col_tits) if elt in big_gos ])
    # NOTE(review): each elt is a one element row of ids and is tested for
    # membership in the 0-based line indices of big_mods -- confirm that
    # the ids in the matrix file use the same numbering.
    arows = array([ idx for idx, elt in enumerate(ids) if elt in big_mods ])

    arr = arr[arows][:, acols]

    # clip to the transformed thresholds: thr[0] is the upper bound and
    # thr[1] the lower bound
    thr = -1 * log10(array(prb_threshold))
    arr[greater(arr,thr[0])] = thr[0]
    arr[less(arr,thr[1])] = thr[1]
    
    # shift so the minimum is 0, then divide by the new maximum
    arr = arr -  np.min(arr)
    arr = arr /  np.max(arr)

    # per-column totals over the remaining rows
    go_modules = sum(arr, 0)
    final_cols = nonzero(less(go_modules,max_go_modules)*\
                             greater(go_modules,0))[0]
    acols = acols[final_cols]
    arr = arr[:,final_cols]

    return arr, \
        array([ col_tits[idx] for idx in acols]), \
        array([ ids[idx] for idx in arows])
예제 #8
0
파일: infernal.py 프로젝트: bh0085/projects
def alignment(seqs_in, profile, run_id):
    '''Compute an alignment of multiple sequences to a given
covariance model profile such as constructed by cmbuild
via infernal.profiles.

input:
  seqs_in: a list of biopython SeqRecord objects
  profile: the filename of a covariance model profile
  run_id:  a run id to use for naming temporary files to avoid collisions

output:
  ali:     an rfam multiple sequence alignment
  ref:     the profile reference sequence aligned to ali
  struct:  the profile reference structure aligned to ali

raises:
  Exception when seqs_in holds plain strings (records with ids are needed).

'''
    if isinstance(seqs_in[0], str):
        raise Exception(
            'Sorry but string lists are not supported. We need ids!')

    # Every record is written under a temporary id S000, S001, ... and only
    # the letters AUTGC of its sequence are kept; name_maps restores the
    # original ids afterwards.
    temp_ids = ['S{0:03}'.format(i) for i in range(len(seqs_in))]
    seqs = []
    for temp_id, rec in zip(temp_ids, seqs_in):
        letters = ''.join([let for let in str(rec.seq) if let in 'AUTGC'])
        seqs.append(Bio.SeqRecord.SeqRecord(
            Bio.Seq.Seq(letters, Bio.Seq.Alphabet.RNAAlphabet), temp_id))
    name_maps = dict(zip(temp_ids, [s.id for s in seqs_in]))

    # The temporary file names carry the index of the last input record.
    # This value used to be picked up from a list comprehension variable
    # leaking into the function scope; it is now computed explicitly.
    idx = len(seqs_in) - 1
    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(
        run_id, idx))
    outfile = cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(
        run_id, idx))
    Bio.SeqIO.write(seqs, infile, 'fasta')

    cstr = 'cmalign -o {0} {1} {2}'.format(outfile, profile, infile)
    ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
    # wait for cmalign and drain its stdout; the text itself is not used
    ispc.communicate()

    with open(outfile) as fopen:
        seqs, ref, struct = rutils.stk_parse(fopen)
    ali = ba.MultipleSeqAlignment(seqs)

    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]

    return ali, ref, struct
예제 #9
0
파일: infernal.py 프로젝트: bh0085/projects
def alignment(seqs_in, profile,run_id):
    '''Compute an alignment of multiple sequences to a given
covariance model profile such as constructed by cmbuild
via infernal.profiles.

input:
  seqs_in: a list of biopython SeqRecord objects
  profile: the filename of a covariance model profile
  run_id:  a run id to use for naming temporary files to avoid collisions

output:
  ali:     an rfam multiple sequence alignment
  ref:     the profile reference sequence aligned to ali
  struct:  the profile reference structure aligned to ali

raises:
  Exception when seqs_in holds plain strings (records with ids are needed).

'''
    if isinstance(seqs_in[0], str):
        raise Exception('Sorry but string lists are not supported. We need ids!')

    # Records are renamed S000, S001, ... for the cmalign run and stripped
    # down to the letters AUTGC; name_maps translates the ids back.
    seqs = []
    name_maps = {}
    for i, rec in enumerate(seqs_in):
        letters = ''.join([let for let in str(rec.seq) if let in 'AUTGC'])
        seqs.append(Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(letters,
                                                        Bio.Seq.Alphabet.RNAAlphabet),
                                            'S{0:03}'.format(i)))
    for i, rec in enumerate(seqs_in):
        name_maps['S{0:03}'.format(i)] = rec.id

    # Index of the last record, used in the temporary file names.  The
    # original read it from a leaked list comprehension variable, which
    # only works on python 2.
    idx = len(seqs_in) - 1
    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(run_id,idx))
    outfile= cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(run_id,idx))
    Bio.SeqIO.write(seqs, infile, 'fasta')

    cstr = 'cmalign -o {0} {1} {2}'.format(outfile, profile, infile)
    ispc = spc.Popen(cstr, shell = True,
                     stdout = spc.PIPE)
    # block until cmalign has finished; its stdout is discarded
    ispc.communicate()

    with open(outfile) as fopen:
        seqs, ref, struct = rutils.stk_parse(fopen)
    ali = ba.MultipleSeqAlignment(seqs)

    # (this loop mixed a tab with spaces; it is now indented with spaces only)
    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]

    return ali, ref, struct
예제 #10
0
파일: netutils.py 프로젝트: bh0085/compbio
def parse_CL():
    '''Parse network/CL.geneexp and pickle the result to network/CL.pickle.

    Every line of the input that contains at least one token is split on
    whitespace: the first token is the name, every following token is
    converted to float.  The resulting {name: [floats]} dict is pickled;
    nothing is returned.
    '''
    token_re = re.compile(r'([^\s]+)')

    with open(config.dataPath('network/CL.geneexp')) as f:
        elts = f.read().split('\n')

    seqdict = {}
    for e in elts:
        tokens = [m.group(1) for m in token_re.finditer(e)]
        if not tokens:
            continue
        seqdict[tokens[0]] = [float(t) for t in tokens[1:]]

    # The 'w' mode belongs to open(); it used to be passed to dataPath(),
    # which left the pickle target opened for reading.
    with open(config.dataPath('network/CL.pickle'), 'w') as out:
        pickle.dump(seqdict, out)
예제 #11
0
파일: paml.py 프로젝트: bh0085/compbio
def run_paml(tree_in,ali_in, run_id= 'T%05i' % (0,),
             verbose = False):
  '''
  Given an input tree in the form of a Biopython tree
  with branch lengths and names, write it and the alignment
  to a per-run directory and run paml's baseml there.

  inputs:
    tree_in:  tree written in newick format (plain = True)
    ali_in:   alignment written in phylip format
    run_id:   suffix of the run directory and of all file names
    verbose:  when true, print the captured stdout of baseml

  output:
    rstfile:  path of the file named 'rst' inside the run directory.
              The path is returned without checking that baseml
              actually produced the file.
  '''

  paml_d = config.dataPath('paml')
  run_d = config.dataPath(os.path.join(paml_d , 'run_{0}'.format(run_id)))
  if not os.path.isdir(paml_d): os.mkdir(paml_d)
  if not os.path.isdir(run_d): os.mkdir(run_d)

  # All file names below are relative, so the work happens inside run_d.
  # The previous working directory is restored even when a step fails.
  old_cwd = os.getcwd()
  os.chdir(run_d)
  try:
    outfilepath = 'paml_tree_{0}.paml'.format(run_id)

    treefilepath = 'paml_tree_{0}.newick'.format(run_id)
    with open(treefilepath, 'w') as treefile:
      phylo.write(tree_in, treefile, 'newick', plain = True)

    alifilepath = 'paml_tree_{0}.phylip'.format(run_id)
    with open(alifilepath, 'w') as alifile:
      aio.write(ali_in, alifile, 'phylip')

    ctlfilepath = 'baseml_{0}.ctl'.format(run_id)
    with open(ctlfilepath, 'w') as ctlfile:
      ctlfile.write(make_baseml(treefilepath,
                                alifilepath,
                                outfilepath,
                                ancestors = 1))

    command = 'baseml {0} '.format(ctlfilepath)

    #fix a damned paml bug: append '  I' to the first line of the
    #phylip file before baseml reads it.
    sed_command = r"sed -i -e '1 s/$/\ \ I/' {0}"\
        .format(alifilepath)

    sprc = subprocess.Popen(sed_command, stdout = subprocess.PIPE, shell = True)
    sprc.communicate()
    pprc = subprocess.Popen(    command, stdout = subprocess.PIPE, shell = True)
    comms = pprc.communicate()
    if verbose:
      print(comms[0])
  finally:
    os.chdir(old_cwd)

  rstfile = os.path.join(run_d,'rst')
  return rstfile
예제 #12
0
파일: netutils.py 프로젝트: bh0085/compbio
def print_soheil():
  '''Write one time series table per selected target to network/<target>.txt.

  Targets returned by parse_net() that have at least 8 weights are kept
  and every key after the first 20 is dropped.  For each remaining
  target the file holds a header row naming the target and its 8 TFs
  with the largest weights, followed by one row per time point (entries
  0..19 of the series from load_TS()); cells are formatted with '{0:20}'.
  '''
  n_tfs = 8
  trgs, _unused_tfs = parse_net()

  trg_subset = {}
  for tgkey, v in trgs.items():
    if len(v['weights']) >= n_tfs:
      trg_subset[tgkey] = v
  for surplus in list(trg_subset.keys())[20:]:
    del trg_subset[surplus]

  TS = load_TS()

  tpts = arange(20)
  for target_name, tg in trg_subset.items():
    # first column: the target's own series
    gts = array(TS[target_name])[tpts]
    rows = ['{0:20}'.format(gts[i]) for i in range(len(gts))]
    rows.extend([''] * (len(tpts) - len(rows)))

    # remaining columns: the TFs with the largest weights, biggest first
    best = argsort(tg['weights'])[::-1][0:n_tfs]
    tf_names = [tg['tfs'][i] for i in best]
    for tf in tf_names:
      fts = array(TS[tf])[tpts]
      for i in range(len(gts)):
        rows[i] = rows[i] + ' {0:20}'.format(fts[i])

    titles = ['{0:20}'.format(target_name)]
    for tf in tf_names:
      titles.append('{0:20}'.format(tf))
    header = ' '.join(titles)

    out_name = '{0}.txt'.format(target_name)
    fopened = open(config.dataPath('network/' + out_name), 'w')
    fopened.write(header + '\n' + '\n'.join(rows))
    fopened.close()
예제 #13
0
파일: hapmap.py 프로젝트: bh0085/compbio
def polyfile(chr = 10, grp = 'ASW', **kwargs):
    '''Return the lines of one gzipped polymorphic genotype file.

    inputs:
      chr:       value inserted after 'genotypes_chr' in the file name
      grp:       group label inserted into the file name (default 'ASW')
      **kwargs:  accepted and ignored

    output:
      contents:  the list returned by readlines() on the gzip stream
    '''
    root = cfg.dataPath('hapmap/phase3/polymorphic')
    fname = os.path.join(root,
                         'genotypes_chr{0}_{1}_phase3.2_nr.b36_fwd.txt.gz'
                         .format(chr, grp))
    # close the gzip handle explicitly; it used to be left open
    handle = gzip.open(fname)
    try:
        contents = handle.readlines()
    finally:
        handle.close()
    return contents
예제 #14
0
def tmp_fnames(run_id, num):
  '''Build num scratch file paths for a run under batch/tmp.

  The paths are <batch/tmp>/<run_id>_tmp000, <run_id>_tmp001, ... in
  increasing order.  Only the names are produced; no file is created.
  '''
  tmp_dir = cfg.dataPath('batch/tmp')
  names = []
  for idx in range(num):
    suffix = '_tmp{0:03d}'.format(idx)
    names.append(os.path.join(tmp_dir, run_id + suffix))
  return names
예제 #15
0
파일: phyml.py 프로젝트: bh0085/compbio
def tree(alignment,
         run_id = 'T%05i' % (0,),
         bionj = False):
  '''Run phyml on an alignment and return the tree it writes.

  inputs:
    alignment:  alignment written in phylip format to infile<run_id>
    run_id:     suffix of the input file name
    bionj:      selects the value passed to phyml's -o option:
                'n' when true, 'tlr' otherwise

  output:
    the tree read (newick) from infile<run_id>_phyml_tree.txt

  The command runs inside the 'phyml' data directory; the previous
  working directory is restored before returning, also on failure.
  '''
  old_cwd = os.getcwd()
  new_wd = config.dataPath('phyml')
  if not os.path.isdir(new_wd): os.mkdir(new_wd)
  os.chdir(new_wd)
  try:
    infilepath = 'infile{0}'.format(run_id)
    with open(infilepath, 'w') as infile:
      aio.write(alignment, infile, 'phylip')

    command = 'phyml --quiet -i {0} -o {1} '.format(infilepath, 'n' if bionj else 'tlr' )
    print(command)
    # subprocess.call with stdout = PIPE never reads the pipe and can
    # block once it fills up; communicate() drains it while waiting.
    prc = subprocess.Popen(command,
                           shell = True,
                           stdout = subprocess.PIPE)
    prc.communicate()

    treefilepath = infilepath + '_phyml_tree.txt'
    with open(treefilepath) as treefile:
      result = phylo.read(treefile, 'newick')
  finally:
    os.chdir(old_cwd)
  return result
예제 #16
0
파일: run_mcmc.py 프로젝트: bh0085/compbio
def launch_many(run_id):
    '''
Generate script parameters and launch a bunch of bsub jobs.

Designed to be run on the cluster via an interactive shell.
Note: If this is not run on cluster, since it does not look
up a remote url for files, it won't be able to find expression
data.

input:
  run_id:  passed through to bsub.eyeball

output:
  dict with the launched commands ('cmds') and the list of
  parameter dicts they were built from ('inputs')

'''
    print('Launching all jobs!')

    #MAKE INPUTS
    expr_filenames = []
    for ttnum in range(70):
        expr_filenames.append('soheil/expression_c4d_n4_tt_{0}.mat'.format(ttnum))
    expr_filenames.append('soheil/expression_c4d_n4_intercluster.mat')

    urls = [cfg.dataURL(f) for f in expr_filenames]
    remote_exprnames = [cfg.dataPath(url) for url in urls]

    # One axis per parameter, outermost first.  The input dicts are the
    # cartesian product of all axes, with the last axis varying fastest.
    param_axes = [
        ('out_iter_num', array([25], double)),
        ('in_iter_num', array([100], double)),
        ('k', array([6], double)),
        ('beta', array([4], double)),
        ('f_mix', array([2], double)),
        ('f_sim', array([.8], double)),
        ('f_en_in', array([1.], double)),
        ('f_en_out', array([1.], double)),
        ('th_cor', array([.6], double)),
        ('trunc_value', array([3], double)),
        ('degree_bound', array([3], double)),
        ('filename', remote_exprnames),
    ]
    inp_dicts = [{}]
    for key, values in param_axes:
        inp_dicts = [dict(partial, **{key: value})
                     for partial in inp_dicts
                     for value in values]

    #MAKE EYEBALL
    this_file = os.path.abspath(inspect.stack()[0][1])
    eyeball = bsub.eyeball(run_id,
                           this_file,
                           inp_dicts,
                           func = 'run_single',
                           name = 'mcmc_',
                           mem = 3)

    #LAUNCH EYEBALL JOBS
    eyeball.launch()

    #RETURN A LIST OF LAUNCHED JOBS
    return {'cmds': eyeball.cmds,
            'inputs': inp_dicts}
예제 #17
0
파일: analyze.py 프로젝트: bh0085/compbio
    def setCRE(**kwargs):
        '''Load the CRE randomization design and its measured values.

        Two tab separated files under CRE/27k/ are read:
          CRE_Randomization_Design.txt  first column key, second column sequence
          CRE_Randomization.dat         first line skipped; first column key,
                                        remaining columns values
        Only keys present in both files are kept.

        returns:
          cre:          array with one row per key holding the sequence
                        split into single characters
          cre_rndvals:  array with one row of floats per key
          keys:         the shared keys, in the row order of both arrays
        '''
        # read both files completely and close them right away
        with open(cfg.dataPath('CRE/27k/CRE_Randomization_Design.txt')) as des_file:
            des_lines = des_file.readlines()
        with open(cfg.dataPath('CRE/27k/CRE_Randomization.dat')) as rnd_file:
            rnd_lines = rnd_file.readlines()

        rnd_by_key = {}
        for line in rnd_lines[1:]:
            fields = [elt.strip() for elt in line.split('\t')]
            rnd_by_key[fields[0]] = fields[1:]

        seq_by_key = {}
        for line in des_lines:
            fields = [elt.strip() for elt in line.split('\t')]
            seq_by_key[fields[0]] = fields[1]

        keys = list(set(rnd_by_key.keys()).intersection(seq_by_key.keys()))

        cre = array([list(seq_by_key[k]) for k in keys])
        cre_rndvals = array([array(rnd_by_key[k], float) for k in keys])
        return cre, cre_rndvals, keys
예제 #18
0
파일: analyze.py 프로젝트: bh0085/compbio
    def setIFNB(**kwargs):
        '''Load the IFNB randomization design and its measured values.

        Two tab separated files under CRE/27k/ are read:
          IFNB_Randomization_Design.txt  first column key, second column sequence
          IFNB_Randomization.dat         first line skipped; first column key,
                                         remaining columns values
        Only keys present in both files are kept.

        returns:
          IFNB:          array with one row per key holding the sequence
                         split into single characters
          IFNB_rndvals:  array with one row of floats per key
          keys:          the shared keys, in the row order of both arrays
        '''
        # read both files completely and close them right away
        with open(cfg.dataPath('CRE/27k/IFNB_Randomization_Design.txt')) as des_file:
            des_lines = des_file.readlines()
        with open(cfg.dataPath('CRE/27k/IFNB_Randomization.dat')) as rnd_file:
            rnd_lines = rnd_file.readlines()

        rnd_by_key = {}
        for line in rnd_lines[1:]:
            fields = [elt.strip() for elt in line.split('\t')]
            rnd_by_key[fields[0]] = fields[1:]

        seq_by_key = {}
        for line in des_lines:
            fields = [elt.strip() for elt in line.split('\t')]
            seq_by_key[fields[0]] = fields[1]

        keys = list(set(rnd_by_key.keys()).intersection(seq_by_key.keys()))

        IFNB = array([list(seq_by_key[k]) for k in keys])
        IFNB_rndvals = array([array(rnd_by_key[k], float) for k in keys])
        return IFNB, IFNB_rndvals, keys
예제 #19
0
파일: exp.py 프로젝트: bh0085/compbio
def cluster(similarities, self_sim):
  '''Write the similarity and preference files of a clustering run.

  inputs:
    similarities:  square table indexed as similarities[i, j]
    self_sim:      preference value written once per row

  Files written under bdtnp/clustering/nuclei/ (the directory is
  created when missing):
    Similarities.txt  one '<i+1>   <j+1>  <value>' line per ordered
                      pair with i != j
    Preferences.txt   self_sim repeated len(similarities) times
  '''
  nuclei_dir = 'bdtnp/clustering/nuclei/'
  if not os.path.isdir(cfg.dataPath(nuclei_dir)):
    os.mkdir(cfg.dataPath(nuclei_dir))

  ny = len(similarities)
  simfile = open(cfg.dataPath('bdtnp/clustering/nuclei/Similarities.txt'), 'w')
  ssfile = open(cfg.dataPath('bdtnp/clustering/nuclei/Preferences.txt'), 'w')

  # all lines are formatted before the first one is written
  simlines = []
  for i in range(ny):
    for j in range(ny):
      if i == j:
        continue
      simlines.append('{0:05d}   {1:05d}  {2:g}\n'.format(
          i + 1, j + 1, similarities[i, j]))
  simfile.writelines(simlines)

  preflines = ['{0:0.8g}\n'.format(self_sim) for _ in range(ny)]
  ssfile.writelines(preflines)

  ssfile.close()
  simfile.close()
예제 #20
0
파일: endo.py 프로젝트: bh0085/zhang
def enzyme_link(name):
    '''Return the href of the first link whose text contains name.

    The links are taken from the saved page zhang/neb_products.html and
    the comparison ignores case.  All links are printed as a side effect.

    raises:
      IndexError when no link matches.
    '''
    with open(cfg.dataPath('zhang/neb_products.html')) as page:
        index = page.read()
    d = pq(index)
    print(d('a'))
    # `this` is not defined in this function; the filter() call is
    # expected to provide it as the element currently being tested.
    named_elt = d('a').filter(lambda x: name.lower() in pq(this).text().lower())[0]
    product_link = named_elt.attrib['href']
    # (an unreachable 'return 0' used to follow this return)
    return product_link
예제 #21
0
def load_seqs(seq_dir = cfg.dataPath('zhang/tal_array/seqs')):
    '''Read every file of seq_dir into a {title: sequence} dict.

    For each file the title is the first line without its first
    character and without surrounding whitespace; the sequence is the
    concatenation of all following lines, each stripped.
    '''
    seqs = {}
    for fname in os.listdir(seq_dir):
        handle = open(os.path.join(seq_dir, fname))
        lines = handle.readlines()
        title = lines[0][1:].strip()
        pieces = []
        for line in lines[1:]:
            pieces.append(line.strip())
        seqs[title] = ''.join(pieces)
        handle.close()
    return seqs
예제 #22
0
def setModules(**kwargs):
    '''Collect, over all mcmc result files, which TF combinations regulate which genes.

    Every entry of batch/tmp whose name contains "mcmc" is loaded with
    sio.loadmat; the first ten characters of its name are used as the id
    for butils.load_data(id, "input").

    returns:
      modules, lin_modules: dicts keyed by a tuple of TF names, built from
      data["model"] and data["model_linear"] respectively.  Each value is
      a dict of four parallel lists: 'genes', 'coefs', 'fpaths' (result
      file) and 'clust_fpaths' (the "filename" entry of the run's input).
    '''
    files = [l for l in os.listdir(cfg.dataPath("batch/tmp")) if "mcmc" in l]
    fpaths = [os.path.join(cfg.dataPath("batch/tmp"), f) for f in files]
    # the run id is taken to be the first ten characters of the file name
    ids = [l[0:10] for l in files]
    inps = [butils.load_data(i, "input") for i in ids]

    modules = {}
    lin_modules = {}
    for fidx, f in enumerate(fpaths):
        print "Getting module info for: {0}".format(f)
        data = sio.loadmat(f)
        # unwrap the nested containers that hold each entry
        tfnames = [d[0][0] for d in data["tf_names"]]
        tgnames = [d[0][0] for d in data["gene_names"]]
        coefs = [d[0][0] for d in data["coefs_dic_nonlinear"]]
        inp = inps[fidx]

        # one flattened list of terms per target gene
        term_list = [list(it.chain(*mod)) for mod in data["model"]]
        for j, terms in enumerate(term_list):
            # skip genes whose terms are all empty
            if sum([len(t) for t in terms]) == 0:
                continue
            for k, t in enumerate(terms):
                # t - 1 shifts the stored TF indices down by one before the
                # lookup (presumably 1-based indices from matlab -- confirm)
                mod = tuple([tfnames[i] for i in sorted(t - 1)])
                mod_d = modules.get(mod, dict(genes=[], coefs=[], fpaths=[], clust_fpaths=[]))
                mod_d["genes"].append(tgnames[j])
                mod_d["coefs"].append(coefs[j][k])
                mod_d["clust_fpaths"].append(inp["filename"])
                mod_d["fpaths"].append(f)
                modules[mod] = mod_d

        # NOTE(review): lin_coefs is read from "coefs_dic_nonlinear" and is
        # never used; the loop below stores the nonlinear coefs for the
        # linear model as well.  This looks like a copy/paste slip --
        # confirm which key of the .mat file holds the linear coefficients.
        lin_coefs = [d[0][0] for d in data["coefs_dic_nonlinear"]]
        term_list = [list(it.chain(*mod)) for mod in data["model_linear"]]
        for j, terms in enumerate(term_list):
            if sum([len(t) for t in terms]) == 0:
                continue
            for k, t in enumerate(terms):
                mod = tuple([tfnames[i] for i in sorted(t - 1)])
                mod_d = lin_modules.get(mod, dict(genes=[], coefs=[], fpaths=[], clust_fpaths=[]))
                mod_d["genes"].append(tgnames[j])
                mod_d["coefs"].append(coefs[j][k])
                mod_d["fpaths"].append(f)
                mod_d["clust_fpaths"].append(inp["filename"])

                lin_modules[mod] = mod_d
    return modules, lin_modules
예제 #23
0
파일: io.py 프로젝트: bh0085/compbio
  def setNet(**kwargs):
    '''Parse a TF/target/weight network file into per-target and per-TF dicts.

    keywords:
      net_name:  'unsup' (default) or 'logistic'; selects the file read
                 from network/patrick/
      reset:     passed on, modulo 2, to getTC and getCL (default 0)

    returns:
      (trg_d, tf_d)
      trg_d[target] = {'color': array([n, 0, 0]), 'tfs': [...], 'weights': [...]}
                      where n counts the targets in sorted order from 1.0
      tf_d[tf]      = {'targets': [...], 'weights': [...]}
      Only edges whose TF is a key of TC and whose target is a key of CL
      are kept.

    raises:
      Exception for an unknown net_name.
    '''
    net_name = kwargs.get('net_name', 'unsup')
    if net_name == 'unsup':
      netfile = 'unsup_patrick.txt'
    elif net_name == 'logistic':
      netfile = 'logistic_0.6.txt'
    else:
      raise Exception()

    fpath = config.dataPath('network/patrick/{0}'.format(netfile))
    TC = getTC( reset = mod(kwargs.get('reset',0),2))
    CL = getCL( reset = mod(kwargs.get('reset',0),2))
    with open(fpath) as net_handle:
      nwdata = net_handle.read()

    #A few accessors for the [tf, target, weight] triples used below
    trgfun = lambda x: x[1]
    wtfun = lambda x: float(x[2])
    tffun = lambda x: x[0]

    #one edge per line: optional spaces, then tf, target and weight
    r = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)\s+(?P<weight>\S+)',
                   re.M)
    matches = list(r.finditer(nwdata))
    #Unsorted lists of tfs and targets
    targets = [m.group('target') for m in matches]
    tfs = [m.group('tf') for m in matches]
    weights = [m.group('weight') for m in matches]

    #Concat the data, ordered by tf name, for easier grouping
    cat = []
    for i in np.argsort(tfs):
      if tfs[i] in TC and targets[i] in CL:
        cat.append([tfs[i], targets[i], weights[i]])

    #Extract a dictionary with information for each target.
    trg_d = {}
    count = 0.0
    for k, g in it.groupby(sorted(cat, key = trgfun), key = trgfun):
      l = list(g)
      count += 1.0
      trg_d[k] = {'color': np.array([count, 0, 0]),
                  'tfs': [tffun(x) for x in l],
                  'weights': [wtfun(x) for x in l]}

    #Extract a dictionary with information for each TF.
    #cat is already ordered by tf, as groupby requires.
    tf_d = {}
    for k, g in it.groupby(cat, key = tffun):
      l = list(g)
      tf_d[k] = {'targets': [trgfun(x) for x in l],
                 'weights': [wtfun(x) for x in l]}

    return (trg_d, tf_d)
예제 #24
0
def view2():
    '''Scatter every numeric input parameter against "improve_ratio" and save the figure.

    Every tenth entry of batch/outputs whose name contains "mcmc" is used;
    the first ten characters of its name are the id handed to
    butils.load_data for both the "input" and the "output" record.  One
    axes per input parameter is stacked vertically in figure 1 and the
    figure is saved to figs/soheil/broad_run0_psplits.ps.

    NOTE(review): the function always ends with `raise Exception()` right
    after saving, so the final `return inps` is never reached -- confirm
    whether the raise is a leftover debugging stop.
    '''
    files = [l for l in os.listdir(cfg.dataPath("batch/outputs")) if "mcmc" in l]
    ids = [l[0:10] for l in files]
    # keep only every tenth run
    ids = ids[::10]

    inps = [butils.load_data(i, "input") for i in ids]
    outs = [butils.load_data(i, "output") for i in ids]

    # idxs_good = nonzero(greater([elt.get('improve_ratio') for elt in outs],, .2 )[0]
    # (filter disabled: every run is treated as good)
    idxs_good = range(len(outs))

    outs = [o for idx, o in enumerate(outs) if idx in idxs_good]
    inps = [i for idx, i in enumerate(inps) if idx in idxs_good]

    params = inps[0].keys()

    f = myplots.fignum(1, (8, 8))

    params = params

    for i, p in enumerate(params):
        # horizontal strip number i out of len(params)
        ax = f.add_axes([0.05, i * (1.0 / len(params)), 0.9, 1.0 / len(params)], title=p)
        # ax.set_yticks([])
        # ax.set_xticks([])

        xvals = [elt.get(p) for elt in inps]
        # string valued parameters are not plotted (their axes stay empty)
        if type(xvals[0]) == str:
            continue
        yvals = [elt.get("improve_ratio") for elt in outs]
        yvals2 = [elt.get("stay_same") for elt in outs]

        # NOTE(review): xvals, yvals and yvals2 are plain lists here, so
        # `+=` with an array extends each list with the random values
        # instead of adding jitter element by element -- confirm the
        # intent (converting to arrays first would give jitter).
        yvals += random.rand(*shape(yvals)) * (max(yvals) - min(yvals)) / 50
        yvals2 += random.rand(*shape(yvals)) * (max(yvals) - min(yvals)) / 50
        xvals += random.rand(*shape(xvals)) * (max(xvals) - min(xvals)) / 50
        ax.scatter(xvals, yvals)

        # ax.scatter(xvals , yvals + yvals2,   25, color = 'red')
        ax.annotate(p, [0, 0], xycoords="axes fraction", ha="left", va="bottom")

    f.savefig(cfg.dataPath("figs/soheil/broad_run0_psplits.ps"))
    raise Exception()

    return inps
예제 #25
0
def enzyme_link(name):
    '''Look up the product link of an enzyme in the saved NEB products page.

    input:
      name:  text to look for, compared case-insensitively with the
             text of every link in zhang/neb_products.html

    output:
      the 'href' attribute of the first matching link

    raises:
      IndexError when no link matches.

    All links of the page are printed as a side effect.
    '''
    with open(cfg.dataPath('zhang/neb_products.html')) as page:
        d = pq(page.read())
    print(d('a'))
    wanted = name.lower()
    # `this` is not bound anywhere in this function; filter() is expected
    # to supply it as the element under test while the callback runs.
    named_elt = d('a').filter(
        lambda x: wanted in pq(this).text().lower())[0]
    # (the unreachable 'return 0' after this return was removed)
    return named_elt.attrib['href']
예제 #26
0
파일: __init__.py 프로젝트: bh0085/compbio
def get_leaf_16s(clade):
  '''Collect the 16s sequences of the terminals near the first leaf of clade.

  The subtree used is rooted at the third-from-last entry of the path
  from clade to its first terminal.  Each terminal of that subtree is
  renamed 'SEQ<n>  ' (this modifies the clade objects in place) and its
  16s rna is looked up through clade_gbacc, gbl and rna4gbid.

  NOTE(review): the function raises a bare Exception right after the
  collection loop, so the alignment and tree files further down are
  never written -- confirm whether the raise is a leftover debugging stop.
  '''
  # NOTE(review): cltree is never used.
  cltree = Phylo.BaseTree.Tree(clade)
  leaves = clade.get_terminals()
  
  l0 = leaves[0]
  p0 = clade.get_path(l0)[-3]
  # NOTE(review): siblings is never used.
  siblings = p0.get_terminals()

  rrnas = []
  random.seed(5)

  t0 = Phylo.BaseTree.Tree(p0)
  
  ct = 0
  names = []
  for l in t0.get_terminals():
    gbacc= clade_gbacc(l)
    gbid = gbl.search_sorted(gbl.prefix(gbacc), gbacc)
    rrna = rna4gbid(gbid, dbname = '16s')
    # store the sequence as a list of character codes
    rrnas.append(list(map(lambda x: ord(x),rrna)))
    l.name = 'SEQ%i  '%(ct)
    names.append(l.name)
    ct += 1

  raise Exception()
  # ---- everything below is unreachable because of the raise above ----
  arr = array([list(x) for x in rrnas])
  # per column: number of sequences that do not have a gap there
  letters = sum( not_equal(arr, ord('-')), 0)

  # keep the columns where at least one sequence has a letter
  ungapped_arr = arr[:,nonzero(letters)[0]]
  # NOTE(review): replace('-','-') changes nothing.
  seq_letters = [''.join([chr(x) for x in y]).replace('-','-') for y in ungapped_arr]
  
  #there are about a thousand nonzero elements and really quite few 
  #gaps in the sequence that we get out of this method.

  align = Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
  for i in range(len(seq_letters)): align.add_sequence(names[i], seq_letters[i])
  AlignIO.write(align,open(config.dataPath('alignments/halo_16s.phylip'),'w'), 'phylip')
  AlignIO.write(align,open(config.dataPath('alignments/halo_16s.fasta'),'w'), 'fasta')
  AlignIO.write(align,open(config.dataPath('alignments/halo_16s.nexus'),'w'), 'nexus')

  #t0 = Phylo.BaseTree.Tree(p0)
  Phylo.write(t0,open(config.dataPath('trees/halo_16s.newick'), 'w'), 'newick')
예제 #27
0
def mat_tmp_fnames(run_id, num):
  '''Build num scratch file paths ending in .mat under batch/tmp.

  The paths are <batch/tmp>/<run_id>_tmp000.mat, <run_id>_tmp001.mat, ...
  They are meant as targets for matlab saves
  (matlab doesn't like loading save files without .mat extension).
  Only the names are produced; no file is created.
  '''
  tmp_dir = cfg.dataPath('batch/tmp')
  names = []
  for idx in range(num):
    leaf = run_id + '_tmp{0:03d}.mat'.format(idx)
    names.append(os.path.join(tmp_dir, leaf))
  return names
예제 #28
0
파일: infernal.py 프로젝트: bh0085/projects
def profiles(seq, structs, run_id):
    '''Build one cm profile per structure with cmbuild --rsearch,
from a single sequence and fixed secondary structures.

Calling this once for several structures avoids file name
collisions: the index of each structure is part of its file names.

input:
  seq:      a biopython SeqRecord object.
  structs:  a sequence of structures, each an iterable of index
            pairs (p[0], p[1]) marking paired positions.
  run_id:   a run id to avoid collisions of temporary files.

output:
  profiles: paths to files containing cm profiles for each struct

'''
    # Render every structure as bracket notation first: '(' at the first
    # index of a pair, ')' at the second, '.' everywhere else.
    stk_texts = []
    for pairs in structs:
        marks = ['.'] * len(seq)
        for p in pairs:
            marks[p[0]] = '('
            marks[p[1]] = ')'
        stk_texts.append(rutils.stk_format(seq, ''.join(marks)))

    # Then write each text to disk and build its profile.
    cm_paths = []
    for idx, stktext in enumerate(stk_texts):
        stem = 'infernal/temp/{0}_{1:03}_{2}'
        stkfile = cfg.dataPath((stem + '.stk').format(seq.id, idx, run_id))
        cmfile = cfg.dataPath((stem + '.cm').format(seq.id, idx, run_id))

        fopen = open(stkfile, 'w')
        fopen.write(stktext)
        fopen.close()

        matrix = cfg.dataPath('infernal/matrices/RIBOSUM85-60.mat')
        cstr = 'cmbuild -F --rsearch {0} {1} {2}'.format(matrix, cmfile, stkfile)
        ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
        # wait for cmbuild to finish; its stdout is not used
        ispc.communicate()
        cm_paths.append(cmfile)

    return cm_paths
예제 #29
0
파일: analyze.py 프로젝트: bh0085/compbio
 def set_motifs(**kwargs):
     '''Run motif-match on the promoter sequences and group the hits by name.

     The file CRE/<promoter_type>_for_motifs.txt is piped into
     motif-match (promoter_type comes from the enclosing scope).  Each
     output line is split on single spaces.

     returns:
       dict mapping field 1 of a line to a list of hit dicts with the keys
       'motif' (field 0), 'start' and 'end' (fields 2 and 3 as int),
       'strand' (field 4) and 'score' (field 6 as float).
     '''
     mfpath = cfg.dataPath('motifs/all_vert_motifs.txt')
     fpath = cfg.dataPath('CRE/{0}_for_motifs.txt'.format(promoter_type))
     cmd = 'motif-match -n 1 -m {0}  -V 1'.format(mfpath)
     prc = spc.Popen(cmd, shell = True, stdin = spc.PIPE, stdout = spc.PIPE)
     stdout_text = prc.communicate(input = open(fpath).read())[0]

     seqs = {}
     for raw in stdout_text.splitlines():
         fields = raw.split(' ')
         name = fields[1]
         hit = {'motif': fields[0],
                'start': int(fields[2]),
                'end': int(fields[3]),
                'strand': fields[4],
                'score': float(fields[6])}
         seqs.setdefault(name, []).append(hit)

     return seqs
예제 #30
0
def sort_prefixes(volume_name="cb"):
    '''Sort the lines of every file under genbank/prefixes in place.

    Each file is read completely, its lines are sorted and written back
    to the same path; the entry name is printed once it is done.
    volume_name is accepted but not used.
    '''
    prefix_path = config.dataPath(config.dataURL("genbank/prefixes"))
    for entry in os.listdir(prefix_path):
        target = os.path.join(prefix_path, entry)

        reader = open(target)
        ordered = reader.readlines()
        ordered.sort()
        reader.close()

        writer = open(target, "w")
        writer.writelines(ordered)
        writer.close()

        print(entry)
예제 #31
0
파일: netutils.py 프로젝트: bh0085/compbio
def load_TS(reset = 0):
    '''
    Return the object stored under ``default_name`` in the memo store.

    reset  [0]  When false, read the stored copy through ``mem.read`` and
                raise if the read reports failure.  When true, load
                ``network/TC.pickle`` from the data directory, write it to
                the memo store and return it.

    Raises:
      Exception  if ``reset`` is false and ``mem.read`` reports no success.
    '''
    hardcopy = True
    if not reset:
        #no reason to use name... only one cl is available
        out, sxs = mem.read(default_name, hardcopy = hardcopy, np = False)
        if not sxs:
            raise Exception(
                'mem.read failed for {0!r}; call load_TS(reset = 1) to rebuild it'
                .format(default_name))
    else:
        # close the pickle handle as soon as the object is loaded
        with open(config.dataPath('network/TC.pickle')) as fopen:
            out = pickle.load(fopen)
        mem.write(default_name , out, hardcopy = hardcopy, np = False)

    return out
예제 #32
0
파일: utils.py 프로젝트: bh0085/projects
def select_exemplars_from_clustering(structs,struct_counts,seq, draw = False):
      min_count = 2

      freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count]
      if len(freq_structs) < 10:
          min_count = 1
          freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count]
      struct_counts= [s for i, s in enumerate(struct_counts) if s >= min_count]
      structs = freq_structs
      
      struct_energies = [struct_energy(seq, s) for s in structs]
      if len(structs) > 225:
          high_e = argsort(struct_energies)[::-1][:225]
          structs =[ structs[i] for  i in high_e]
          struct_counts =[ struct_counts[i] for  i in high_e]
          struct_energies =[ struct_energies[i] for  i in high_e]
                     
      
      
      clusters = cluster_2(structs,  struct_counts, seq, ptype = 'full_pairs')
      if draw:
          print 'DRAWING Clusters'
          verts = struct_verts(structs, seq, 'tempname')
          cluster_2_show(clusters, verts)
          f = plt.gcf()
          f.savefig(cfg.dataPath('figs/RNAfoldz/clusters_{0}.ps'.format(savename)))
      exemplars = set(clusters)
      cluster_exemplars = []
      for e in exemplars:
          reps =array([ (i, eng) for i, eng in enumerate(struct_energies) if clusters[i] == e])
          min_rep = reps[:,0][argmax(reps[:,1])]
          cluster_exemplars.append(min_rep)
          
      cluster_exemplars = set([int(e) for e in cluster_exemplars])
      sorted_exemplars = set(argsort(struct_counts)[::-1][:n_countsorted])
      energy_exemplars = set(argsort(struct_energies)[::-1][:n_esorted])
      final_exemplars = cluster_exemplars.union(sorted_exemplars).union(energy_exemplars)

      print '''Structural exemplars found:
Clustering:     {0}  {4}
Count sorting:  {1}  {5}
Energy sorting: {2}  {6}

Total unique:   {3}'''.format(len(cluster_exemplars),len(sorted_exemplars), 
                              len(energy_exemplars),len(final_exemplars),
                              mean([struct_energies[i] for i in cluster_exemplars]),
                              mean([struct_energies[i] for i in sorted_exemplars]),
                              mean([struct_energies[i] for i in energy_exemplars]))


      final_structs, final_energies = zip(*[(structs[i],struct_energies[i]) for i in  final_exemplars])
      return final_structs, final_energies
예제 #33
0
def get_run_num():
    '''
    Return one more than the largest integer found in any file name under
    batch/inputs, batch/outputs and batch/logs.

    Every run of decimal digits in every file name counts as a candidate.
    When no digits are found anywhere the result is 0.
    '''
    digits = re.compile('([0-9]+)')
    listings = [os.listdir(cfg.dataPath(d))
                for d in ['batch/inputs',
                          'batch/outputs',
                          'batch/logs']]

    # -1 is the floor, so an empty set of directories yields run number 0
    seen = [-1]
    for listing in listings:
        for fname in listing:
            for token in digits.findall(fname):
                seen.append(int(token))
    return max(seen) + 1
예제 #34
0
파일: analyze.py 프로젝트: bh0085/compbio
def write_seqs_to_motifs():
    '''
    Write the mutant sequences to ``CRE/<promoter_type>_for_motifs.txt``.

    One record is emitted per sequence returned by ``get_mutants``:

      A <key> 1 <len(cons)>
      ><promoter_type>
      <lower-cased sequence>
      <blank line>

    ``promoter_type`` is read from the enclosing module.
    '''
    seqs, _rnd, keys = get_mutants()
    ncons = len(get_cons())

    records = []
    for i, c in enumerate(seqs):
        # the trailing '\n' element makes each record end in a blank line
        records.append('\n'.join(['A {0} 1 {1}'.format(keys[i], ncons),
                                  '>{0}'.format(promoter_type),
                                  ''.join(c).lower(),'\n']))

    outpath = cfg.dataPath('CRE/{0}_for_motifs.txt'.format(promoter_type))
    # ``with`` closes the handle, so the contents are flushed on return
    with open(outpath, 'w') as outfile:
        outfile.write(''.join(records))
예제 #35
0
def errors():
    """Collect per-run arrays from the mcmc .mat files in batch/tmp.

    Files whose name contains "mcmc" are considered.  The first ten
    characters of each name are used as the id for
    ``butils.load_data(id, "input")``, and only runs whose input reports
    ``out_iter_num`` greater than -1 are kept.

    Returns:
      (errors, staysames, improves, gnames) where the first three are lists
      holding the "error", "stay_same" and "improve_ratio" entries of each
      kept file, and gnames is the "gene_names" entry of the last kept file
      (None when no file was kept).
    """
    tmpdir = cfg.dataPath("batch/tmp")
    files = [l for l in os.listdir(tmpdir) if "mcmc" in l]
    fpaths = [os.path.join(tmpdir, f) for f in files]
    ids = [l[0:10] for l in files]

    inps = [butils.load_data(i, "input") for i in ids]

    idxs_good = nonzero(greater([elt.get("out_iter_num") for elt in inps], -1))[0]
    fpaths = [fpaths[i] for i in idxs_good]

    # local list is not called ``errors`` so it does not shadow this function
    errs, staysames, improves = [], [], []
    # bound up front: previously unbound (UnboundLocalError) with no files
    gnames = None
    for f in fpaths:
        data = sio.loadmat(f)
        errs.append(data["error"])
        staysames.append(data["stay_same"])
        improves.append(data["improve_ratio"])
        gnames = data["gene_names"]

    return errs, staysames, improves, gnames
예제 #36
0
def _write_rna(run_id, struct, seqs, seqnames):
    '''
Write a datafile for the mcmc tree builder in phase.

Seqs should be specified simply as an (ascii) list of 
strings having values AUGC.

The file itself should have a first line giving:

nseqs, lenseqs, seqtype 
  eg:   '16 3571 STRUCT'

then the structure should be spec'd with '(.)'

and then the seqs in format: 

'NAME'   AUGC...
GCUGUGUGUGCUU... 

'NAME2'  AUGU...
AUAUAUUAUAAUA...

...

The file is written to phase/<run_id>/datafile.rna in the data directory.
The sequence length in the header is taken from the first sequence.

INPUTS:
 struct   (specified as pairs)
 seqs     (specified as strlist)
 seqnames (specified as strlist)

'''
    rutils = RNAfoldz.utils

    l = len(seqs[0])
    n = len(seqs)
    dtype = 'STRUCT'

    stk = rutils.pairs_stk(struct, l)

    # header line, blank line, wrapped structure, blank line
    parts = ['{0} {1} {2}\n'.format(n, l, dtype),
             '\n',
             '\n'.join(tw.wrap(stk)) + '\n\n']

    # one name line, the wrapped sequence and a blank line per sequence
    for seq, name in zip(seqs, seqnames):
        parts.append(name + '\n')
        parts.append('\n'.join(tw.wrap(seq)) + '\n')
        parts.append('\n')

    datafile = cfg.dataPath('phase/{0}/datafile.rna'.format(run_id))
    # ``with`` closes the handle so the data is on disk when we return
    with open(datafile, 'w') as fopen:
        fopen.write(''.join(parts))

    return
예제 #37
0
def draw_remote_runs(show = 'conservation'):
	outdir = cfg.dataPath('batch/outputs')
	files = [os.path.join(outdir, f) for f in os.listdir(outdir) if 'ra2_' in f][1:]
	for idx, f in enumerate(files):
		print '{0} of {1} files'.format(idx,len(files))
		print f[-100::]
		fopen = open(f)
		out = pickle.load(fopen) 
		if transform:
			'''Fix stuff'''
			out_t = out
		else: out_t = out
		rplots.show_output(out_t)
		fopen.close()
	return outs	
예제 #38
0
파일: utils.py 프로젝트: bh0085/projects
def family_clustered_suboptimals(rfid, plots = True, num = 5000, min_count = 2,
                                 n_countsorted = 10, n_esorted = 10, 
                                 draw = False, cluster_type = 'just_list',
                                 savename = None):
    if savename == None:
        savename = rfid
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    for i, n in enumerate(tree.get_terminals()):
        match = re.compile('_([^_]*)_').search(n.name) 
        if not match or not '/' in match.group(1):
            this_seq = []
        else:
            term_id = match.group(1)
            this_seq = ali[ali_ids.index(term_id)]
        n.m = {'seq':this_seq,
               'probs':[1 for j in range(len(this_seq))]}

    big_refnode, big_refseq = \
        subtree_refseq(tree)
    ungapped_ref = ungapped_seq(big_refseq, rfid)
    seq = ungapped_ref
    structs = suboptimals(ungapped_ref, sp_method = 'sample',name = rfid, n = num)

    stks = [pairs_stk(s,len(seq)) for s in structs]
    stk_srt = sorted([ (i,s) for i,s in enumerate(stks)], key = lambda x: x[1])
    stk_groups = [ list(g) for k, g in it.groupby(stk_srt,key =lambda x: x[1])]
    stk_unq, struct_counts = zip(*[( g[0][0] , len(g))  for g in stk_groups])
    structs  = [structs[elt] for elt in stk_unq ]
 
   
    if cluster_type == 'full_clustering':
        final_structs, final_energies = select_exemplars_from_clustering(structs,struct_counts,seq, draw = draw)
        return
    elif cluster_type == 'just_list':
        final_structs, final_energies = select_exemplars_from_list(structs,struct_counts,seq, draw = draw)

    if draw:
        try:
            print 'DRAWING final subopts' 
            verts = struct_verts(final_structs, seq, rfid )
            show_subopts(final_structs, verts, final_energies)
            f = plt.gcf()
            f.savefig(cfg.dataPath('figs/RNAfoldz/exemplars_{0}.ps'.format(savename)))
        except Exception, e:
            print "EXCEPTION!"
            pass
예제 #39
0
def tree_similarity(dist1, dist2, run_id,criterion = 'knn', k = 6):
    '''
    Compare two distance matrices through their k nearest neighbours and
    save a summary figure.

    For each row, the indices in columns 1..k of the row-wise argsort of each
    matrix are taken as neighbours (column 0 is skipped -- presumably the
    row itself; confirm).  The union of both neighbour sets is reduced to
    the k pairs with the smallest minimum distance.  The figure shows the
    absolute distance differences and a scatter of dist1 against dist2
    values, annotated with the R-squared of a linear fit and the mean
    Jaccard index of the two neighbour sets.

    The figure is written to
    figs/gpm2/pt2_mus_cm_tree_dists_<run_id>_k<k>.tiff in the data directory.

    Only criterion == 'knn' is implemented; any other value does nothing.
    Returns None.
    '''
    if criterion != 'knn':
        return

    nq = len(dist1)
    nb1 = argsort(dist1, 1)[:,1:k+1]
    nb2 = argsort(dist2, 1)[:,1:k+1]
    all_nbs = [set(n1).union(set(n2)) for n1, n2 in zip(nb1, nb2)]
    nb_intersection = [set(n1).intersection(set(n2)) for n1, n2 in zip(nb1, nb2)]
    nb_dists = [ array([[dist1[i, n], dist2[i,n]]for n in nbs ]) for i,nbs in enumerate(all_nbs)]
    #take the first k distances.
    nb_dists = array([ sorted(nbd, key = lambda x: min(x))[:k] for nbd in nb_dists])

    abs_diffs = [abs(diff(elt, 1).flatten()) for  elt in nb_dists]

    ct = mycolors.getct(nq)
    f = myplots.fignum(4, (10,8))
    ax = f.add_axes([.05,.08,.25,.87])
    seismic.seismic(abs_diffs, ax = ax, colors = ct)

    # already a scalar: mean over rows of |intersection| / |union|
    jaccard = mean([float(len(nb_intersection[i])) / float(len(all_nbs[i])) for i in range(nq)])

    ax2 = f.add_axes([.34,.08,.6,.87])
    for i,d in enumerate(nb_dists):
        ax2.scatter(d[:,0], d[:,1], 20, alpha = .5,color =ct[i])

    lin = linregress(nb_dists[:,:,0].flatten(),nb_dists[:,:,1].flatten())
    rsquared = lin[2]**2

    ax2.annotate('NN dists for multi/struct-aligned trees.\nK = {0}'.format(k),
                [0,1], xycoords = 'axes fraction', va = 'top',
                xytext = [10,-10],textcoords = 'offset pixels')
    ax2.annotate('R-Squared: {0:3.3}\nJaccard Index: {1:3.3}'.format(rsquared, jaccard),
                [1,0], xycoords = 'axes fraction', ha = 'right',
                xytext = [-10,10],textcoords = 'offset pixels')
    ax2.set_xlabel('Muscle aligned tree distances')
    ax2.set_ylabel('Struct algined tree distances')

    datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_dists_{0}_k{1}.tiff'.format(run_id, k))
    f.savefig(datafile)
예제 #40
0
import networkx as nx
from numpy import *
import subprocess as spc
import compbio.config as cfg
import os, re

# Directory that run_flow creates if missing and passes to write_flow,
# compute_flow and parse_flow.
default_flow_dir = cfg.dataPath('graph_flows')


def run_flow(g, gid):
    '''
    Write, compute and parse the flow for graph ``g`` under id ``gid``.

    Works inside ``default_flow_dir``, creating that directory first when it
    does not exist, and returns whatever ``parse_flow`` returns.
    '''
    flow_dir = default_flow_dir
    if not os.path.isdir(flow_dir):
        os.mkdir(flow_dir)

    write_flow(flow_dir, g, gid)
    compute_flow(flow_dir, gid)
    parsed = parse_flow(flow_dir, gid)
    return parsed


def flow_inp_file(flow_dir, gid):
    '''Return the path ``<flow_dir>/flow_<gid>.inp``.'''
    fname = 'flow_{0}.inp'.format(gid)
    return os.path.join(flow_dir, fname)


def flow_out_file(flow_dir, gid):
    '''Return the path ``<flow_dir>/flow_<gid>.out``.'''
    fname = 'flow_{0}.out'.format(gid)
    return os.path.join(flow_dir, fname)


def write_flow(flow_dir, g, gid):
    # Build the text lines describing graph ``g``.
    # NOTE(review): this excerpt stops after the first two lines are
    # appended; the code that uses ``lines``, ``flow_dir`` and ``gid`` is
    # not visible here.
    nn = len(g.nodes())
    ne = len(g.edges())
    lines = []
    # problem line carrying the node and edge counts
    # (looks like the DIMACS min-cost flow format -- TODO confirm)
    lines.append('p min {0} {1}'.format(nn, ne))
    lines.append('n 1 0')
예제 #41
0
from numpy import *
import numpy as np, itertools as it

import matplotlib.pyplot as plt

import compbio.utils.plots as myplots
import compbio.utils.colors as mycolors
import compbio.utils.memo as mem
import compbio.config as cfg

import pickle

# Figure size and output format constants.
# (their consumers are outside this excerpt)
figsize = (8,8)
figtype = 'ps'
# '{{0}}' survives the format call as '{0}', so figfile is itself a template
# ('.../pt3_fana/{0}.ps') that still needs a figure name filled in.
figfile = cfg.dataPath('figs/gpm2/pt3_fana/{{0}}.{0}'.format(figtype))

do_make_figs = True
# NOTE(review): do_make_subopts is only bound when do_make_figs is true.
if do_make_figs:
    do_make_subopts = False

#flist = [50,311,140,143,495,637,1304]
flist = [311,1304]

def setFamData(rfid = None, ftype = None,**kwargs):
    # Load the stored outputs for family ``rfid``.
    # Both ``rfid`` and ``ftype`` must be truthy (checked with assert).
    # NOTE(review): the excerpt ends after the two loads below, and ``bsu``
    # is not among the imports visible in this excerpt.

    assert rfid; assert ftype;
    # 'FA' prefix for ftype == 'all', 'RS' for any other value
    fprefix = 'FA' if ftype == 'all' else 'RS'
    # entries are named '<prefix>_<rfid>' and '<prefix>_tree_<rfid>'
    sdat = bsu.load_data('{1}_{0}'.format(rfid,fprefix), 'output')
    tdat = bsu.load_data('{1}_tree_{0}'.format(rfid,fprefix), 'output')
예제 #42
0
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    print 'computing alignments...'
    print '  ...using muscle'
    malis, mrefs, mpairs =\
            mem.getOrSet(setAlignments, 
                         **mem.rc({},
                                  seqs = seqs, profiles = profiles, 
                                  run_id = run_id, ali_type = 'muscle',
                                  reset = reset,
                                  on_fail = 'compute', 
                                  register = 'tuali_musc_{0}'.format(run_id))) 
    print '  ...using cmalign.'
    salis, srefs, spairs  =\
        mem.getOrSet(setAlignments, 
                     **mem.rc({},
                              seqs = seqs, profiles = profiles, 
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute', 
                              register = 'tuali__struct_{0}'.format(run_id)))
 
    print '  ...making trees.'
    
    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        mtree  = phyml.tree(m,run_id, bionj = True)
        stree  = phyml.tree(s,run_id, bionj = True)
        
        maps = dict([(elt.id,i) for i, elt in enumerate(m)])
        mdists = zeros((len(maps),len(maps)))
        sdists = zeros((len(maps),len(maps)))
        for n1 in mtree.get_terminals():
            for n2 in mtree.get_terminals():
                mdists[maps[n1.name],maps[n2.name]] = \
                    mtree.distance(n1,n2)
        
        for n1 in stree.get_terminals():
            for n2 in stree.get_terminals():
                sdists[maps[n1.name],maps[n2.name]] = \
                    stree.distance(n1,n2)
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = len(sdists - 1))
        tree_similarity(sdists, mdists, '{0}_struct_{1}'.format(run_id,idx), k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        import networkx

        for t, sp, ttype in zip([mtree, stree], [211,212], ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)


            networkx.draw(G, posn, labels = dict([(n, '') for n in G.nodes()]),
                      node_size = [100 if  n.name in maps.keys() else 0 for n in G.nodes()],
                      width = 1, edge_color = 'black',
                      ax = a,
                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] )
        

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,0],textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')            

        #phylo.draw_graphviz(  mtree,  label_func = lambda x: '', 
        #                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] +\
        #                          [ct[0] for n in mtree.get_nonterminals()], axes = ax)

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
예제 #43
0
#!/usr/bin/env python
'''
nt.py

Contains a few utilities for looking up nucleotide level info 
for zhang lab sequences of interest.

'''

import compbio.config as cfg
from Bio import SeqIO


# Paths of the nucleotide ('nt') GenBank files, keyed by the short sequence
# name accepted by get_seq.
ntfiles = {
    'nrx':cfg.dataPath('sequences/zhang/nt/nrx1_human_nt.gb'),
    'nlg':cfg.dataPath('sequences/zhang/nt/nlg1_human_nt.gb')                   
    }
# Paths of the amino-acid ('aa') GenBank files, keyed the same way.
aafiles = {
    'nrx':cfg.dataPath('sequences/zhang/aa/nrx1_human_aa.gb'),
    'nlg':cfg.dataPath('sequences/zhang/aa/nlg1_human_aa.gb')       
    }

def get_seq( name,  aa = True):
    '''
    Return the ``SeqIO.parse`` result for the GenBank file registered under
    ``name``.

    aa  [True]  Read from ``aafiles`` when true, from ``ntfiles`` otherwise.

    The file is opened here and handed to the parser still open.
    '''
    if aa:
        handle = open(aafiles[name])
    else:
        handle = open(ntfiles[name])
    return SeqIO.parse(handle, 'genbank')
예제 #44
0
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
        draw_distances = draw_all_easy,
        draw_clusters = draw_all_easy,
        draw_single_cluster = draw_all_hard):
    '''
Cluster the sequences of an rfam family and return them grouped by cluster.

1) Load the family alignment, tree and infos with rfam.get_fam.

2) Optionally (draw_distances) plot tree-based against Levenshtein
   sequence distances with the R-squared of a linear fit.

3) Fetch the distance matrix through mem.getOrSet(setDistances, ...) and
   split it into clusters with maxclust_dists(k = 5, method = 'complete').

4) Optionally (draw_clusters) plot the sequences in the first two
   principal components, coloured by cluster.

   kwds:
     rfid           ['RF00167']  Family id passed to rfam.get_fam.
     reset          [True]       Forwarded to mem.getOrSet.
     tree           [True]       NOTE(review): overwritten by the tree
                                 returned from rfam.get_fam before it is
                                 read, so the value passed in has no effect.
     draw_distances              Draw the distance comparison figure.
     draw_clusters               Draw the PCA cluster figure.
     draw_single_cluster         Not used by this body.

Returns a list with one entry per cluster; each entry is the list of
alignment rows belonging to that cluster.

'''
    rutils = utils

    # rebinds ``tree``: from here on it is the family tree, not the keyword
    ali, tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        # compare the two distance measures on the same alignment
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')

        ax.scatter(dtf, dlf, 100)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)
        
    # memoised under the family id; ``tree`` here is the family tree object
    dists = mem.getOrSet(setDistances, ali = ali, tree = tree, run_id = rfid,
                         register = rfid, 
                         on_fail = 'compute',
                         reset = reset)
    
    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # shift labels down by one; they are used below to index the colour table
    # (presumably maxclust_dists labels start at 1 -- TODO confirm)
    clusters -= 1

    if draw_clusters:

        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists) 
        
        # reuses figure number 5, also used by the distance plot above
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')

        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)        

    #now take the largest cluster and do the analysis.
    
    # cluster label -> list of (sequence index, label) pairs
    cgrps = dict([ (k, list(g)) 
              for k , g  in it.groupby(\
                sorted( list(enumerate(clusters)),key = lambda x: x[1]),
                key = lambda x: x[1])])
    # position (within cgrps.values()) of the cluster with the most members
    cbig = argmax([len(x) for x in cgrps.values()])
    cluster_seqs = [ elt[0] for elt in cgrps.values()[cbig] ] 
    csize = len(cluster_seqs)
    seqs =[ali[c] for c in cluster_seqs]

    
    
    # NOTE(review): disabled block; it is the only consumer of cluster_seqs
    # and csize, and its second annotate string has no '{1}' placeholder for
    # the second format argument.
    if 0:

        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists) 
        colors =[ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))] 
        
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n  - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')


        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0],dists[s,:] ,200 *exp(-(dists[s,:] / .5) **2),  color = colors, alpha = .2)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)        
        
    # sequence indices of every cluster, then the matching alignment rows
    clusters_final  = [ [ elt[0] for elt in cgrps.values()[i] ] for i in range(len(cgrps.values()))]
    seqs_final = [ [ ali[idx] for idx in clust ] for clust in clusters_final]
    return seqs_final
예제 #45
0
def save_muts_structs(out, out_tree):
    '''
    Pickle ``out`` to RNAfoldz/out.pickle and ``out_tree`` to
    RNAfoldz/out_tree.pickle in the data directory.
    '''
    # ``with`` closes each handle even when pickle.dump raises
    with open(cfg.dataPath('RNAfoldz/out.pickle'), 'w') as ofile:
        pickle.dump(out, ofile)
    with open(cfg.dataPath('RNAfoldz/out_tree.pickle'), 'w') as otfile:
        pickle.dump(out_tree, otfile)
예제 #46
0
def _clear_folder(run_id):
    '''Call os.remove on every entry listed in the phase/<run_id>/ data directory.'''
    target = cfg.dataPath('phase/{0}/'.format(run_id))
    entries = os.listdir(target)
    for entry in entries:
        victim = os.path.join(target, entry)
        os.remove(victim)
예제 #47
0
파일: sequencing.py 프로젝트: bh0085/zhang
import align as ali
import compbio.config as cfg
import os 
import Bio.SeqIO as sio

# Directory that run_directory scans when no directory is given; its 'refs'
# and 'results' subdirectories are walked.
default = cfg.dataPath('zhang/sequencing/piggybac')

def run_directory(directory = default):
    refs = [os.path.join(root, f) 
            for root, dirs, files in  os.walk(os.path.join(directory, 'refs')) 
            for f in files]
    all_refs = [sio.parse( f, format = 'fasta').next()
                for f in refs]
    

    results = [os.path.join(root, f)  
               for root, dirs, files in  os.walk(os.path.join(directory, 'results')) 
               for f in files]
    result_sequences = load_genewiz_seqs(results)

    for r in all_refs:
        print 'Aligning to reference: {0} '.format(r)
        out = align_seqs(r, result_sequences)
        for k,v in out.iteritems():
            print 'result {0}: {1}'.format(k,v)
    
def load_genewiz_seqs(filenames):
    # Read the lines of each file in ``filenames``.
    # NOTE(review): the excerpt ends right after the lines are read; how
    # ``seqs`` is filled and whether the handle is closed is not visible.
    seqs = {}
    for f in filenames:
        fopen = open(f)
        lines = fopen.readlines()
예제 #48
0
def get_consensus(rfid = 'RF00', mweight = .5, 
                  refseq_method = 'root', sp_method = 'sample',
                  aff_type = 'pairs',  reset = True,
                  do_plot = False,  run_id = 'CONS_TEST'):

    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    for i, n in enumerate(tree.get_terminals()):
        term_id = re.compile('_([^_]*)_').search(n.name).group(1) 
        this_seq = ali[ali_ids.index(term_id)]
        n.m = {'seq':this_seq,
               'probs':[1 for j in range(len(this_seq))]}

    #if do_plot : rplots.plot_clusters(inds,{'pca embedding':pca_vecs},title = title,plot3d = True)
    

    big_refnode, big_refseq = \
        subtree_refseq(tree, method = refseq_method)
    ungapped_ref = rutils.ungapped_seq(big_refseq, rfid)
    #pca_vecs,exemplar_structs =
    return family_exemplar_structs(rfid,
                                   sp_method = sp_method,
                                   refseq_method = refseq_method,
                                   aff_type = aff_type,
                                   )
    struct_profiles = infernal.profiles(ungapped_ref,exemplar_structs, run_id)

    clades = split_tree(tree)
    all_vecs = {'all_time':[ [ [] for i in range(len(struct_profiles))] 
			     for j in range(len(clades)) ],
		'all_mut':[ [ [] for i in range(len(struct_profiles))] 
			     for j in range(len(clades)) ],
		'fiftyfifty':[ [ [] for i in range(len(struct_profiles))] 
			     for j in range(len(clades)) ]}

    aamuts, aatimes, aairr, aagaps = [], [], [], []
    for idx_clade, c in enumerate(clades):
        if len(c.get_terminals()) < 3:
		print 'SKIPPPING CUZ SUBTREE TOO SMALL'
		continue
	c_ids = [ n.m['seq'].name for n in c.get_terminals() ]
	if len(nonzero(greater([len(list(g)) for k, g in it.groupby(sorted(c_ids))],1))[0])>0:
		print 'SKIPPING CUZ THERE ARE TWO COPIES OF SOME F*****G SEQUENCE IN TREE'
		continue          
        all_muts, all_times , all_gaps, all_irr = [], [], [], []
	print
	print 'Clade: {0}'.format(idx_clade)
        for idx_struct, struct_info in enumerate( zip( struct_profiles, exemplar_structs)):
          struct_profile, ex_struct = struct_info
	  ngaps = 0

          #OLD ALIGNMENTS
          calis = ba.MultipleSeqAlignment(\
              [n.m['seq'] for n in c.get_terminals() ])
          #NEW ALIGNMENTS AND REF STRUCTURE
          c_new_ali , stk, struct = infernal.alignment(calis, struct_profile, rfid)
          #REF STRUCTURE PAIRS
          pairs = rutils.stk_pairs(struct)
	  if len(pairs) != len(ex_struct):
		  raise Exception()
           
          cterms = c.get_terminals()
          for i2, ct in enumerate(cterms):
              lilid =  'N{0}'.format(i2)
              ct.name = lilid
              ct.m['str_seq'] = c_new_ali[i2]
              ct.m['str_seq'].id = lilid
	      ct.m['probs'] = ones(len(c_new_ali[i2]))
          
          #BUILD A TREE
          tr = phy.BaseTree.Tree(c)

          #RUN PAML
          paml_run_id = 'ali_anc_c{0:04}_s{0:03}'.format(idx_clade,idx_struct)
          rstfile= paml.run_paml(tr, c_new_ali, run_id = paml_run_id)
          anc_tree = paml.rst_parser(rstfile) 

          #Label extent and internal nodes with sequences.
          for term in anc_tree.get_terminals():
              #Terminals have old (rfam) alis and new (infernal) alis
              term.m = filter( lambda x: x.name == term.name, cterms)[0].m
          for node in anc_tree.get_nonterminals():
              #Internals only have new alis. m['seq'] = m['str_seq']
              node.m['str_seq'] = node.m['seq']
              node.m['str_seq'].seq = node.m['str_seq'].seq.replace('T', 'U')
          subtree = anc_tree
              
 
          #Evaluate all of the structs on the first pass
          #to have access to mean frequencies of different
          #mutational types in the final score computation
	  
          refnode, refseq = subtree_refseq(subtree, method = refseq_method)
          muts, times, gaps, irresolvables = subtree_count_struct(subtree, pairs)
          all_muts.append(muts)
          all_times.append(times)
	  all_gaps.append(gaps)
	  all_irr.append(irresolvables)
        
	compute_signatures(all_vecs,idx_clade,
			   all_muts,all_times,
			   exemplar_structs,ungapped_ref )
				      
	aamuts.append(all_muts)
	aatimes.append(all_times)
	aairr.append(all_irr)
	aagaps.append(all_gaps)
    outputs = {
	    'all_vecs':all_vecs,
	    'all_muts':aamuts,
	    'all_times':aatimes,
	    'exemplar_structs':exemplar_structs,
	    'reference_seq':ungapped_ref,
	    'thermo_ex_inds':inds,
	    'thermo_embedding':pca_vecs,
	    'title':title,
	    'thermo_aff_type':aff_type,
	    'tree':tree,
	    'run_id':run_id
	    }
	 
    pickle.dump(outputs, open(cfg.dataPath('cs874/runs/{0}.pickle'.format(run_id)),'w'))
    return(outputs)
예제 #49
0
def _write_ml_ctl(run_id, outgroup_name):
    '''
    Write the control file phase/<run_id>/control.ml in the data directory.

    The file is made of four sections joined by newlines: a DATAFILE block
    pointing at phase/<run_id>/datafile.rna, a MIXED model block (REV plus
    RNA7D), a TREE block naming ``outgroup_name`` as outgroup, and the run
    settings naming phase/<run_id>/outfile.phylip as output file.

    The '{\\NAME}' closing tags are written in plain (non-raw) literals;
    none of them forms a recognised escape, so the backslash is kept as is.
    '''
    datafile = cfg.dataPath('phase/{0}/datafile.rna'.format(run_id))
    outfile = cfg.dataPath('phase/{0}/outfile.phylip'.format(run_id))
    cfgfile = cfg.dataPath('phase/{0}/control.ml'.format(run_id))

    data = '''
#Phylogenetic tree reconstruction in the ML framework with mlphase
#The dataset in this example is small and mlphase can be used.

{DATAFILE}
Data file = %(datafile)s
Interleaved data file = no
#Use the "automatic method" to analyse this dataset:
#unpaired nucleotides ('.' in the secondary structure) are
#handled by the MODEL1 of the MIXED model (see below).
#pairs (corresponding parenthesis in the secondary structure)
#are handled by the MODEL2 of the MIXED model (see balow)
Heterogeneous data models = auto
{\DATAFILE}

''' % {
        'datafile': datafile
    }
    model = '''#Set up a MIXED model with REV for loops and 7D for stems
{MODEL}
Model = MIXED
Number of models = 2
  {MODEL1}
  Model = REV
  Discrete gamma distribution of rates = yes
  Number of gamma categories = 6
  Invariant sites             = no
  {\MODEL1}
  {MODEL2}
  Model = RNA7D
  Discrete gamma distribution of rates = yes
  Number of gamma categories = 6
  Invariant sites             = no
{\MODEL2}
{\MODEL}'''
    tree = '''
#A TREE block
{TREE}
#You must specify an outgroup although it is used for representation
#purpose only and it does not affect the results.
#This outgroup must be the name of a species in your datafile or the name
#of a clade in your clusters file (see below).
Outgroup = %(outgroup)s
#See manual for the available heuristic/exhaustive search method.
Search algorithm = Stepwise addition
#Optional: we specify a file that contains monophyletic clades. Tree topologies
#that do not match these constraints are not evaluated.
#Clusters file = sequence-data/hiv6.cls
{\TREE}
''' % {
        'outgroup': outgroup_name
    }

    run_cfg = '''
Random seed=9

Output file   = %(outfile)s

''' % {
        'outfile': outfile
    }

    all_text = '\n'.join([data, model, tree, run_cfg])
    # ``with`` closes the handle so the control file is complete on return
    with open(cfgfile, 'w') as fopen:
        fopen.write(all_text)
예제 #50
0
def get_muts_structs():
    '''
    Load and return the two objects stored by save_muts_structs.

    Returns (out, out_tree), unpickled from RNAfoldz/out.pickle and
    RNAfoldz/out_tree.pickle in the data directory.
    '''
    # The files are opened for reading.  The previous body opened them with
    # 'w', which truncated both pickles before anything could be loaded, and
    # then called pickle.load with two arguments on closed handles.
    with open(cfg.dataPath('RNAfoldz/out.pickle')) as ofile:
        out = pickle.load(ofile)
    with open(cfg.dataPath('RNAfoldz/out_tree.pickle')) as otfile:
        out_tree = pickle.load(otfile)
    return out, out_tree
예제 #51
0
def get_remote_runs(run_range):
    '''
    Unpickle and return the first twenty 'ra2_' files of batch/outputs.

    The files are taken in directory-listing order.  ``run_range`` is
    accepted but not used by the body.
    '''
    outdir = cfg.dataPath('batch/outputs')
    files = [os.path.join(outdir, f) for f in os.listdir(outdir) if 'ra2_' in f]

    outs = []
    for f in files[0:20]:
        # close each handle as soon as its object is loaded
        with open(f) as fopen:
            outs.append(pickle.load(fopen))
    return outs
예제 #52
0
def _write_mcmc_ctl(run_id, outgroup_name):
    '''Write a control file for the mcmc tree builder in phase

    The file goes to phase/<run_id>/control.mcmc in the data directory and
    is made of five sections joined by newlines: a DATAFILE block pointing
    at phase/<run_id>/datafile.rna, a MIXED model block (REV plus RNA7D), a
    TREE block naming ``outgroup_name`` as outgroup, the PERTURBATION tuning
    block, and the run settings naming phase/<run_id>/outfile.phylip as
    output file.

    The '{\\NAME}' closing tags are written in plain (non-raw) literals;
    none of them forms a recognised escape, so the backslash is kept as is.
'''
    datafile = cfg.dataPath('phase/{0}/datafile.rna'.format(run_id))
    outfile = cfg.dataPath('phase/{0}/outfile.phylip'.format(run_id))
    cfgfile = cfg.dataPath('phase/{0}/control.mcmc'.format(run_id))

    data = '''
#A standard DATAFILE block for RNA sequences having a secondary structure.
#see also the sequence file sequence-data/mammals69.rna 

{DATAFILE}
Data file = %(datafile)s
Interleaved data file = no
#Use the "automatic method" to analyse this dataset:
#unpaired nucleotides ('.' in the secondary structure) are
#handled by the MODEL1 of the MIXED model (see below).
#pairs (corresponding parenthesis in the secondary structure)
#are handled by the MODEL2 of the MIXED model (see balow)
Heterogeneous data models = auto
{\DATAFILE}

''' % {
        'datafile': datafile
    }
    model = '''

#Set up a MIXED model with REV for loops and 7D for stems
{MODEL}
Model = MIXED
Number of models = 2
  {MODEL1}
  Model = REV
  Discrete gamma distribution of rates = yes
  Number of gamma categories = 6
  Invariant sites             = no
  {\MODEL1}
  {MODEL2}
  Model = RNA7D
  Discrete gamma distribution of rates = yes
  Number of gamma categories = 6
  Invariant sites             = no
{\MODEL2}
{\MODEL}

'''

    tree = '''

#Use a standard unrooted tree. The outgroup is compulsory but do not affect the results.
{TREE}
Tree = Unrooted MCMC tree
Outgroup = %(outgroup)s
{\TREE}

''' % {
        'outgroup': outgroup_name
    }
    perturbation = '''
    
#Tuning parameters for the MCMC runs. 
{PERTURBATION}

#relative proposals probabilities between the tree and the substitution model
Tree, proposal priority = 8
Model, proposal priority = 1

{PERTURBATION_TREE}
#We use 10/40 for topology change vs branch length changes.
#It is not exactly equivalent to 1/4 because this is also given relative to the
#proposal priority for hyperparameters that are introduced with the
#the prior on branch lengths (Hyperpriors, proposal priority)
Topology changes, proposal priority = 10
Branch lengths, proposal priority = 40
Hyperpriors, proposal priority = 1

#We use a vague prior exp(lambda) on branch lengths rather than the default exp(10)
Branch lengths, prior = exponential(uniform(0,100))
#A lambda hyperparameter has been introduced. It needs a "proposal priority"
#but this is not used because it is the only hyperparameter
Branch lengths exponential hyperparameter, proposal priority = 1
{\PERTURBATION_TREE}

{PERTURBATION_MODEL}
#relative probabilities for the proposals on the two models and the average substitution rate of MODEL2 
Model 1, proposal priority = 10
Model 2, proposal priority = 10
Average rates, proposal priority = 1
{PERTURBATION_MODEL1}
    Frequencies, proposal priority = 2
    Rate ratios, proposal priority = 1
    Gamma parameter, proposal priority = 1
{\PERTURBATION_MODEL1}
{PERTURBATION_MODEL2}
    Frequencies, proposal priority = 2
    Rate ratios, proposal priority = 1
    Gamma parameter, proposal priority = 1
{\PERTURBATION_MODEL2}
{\PERTURBATION_MODEL}

{\PERTURBATION}
'''
    run_cfg = '''

Random seed = 11

Burnin iterations = 750
Sampling iterations = 150
Sampling period = 150


Output file   = %(outfile)s
Output format = phylip
''' % {
        'outfile': outfile
    }

    #OLD VALUES:
    #Burnin iterations = 750000
    #Sampling iterations = 1500000
    #Sampling period = 150

    all_text = '\n'.join([data, model, tree, perturbation, run_cfg])
    # ``with`` closes the handle so the control file is complete on return
    with open(cfgfile, 'w') as fopen:
        fopen.write(all_text)

    return
예제 #53
0
파일: utils.py 프로젝트: bh0085/projects
        return
    elif cluster_type == 'just_list':
        final_structs, final_energies = select_exemplars_from_list(structs,struct_counts,seq, draw = draw)

    if draw:
        try:
            print 'DRAWING final subopts' 
            verts = struct_verts(final_structs, seq, rfid )
            show_subopts(final_structs, verts, final_energies)
            f = plt.gcf()
            f.savefig(cfg.dataPath('figs/RNAfoldz/exemplars_{0}.ps'.format(savename)))
        except Exception, e:
            print "EXCEPTION!"
            pass
    
    fopen = open(cfg.dataPath('RNAfoldz/subopts_{0}.pickle'.format(savename)),'w')

    return final_structs,final_energies, seq
    pickle.dump({'structs':final_structs, 'energies':final_energies, 'seq':seq},
                fopen)
    

def select_exemplars_from_clustering(structs,struct_counts,seq, draw = False):
      min_count = 2

      freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count]
      if len(freq_structs) < 10:
          min_count = 1
          freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count]
      struct_counts= [s for i, s in enumerate(struct_counts) if s >= min_count]
      structs = freq_structs