def split_family_seqs():
    '''Split the Rfam seed file into per-family alignment and metadata files.

    Walks 'rfam/Rfam.seed' record by record. For each record, the leading
    '#' annotation lines are gathered into a dict keyed by characters 5-6 of
    the line (the two-letter tag, e.g. 'AC'); the text from column 8 onward
    is appended to any earlier value stored under the same tag. The handle is
    then rewound to the start of the record so the Stockholm parser reads it.

    outputs (on disk, named after the stripped 'AC' value):
      rfam/family_alis/<AC>.fa:       the alignment in fasta format.
      rfam/family_metas/<AC>.pickle:  the pickled annotation dict.
    '''
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')
    seed = open(cfg.dataPath('rfam/Rfam.seed'))
    try:
        alis = aio.parse(seed, 'stockholm')
        while True:
            infos = {}
            # Remember where this record starts; the header scan below
            # consumes lines that the parser still needs to see.
            start = seed.tell()
            while True:
                line = seed.readline()
                if line == '':
                    break
                if line[0] == '#':
                    ukey = str(line[5:7])
                    infos[ukey] = infos.get(ukey, '') + line[8:]
                elif line.strip() != '':
                    # First non-blank, non-annotation line: header is over.
                    break
            seed.seek(start)
            # next(..., None) rather than .next(): an exhausted parser must
            # end the loop instead of leaking StopIteration to the caller.
            ali = next(alis, None)
            if not ali:
                break
            rfname = infos['AC'].strip()
            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            with open(os.path.join(meta_dir, rfname + '.pickle'), 'w') as metafile:
                pickle.dump(infos, metafile)
    finally:
        seed.close()
def split_family_seqs():
    '''Split the Rfam seed file into per-family alignment and metadata files.

    Walks 'rfam/Rfam.seed' record by record. For each record, the leading
    '#' annotation lines are gathered into a dict keyed by characters 5-6 of
    the line (the two-letter tag, e.g. 'AC'); the text from column 8 onward
    is appended to any earlier value stored under the same tag. The handle is
    then rewound to the start of the record so the Stockholm parser reads it.

    outputs (on disk, named after the stripped 'AC' value):
      rfam/family_alis/<AC>.fa:       the alignment in fasta format.
      rfam/family_metas/<AC>.pickle:  the pickled annotation dict.
    '''
    alis_dir = cfg.dataPath('rfam/family_alis/')
    meta_dir = cfg.dataPath('rfam/family_metas/')
    seed = open(cfg.dataPath('rfam/Rfam.seed'))
    try:
        alis = aio.parse(seed, 'stockholm')
        while True:
            infos = {}
            # Remember where this record starts; the header scan below
            # consumes lines that the parser still needs to see.
            start = seed.tell()
            while True:
                line = seed.readline()
                if line == '':
                    break
                if line[0] == '#':
                    ukey = str(line[5:7])
                    infos[ukey] = infos.get(ukey, '') + line[8:]
                elif line.strip() != '':
                    # First non-blank, non-annotation line: header is over.
                    break
            seed.seek(start)
            # next(..., None) rather than .next(): an exhausted parser must
            # end the loop instead of leaking StopIteration to the caller.
            ali = next(alis, None)
            if not ali:
                break
            rfname = infos['AC'].strip()
            with open(os.path.join(alis_dir, rfname + '.fa'), 'w') as alifile:
                aio.write(ali, alifile, 'fasta')
            with open(os.path.join(meta_dir, rfname + '.pickle'), 'w') as metafile:
                pickle.dump(infos, metafile)
    finally:
        seed.close()
def setNet(**kwargs):
    '''Load a (tf x experiment) weight grid plus the experiment descriptions.

    keyword inputs:
      method: name used in the informativeness file name (default 'tree').
      num:    network number used in both file names (default 1).

    The description file is whitespace separated: its first line gives the
    column names, every later line one row. An extra 'Exp_Index' column
    holding the 0-based row number is appended. The data file has three
    whitespace separated numeric columns read as (weight, tf, exp); exp is
    shifted down by one before being used as a column index.

    outputs:
      grid:        zeros((max(tf) + 1, number of description rows)) with
                   grid[tf, exp - 1] = weight for each data line.
      description: dict mapping column name -> list of that column's values.
    '''
    method = kwargs.get('method', 'tree')
    num = kwargs.get('num', 1)
    description_path = cfg.dataPath('::daniel/net%s_chip_features.tsv') % num
    data_path = cfg.dataPath('::daniel/informativeness/%s%s.txt') % (method, num)
    split_re = re.compile(r'\s')

    with open(description_path) as desc_open:
        description_cols = split_re.split(desc_open.readline().strip()) + ['Exp_Index']
        description_vals = [split_re.split(l.strip()) for l in desc_open.readlines()]
    for idx, d in enumerate(description_vals):
        d.append(idx)

    with open(data_path) as data_open:
        weight, tf, exp = zip(*[array(split_re.split(l.strip()), float)
                                for l in data_open.readlines()])
    # The columns are parsed as floats; array shapes and indices must be
    # integers, so convert explicitly instead of relying on float indexing.
    tf = [int(t) for t in tf]
    exp = [int(e) - 1 for e in exp]

    description = {}
    for i, col in enumerate(description_cols):
        description[col] = [d[i] for d in description_vals]

    ntf = int(np.max(tf)) + 1
    nexp = len(description_vals)
    grid = zeros((ntf, nexp))
    for w, t, e in zip(weight, tf, exp):
        grid[t, e] = float(w)
    return grid, description
def rna_draw(seq, struct, name, out_type='svg'):
    '''Draw a secondary structure with RNAplot and return the drawing.

    inputs:
      seq:      the sequence, written as the first line fed to RNAplot.
      struct:   the structure string, written as the second line.
      name:     used to name the output file under 'rnafold/'.
      out_type: 'svg' (default) or 'png'.

    outputs:
      arr: for 'png', whatever matplotlib's read_png returns for the
           converted image; for 'svg', the first element of
           svg.get_polys() applied to the parsed svg text.

    raises:
      ValueError: if out_type is neither 'png' nor 'svg'.
    '''
    lines = '{0}\n{1}\n'.format(seq, struct)
    if out_type == 'png':
        outfile = cfg.dataPath('rnafold/{0}.png'.format(name))
        rprc = spc.Popen('RNAplot -o svg; convert rna.svg {0}'.format(outfile),
                         shell=True, stdin=spc.PIPE, stdout=spc.PIPE)
        rprc.communicate(input=lines)
        from matplotlib._png import read_png
        # Previously the image was read but never bound to the returned
        # name, so this branch always failed at the return statement.
        arr = read_png(outfile)
    elif out_type == 'svg':
        outfile = cfg.dataPath('rnafold/{0}.svg'.format(name))
        # RNAplot always writes 'rna.svg' into the cwd; run it in a scratch
        # directory so concurrent calls do not clobber each other.
        tempdir = 'tmp_{0}'.format(name)
        rprc = spc.Popen(
            'mkdir {1}; cd {1}; RNAplot -o svg; mv rna.svg {0}; cd ..; rm -r {1};'
            .format(outfile, tempdir),
            shell=True, stdin=spc.PIPE, stdout=spc.PIPE)
        rprc.communicate(input=lines)
        with open(outfile) as svg_file:
            struct_svg = svg_file.read()
        data = xparse.parse(struct_svg)
        arr = svg.get_polys(data)[0]
    else:
        raise ValueError('unsupported out_type: {0!r}'.format(out_type))
    return arr
def get_fam(rfid):
    '''Get a family including tree and sequence information from an Rfam
    data dump stored in data/rfam

    inputs:
      rfid: rfam family id.

    outputs:
      ali:  the first alignment parsed from rfam/family_alis/<rfid>.fa
      tree: the first tree parsed from rfam/Rfam.seed_tree/<rfid>.seed_tree
      info: the object unpickled from rfam/family_metas/<rfid>.pickle
    '''
    meta_path = cfg.dataPath('rfam/family_metas/{0}.pickle'.format(rfid))
    ali_path = cfg.dataPath('rfam/family_alis/{0}.fa'.format(rfid))
    tree_path = cfg.dataPath('rfam/Rfam.seed_tree/{0}.seed_tree'.format(rfid))

    # Each handle is closed as soon as its single record has been read.
    with open(meta_path) as fmeta, open(ali_path) as fali:
        ali = next(aio.parse(fali, 'fasta'))
        info = pickle.load(fmeta)
    with open(tree_path) as ftree:
        tree = next(nio.parse(ftree))
    return ali, tree, info
def names():
    '''Return the gene and tf name entries of the first mcmc file in batch/tmp.

    outputs:
      gnames:  the "gene_names" entry of the loaded .mat file.
      tfnames: the "tf_names" entry of the loaded .mat file.
    '''
    tmp_dir = cfg.dataPath("batch/tmp")
    mcmc_paths = []
    for entry in os.listdir(tmp_dir):
        if "mcmc" in entry:
            mcmc_paths.append(os.path.join(tmp_dir, entry))
    # Only the first matching file (in listing order) is consulted.
    contents = sio.loadmat(mcmc_paths[0])
    return contents["gene_names"], contents["tf_names"]
def load(net = 2, num = 1676, min_module_size = 10, min_go_size = 5, max_go_modules = 2, prb_threshold = [1e-6,.01]):
    '''Load a module-by-GO-term heatplot matrix and rescale it to [0, 1].

    inputs:
      net, num:         select the heatplot matrix file name.
      min_module_size:  a community line must have more tab separated
                        fields than this for its index to be kept.
      min_go_size:      a GO term must have a count above this to be kept.
      max_go_modules:   columns whose rescaled sum is not below this (or is
                        not above 0) are dropped.
      prb_threshold:    pair of values; entries are clipped to the range
                        between their -log10 values before rescaling.

    outputs:
      arr:   the filtered, clipped and rescaled matrix.
      array of the column titles that survived filtering.
      array of the id entries of the rows that survived filtering.
    '''
    fopen = open(cfg.dataPath('daniel/heatplots/net{0}_top{1}_heatplot_matrix.txt'.format(net,num)),'r')
    # First line holds the column titles; the rest is numeric.
    l0 = fopen.readline()
    arr = array([[float(elt) for elt in l.split('\t')] for l in fopen.xreadlines() if l.strip() != ''])
    # Column 0 holds the row ids (kept 2-D: one single-element row per id).
    ids = arr[:,:1]
    arr = arr[:,1:]
    # Replace exact zeros with the smallest non-zero entry so log10 is finite.
    arr[equal(arr,0)] = np.min(arr[not_equal(arr,0)])
    arr = -1 * log10(arr)
    # NOTE(review): 'top1676' is hard-coded here while the matrix file name
    # above uses `num` -- confirm whether this should follow `num`.
    clines = open(cfg.dataPath('daniel/heatplots/net{0}_top1676_communities.txt'.\
                               format(net))).readlines()
    n_per_modules = [len(c.split('\t')) for c in clines]
    glines = open(cfg.dataPath('daniel/heatplots/net{0}_goterm_counts.txt'.\
                               format(net))).readlines()
    # Each non-blank line must split into exactly two tab separated fields.
    n_per_go = dict([c.split('\t') for c in glines if c.strip() != ''])
    for k, v in n_per_go.iteritems():
        n_per_go[k] = int(v.strip())
    # Indices (0-based line numbers) of the large enough communities.
    big_mods = nonzero(greater(n_per_modules,min_module_size))[0]
    big_gos = set([ k for k, v in n_per_go.iteritems() if v > min_go_size])
    col_tits = [s.strip() for s in l0.split('\t')[1:] if s.strip() != '']
    # For some unknown reason, one of the columns that should be a GO name
    # is called 'V3'. As 'V3' is not present in the GO counts, the
    # membership test below leaves it out.
    acols = array([ idx for idx, elt in enumerate(col_tits) if elt in big_gos ])
    # Keep rows whose id value appears among the big community indices.
    arows = array([ idx for idx, elt in enumerate(ids) if elt in big_mods ])
    arr = arr[arows][:, acols]
    # Clip to [thr[1], thr[0]] in -log10 space, then rescale to [0, 1].
    thr = -1 * log10(array(prb_threshold))
    arr[greater(arr,thr[0])] = thr[0]
    arr[less(arr,thr[1])] = thr[1]
    arr = arr - np.min(arr)
    arr = arr / np.max(arr)
    # presumably `sum` is numpy's (axis 0 => per-column totals) -- TODO confirm
    go_modules = sum(arr, 0)
    final_cols = nonzero(less(go_modules,max_go_modules)*\
                         greater(go_modules,0))[0]
    acols = acols[final_cols]
    arr = arr[:,final_cols]
    return arr, \
        array([ col_tits[idx] for idx in acols]), \
        array([ ids[idx] for idx in arows])
def alignment(seqs_in, profile, run_id):
    '''Compute an alignment of multiple sequences to a given covariance
    model profile such as constructed by cmbuild via infernal.profiles.

    input:
      seqs_in: a list of biopython SeqRecord objects
      profile: the filename of a covariance model profile
      run_id:  a run id to use for naming temporary files to avoid collisions

    output:
      ali:    an rfam multiple sequence alignment
      ref:    the profile reference sequence aligned to ali
      struct: the profile reference structure aligned to ali

    raises:
      Exception: if the first element of seqs_in is a plain string.
    '''
    if isinstance(seqs_in[0], str):
        raise Exception('Sorry but string lists are not supported. We need ids!')

    # Sequences are renamed S000, S001, ... for cmalign and mapped back to
    # their real ids afterwards.
    tmp_ids = ['S{0:03}'.format(i) for i in range(len(seqs_in))]
    seqs = [Bio.SeqRecord.SeqRecord(
                Bio.Seq.Seq(''.join([let for let in str(rec.seq) if let in 'AUTGC']),
                            Bio.Seq.Alphabet.RNAAlphabet),
                tmp_id)
            for tmp_id, rec in zip(tmp_ids, seqs_in)]
    name_maps = dict(zip(tmp_ids, [rec.id for rec in seqs_in]))

    # The temp file names carry the index of the last input sequence. This
    # used to come from a loop variable leaking out of a list comprehension;
    # compute it explicitly so the names stay the same.
    last_idx = len(seqs_in) - 1
    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(run_id, last_idx))
    outfile = cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(run_id, last_idx))

    Bio.SeqIO.write(seqs, infile, 'fasta')
    cstr = 'cmalign -o {0} {1} {2}'.format(outfile, profile, infile)
    ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
    ispc.communicate()

    with open(outfile) as fopen:
        seqs, ref, struct = rutils.stk_parse(fopen)
    ali = ba.MultipleSeqAlignment(seqs)
    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]
    return ali, ref, struct
def alignment(seqs_in, profile, run_id):
    '''Compute an alignment of multiple sequences to a given covariance
    model profile such as constructed by cmbuild via infernal.profiles.

    input:
      seqs_in: a list of biopython SeqRecord objects
      profile: the filename of a covariance model profile
      run_id:  a run id to use for naming temporary files to avoid collisions

    output:
      ali:    an rfam multiple sequence alignment
      ref:    the profile reference sequence aligned to ali
      struct: the profile reference structure aligned to ali

    raises:
      Exception: if the first element of seqs_in is a plain string.
    '''
    if isinstance(seqs_in[0], str):
        raise Exception('Sorry but string lists are not supported. We need ids!')

    # Sequences are renamed S000, S001, ... for cmalign and mapped back to
    # their real ids afterwards.
    tmp_ids = ['S{0:03}'.format(i) for i in range(len(seqs_in))]
    seqs = [Bio.SeqRecord.SeqRecord(
                Bio.Seq.Seq(''.join([let for let in str(rec.seq) if let in 'AUTGC']),
                            Bio.Seq.Alphabet.RNAAlphabet),
                tmp_id)
            for tmp_id, rec in zip(tmp_ids, seqs_in)]
    name_maps = dict(zip(tmp_ids, [rec.id for rec in seqs_in]))

    # The temp file names carry the index of the last input sequence. This
    # used to come from a loop variable leaking out of a list comprehension;
    # compute it explicitly so the names stay the same.
    last_idx = len(seqs_in) - 1
    infile = cfg.dataPath('infernal/temp/{0}_{1:03}_unaligned.fa'.format(run_id, last_idx))
    outfile = cfg.dataPath('infernal/temp/{0}_{1:03}_aligned.stk'.format(run_id, last_idx))

    Bio.SeqIO.write(seqs, infile, 'fasta')
    cstr = 'cmalign -o {0} {1} {2}'.format(outfile, profile, infile)
    ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
    ispc.communicate()

    with open(outfile) as fopen:
        seqs, ref, struct = rutils.stk_parse(fopen)
    ali = ba.MultipleSeqAlignment(seqs)
    for a in ali:
        a.seq = a.seq.upper()
        a.id = name_maps[a.id]
    return ali, ref, struct
def parse_CL():
    '''Convert 'network/CL.geneexp' into a pickled dict at 'network/CL.pickle'.

    Every non-empty line of the input is split on whitespace; the first
    token is the key and the remaining tokens are stored as a list of
    floats. A key that appears on several lines keeps its last line only.
    '''
    with open(config.dataPath('network/CL.geneexp')) as src:
        elts = src.read().split('\n')

    token_re = re.compile(r'([^\s]+)')
    seqdict = {}
    for e in elts:
        tokens = token_re.findall(e)
        if not tokens:
            continue
        seqdict[tokens[0]] = [float(tok) for tok in tokens[1:]]

    # The write mode belongs to open(), not to dataPath(): passing it to
    # dataPath opened the target read-only and made the dump fail.
    with open(config.dataPath('network/CL.pickle'), 'w') as dst:
        pickle.dump(seqdict, dst)
def run_paml(tree_in, ali_in, run_id='T%05i' % (0,), verbose=False):
    '''Write a tree and an alignment to a per-run directory and run paml's
    baseml on them.

    inputs:
      tree_in: tree object, written in plain newick format via phylo.write.
      ali_in:  alignment object, written in phylip format via aio.write.
      run_id:  string used to name the run directory and every file in it.
      verbose: if true, print baseml's captured stdout.

    outputs:
      rstfile: path of the file named 'rst' inside the run directory.
    '''
    paml_d = config.dataPath('paml')
    run_d = config.dataPath(os.path.join(paml_d, 'run_{0}'.format(run_id)))
    for directory in (paml_d, run_d):
        if not os.path.isdir(directory):
            os.mkdir(directory)

    outfilepath = 'paml_tree_{0}.paml'.format(run_id)
    treefilepath = 'paml_tree_{0}.newick'.format(run_id)
    alifilepath = 'paml_tree_{0}.phylip'.format(run_id)
    ctlfilepath = 'baseml_{0}.ctl'.format(run_id)

    # All files are addressed relative to the run directory, so work from
    # inside it and always restore the caller's cwd, even on failure.
    old_cwd = os.getcwd()
    os.chdir(run_d)
    try:
        with open(treefilepath, 'w') as treefile:
            phylo.write(tree_in, treefile, 'newick', plain=True)
        with open(alifilepath, 'w') as alifile:
            aio.write(ali_in, alifile, 'phylip')
        with open(ctlfilepath, 'w') as ctlfile:
            ctlfile.write(make_baseml(treefilepath, alifilepath, outfilepath,
                                      ancestors=1))

        # Work around a paml input quirk: append "  I" to the first line of
        # the phylip file before running baseml.
        sed_command = r"sed -i -e '1 s/$/\ \ I/' {0}".format(alifilepath)
        sprc = subprocess.Popen(sed_command, stdout=subprocess.PIPE, shell=True)
        sprc.communicate()

        command = 'baseml {0} '.format(ctlfilepath)
        pprc = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        comms = pprc.communicate()
        if verbose:
            print(comms[0])
    finally:
        os.chdir(old_cwd)

    return os.path.join(run_d, 'rst')
def print_soheil():
    '''Write one text table per target gene under 'network/'.

    Targets with at least 8 regulator weights are selected (at most the
    first 20 in dict order are kept). For each, a file '<target>.txt' is
    written whose header names the target and its 8 highest-weight tfs, and
    whose 20 rows hold the corresponding time series values.
    '''
    n_tfs = 8
    trgs, tfs = parse_net()

    trg_subset = {}
    for tgkey, v in trgs.items():
        if len(v['weights']) >= n_tfs:
            trg_subset[tgkey] = v
    # Keep only the first 20 selected targets.
    for extra in list(trg_subset.keys())[20:]:
        trg_subset.pop(extra)

    TS = load_TS()
    tpts = arange(20)
    for k, tg in list(trg_subset.items()):
        gts = array(TS[k])[tpts]
        t_lines = [''] * len(tpts)
        for row, value in enumerate(gts):
            t_lines[row] += '{0:20}'.format(value)

        # Regulators ordered by descending weight, truncated to n_tfs.
        top = argsort(tg['weights'])[::-1][0:n_tfs]
        top_tfs = [tg['tfs'][i] for i in top]
        for tf in top_tfs:
            fts = array(TS[tf])[tpts]
            for row in range(len(gts)):
                t_lines[row] += ' {0:20}'.format(fts[row])

        header = ' '.join(['{0:20}'.format(label) for label in [k] + top_tfs])
        fname = '{0}.txt'.format(k)
        with open(config.dataPath('network/' + fname), 'w') as fopened:
            fopened.write(header + '\n' + '\n'.join(t_lines))
def polyfile(chr = 10, grp = 'ASW', **kwargs):
    '''Return the lines of a gzipped hapmap phase3 polymorphic genotype file.

    inputs:
      chr: chromosome label inserted into the file name (default 10).
      grp: population group label inserted into the file name
           (default 'ASW').

    outputs:
      contents: list of lines read from the decompressed file.
    '''
    template = 'genotypes_chr{0}_{1}_phase3.2_nr.b36_fwd.txt.gz'
    fname = os.path.join(cfg.dataPath('hapmap/phase3/polymorphic'),
                         template.format(chr, grp))
    handle = gzip.open(fname)
    try:
        return handle.readlines()
    finally:
        handle.close()
def tmp_fnames(run_id, num):
    '''Return `num` temporary file paths under batch/tmp that scripts can
    write to, named '<run_id>_tmp000', '<run_id>_tmp001', ...'''
    tmp_dir = cfg.dataPath('batch/tmp')
    names = []
    for idx in range(num):
        suffix = '_tmp{0:03d}'.format(idx)
        names.append(os.path.join(tmp_dir, run_id + suffix))
    return names
def tree(alignment, run_id='T%05i' % (0,), bionj=False):
    '''Build a tree for an alignment by running phyml in the 'phyml' data dir.

    inputs:
      alignment: alignment object, written in phylip format via aio.write.
      run_id:    string appended to 'infile' to name the phyml input file.
      bionj:     if true phyml is run with '-o n', otherwise with '-o tlr'.

    outputs:
      the tree read (newick) from phyml's '<infile>_phyml_tree.txt'.
    '''
    new_wd = config.dataPath('phyml')
    if not os.path.isdir(new_wd):
        os.mkdir(new_wd)

    # phyml writes its outputs next to the input file, so run from inside
    # the phyml directory and always restore the caller's cwd afterwards.
    old_cwd = os.getcwd()
    os.chdir(new_wd)
    try:
        infilepath = 'infile{0}'.format(run_id)
        with open(infilepath, 'w') as infile:
            aio.write(alignment, infile, 'phylip')

        command = 'phyml --quiet -i {0} -o {1} '.format(
            infilepath, 'n' if bionj else 'tlr')
        print(command)
        subprocess.call(command, shell=True, stdout=subprocess.PIPE)

        with open(infilepath + '_phyml_tree.txt') as treefile:
            result = phylo.read(treefile, 'newick')
    finally:
        os.chdir(old_cwd)
    return result
def launch_many(run_id):
    '''Generate script parameters and launch a batch of bsub jobs.

    Designed to be run on the cluster via an interactive shell. If it is not
    run on the cluster it will not be able to find the expression data,
    since it does not look the files up at a remote url.

    inputs:
      run_id: run identifier handed to bsub.eyeball.

    outputs:
      dict with 'cmds' (the eyeball's commands) and 'inputs' (the list of
      per-job parameter dicts).
    '''
    print('Launching all jobs!')

    # Inputs: 70 numbered expression files plus the intercluster file.
    expr_filenames = ['soheil/expression_c4d_n4_tt_{0}.mat'.format(ttnum)
                      for ttnum in range(70)]
    expr_filenames = expr_filenames + ['soheil/expression_c4d_n4_intercluster.mat']
    urls = [cfg.dataURL(f) for f in expr_filenames]
    remote_exprnames = [cfg.dataPath(url) for url in urls]

    # Parameter sweep: each array lists the values tried for one parameter;
    # one job is generated per combination (per file).
    out_iter_nums = array([25], double)
    in_iter_nums = array([100], double)
    ks = array([6], double)
    betas = array([4], double)
    f_mixes = array([2], double)
    f_sims = array([.8], double)
    f_en_ins = array([1.], double)
    f_en_outs = array([1.], double)
    th_cors = array([.6], double)
    trunc_values = array([3], double)
    degree_bounds = array([3], double)

    inp_dicts = [dict(out_iter_num=out_iter_num,
                      in_iter_num=in_iter_num,
                      k=k,
                      beta=beta,
                      f_mix=f_mix,
                      f_sim=f_sim,
                      f_en_in=f_en_in,
                      f_en_out=f_en_out,
                      th_cor=th_cor,
                      trunc_value=trunc_value,
                      degree_bound=degree_bound,
                      filename=filename)
                 for out_iter_num in out_iter_nums
                 for in_iter_num in in_iter_nums
                 for k in ks
                 for beta in betas
                 for f_mix in f_mixes
                 for f_sim in f_sims
                 for f_en_in in f_en_ins
                 for f_en_out in f_en_outs
                 for th_cor in th_cors
                 for trunc_value in trunc_values
                 for degree_bound in degree_bounds
                 for filename in remote_exprnames]

    # Build the eyeball around this very source file and launch its jobs.
    this_file = os.path.abspath(inspect.stack()[0][1])
    eyeball = bsub.eyeball(run_id, this_file, inp_dicts,
                           func='run_single', name='mcmc_', mem=3)
    eyeball.launch()

    return dict(cmds=eyeball.cmds, inputs=inp_dicts)
def setCRE(**kwargs):
    '''Load the CRE randomization design sequences and their values.

    Reads 'CRE/27k/CRE_Randomization_Design.txt' (tab separated; column 0 is
    the key, column 1 the sequence) and 'CRE/27k/CRE_Randomization.dat'
    (tab separated; first line skipped, column 0 is the key and the
    remaining columns are converted to floats).

    outputs:
      cre:         array with one row per shared key, the sequence split
                   into single characters.
      cre_rndvals: array of float rows, one per shared key.
      keys:        keys present in both files, in the row order of the
                   two arrays.
    '''
    def read_rows(relpath):
        with open(cfg.dataPath(relpath)) as handle:
            return [[field.strip() for field in line.split('\t')]
                    for line in handle.readlines()]

    design_rows = read_rows('CRE/27k/CRE_Randomization_Design.txt')
    value_rows = read_rows('CRE/27k/CRE_Randomization.dat')[1:]

    values_by_key = dict([(row[0], row[1:]) for row in value_rows])
    seq_by_key = dict([[row[0], row[1]] for row in design_rows])

    keys = list(set(values_by_key.keys()).intersection(seq_by_key.keys()))
    cre = array([list(seq_by_key[k]) for k in keys])
    cre_rndvals = array([array(values_by_key[k], float) for k in keys])
    return cre, cre_rndvals, keys
def setIFNB(**kwargs):
    '''Load the IFNB randomization design sequences and their values.

    Reads 'CRE/27k/IFNB_Randomization_Design.txt' (tab separated; column 0
    is the key, column 1 the sequence) and 'CRE/27k/IFNB_Randomization.dat'
    (tab separated; first line skipped, column 0 is the key and the
    remaining columns are converted to floats).

    outputs:
      IFNB:         array with one row per shared key, the sequence split
                    into single characters.
      IFNB_rndvals: array of float rows, one per shared key.
      keys:         keys present in both files, in the row order of the
                    two arrays.
    '''
    def read_rows(relpath):
        with open(cfg.dataPath(relpath)) as handle:
            return [[field.strip() for field in line.split('\t')]
                    for line in handle.readlines()]

    design_rows = read_rows('CRE/27k/IFNB_Randomization_Design.txt')
    value_rows = read_rows('CRE/27k/IFNB_Randomization.dat')[1:]

    values_by_key = dict([(row[0], row[1:]) for row in value_rows])
    seq_by_key = dict([[row[0], row[1]] for row in design_rows])

    keys = list(set(values_by_key.keys()).intersection(seq_by_key.keys()))
    IFNB = array([list(seq_by_key[k]) for k in keys])
    IFNB_rndvals = array([array(values_by_key[k], float) for k in keys])
    return IFNB, IFNB_rndvals, keys
def cluster(similarities, self_sim):
    '''Write similarity and preference files under bdtnp/clustering/nuclei/.

    inputs:
      similarities: square matrix indexed as similarities[i, j].
      self_sim:     single preference value, repeated once per row.

    outputs (on disk):
      Similarities.txt: one "i j value" line (1-based, zero padded indices)
                        for every ordered pair with i != j.
      Preferences.txt:  len(similarities) lines, each holding self_sim.
    '''
    out_dir = cfg.dataPath('bdtnp/clustering/nuclei/')
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    ny = len(similarities)
    sim_path = cfg.dataPath('bdtnp/clustering/nuclei/Similarities.txt')
    pref_path = cfg.dataPath('bdtnp/clustering/nuclei/Preferences.txt')
    with open(sim_path, 'w') as simfile:
        with open(pref_path, 'w') as ssfile:
            simlines = []
            for i in range(ny):
                for j in range(ny):
                    if i == j:
                        continue
                    simlines.append('{0:05d} {1:05d} {2:g}\n'.format(
                        i + 1, j + 1, similarities[i, j]))
            simfile.writelines(simlines)

            preflines = ['{0:0.8g}\n'.format(self_sim)] * ny
            ssfile.writelines(preflines)
def enzyme_link(name):
    '''Return the href of the first link in 'zhang/neb_products.html' whose
    text contains `name` (case-insensitive).

    inputs:
      name: substring to look for in the link text.

    outputs:
      product_link: the 'href' attribute of the first matching <a> element.

    raises:
      IndexError: if no link text contains `name`.
    '''
    with open(cfg.dataPath('zhang/neb_products.html')) as page:
        index = page.read()
    d = pq(index)
    print(d('a'))
    # `this` is not defined in this module: presumably pyquery supplies it
    # to the filter callback as the current element -- TODO confirm.
    named_elt = d('a').filter(
        lambda x: name.lower() in pq(this).text().lower())[0]
    return named_elt.attrib['href']
def load_seqs(seq_dir = cfg.dataPath('zhang/tal_array/seqs')):
    '''Read every file in `seq_dir` into a {name: sequence} dict.

    For each file, the name is the first line without its first character
    and surrounding whitespace; the sequence is all remaining lines,
    stripped and concatenated.
    '''
    seqs = {}
    for entry in os.listdir(seq_dir):
        with open(os.path.join(seq_dir, entry)) as handle:
            lines = handle.readlines()
        header, body = lines[0], lines[1:]
        seqs[header[1:].strip()] = ''.join([l.strip() for l in body])
    return seqs
def setModules(**kwargs):
    '''Collect regulatory modules from every mcmc result file in batch/tmp.

    A module is keyed by the tuple of tf names of one model term. For each
    key a dict of parallel lists is accumulated: 'genes' (target names),
    'coefs', 'fpaths' (result file the term came from) and 'clust_fpaths'
    (the "filename" entry of the matching input record).

    outputs:
      modules:     modules gathered from the "model" entry of each file.
      lin_modules: modules gathered from the "model_linear" entry.
    '''
    files = [l for l in os.listdir(cfg.dataPath("batch/tmp")) if "mcmc" in l]
    fpaths = [os.path.join(cfg.dataPath("batch/tmp"), f) for f in files]
    # The first 10 characters of each file name are used as the id under
    # which its input record is loaded.
    ids = [l[0:10] for l in files]
    inps = [butils.load_data(i, "input") for i in ids]
    modules = {}
    lin_modules = {}
    for fidx, f in enumerate(fpaths):
        print "Getting module info for: {0}".format(f)
        data = sio.loadmat(f)
        # Entries are unwrapped with [0][0] -- presumably loadmat cell
        # arrays; TODO confirm against the .mat layout.
        tfnames = [d[0][0] for d in data["tf_names"]]
        tgnames = [d[0][0] for d in data["gene_names"]]
        coefs = [d[0][0] for d in data["coefs_dic_nonlinear"]]
        inp = inps[fidx]
        # One flattened list of terms per target gene.
        term_list = [list(it.chain(*mod)) for mod in data["model"]]
        for j, terms in enumerate(term_list):
            # Skip targets whose terms are all empty.
            if sum([len(t) for t in terms]) == 0:
                continue
            for k, t in enumerate(terms):
                # t - 1: the stored tf indices are treated as 1-based.
                mod = tuple([tfnames[i] for i in sorted(t - 1)])
                mod_d = modules.get(mod, dict(genes=[], coefs=[], fpaths=[], clust_fpaths=[]))
                mod_d["genes"].append(tgnames[j])
                mod_d["coefs"].append(coefs[j][k])
                mod_d["clust_fpaths"].append(inp["filename"])
                mod_d["fpaths"].append(f)
                modules[mod] = mod_d
        # NOTE(review): lin_coefs is read from "coefs_dic_nonlinear" and is
        # never used; the linear pass below appends `coefs` instead --
        # confirm whether a linear coefficient entry was intended.
        lin_coefs = [d[0][0] for d in data["coefs_dic_nonlinear"]]
        term_list = [list(it.chain(*mod)) for mod in data["model_linear"]]
        for j, terms in enumerate(term_list):
            if sum([len(t) for t in terms]) == 0:
                continue
            for k, t in enumerate(terms):
                mod = tuple([tfnames[i] for i in sorted(t - 1)])
                mod_d = lin_modules.get(mod, dict(genes=[], coefs=[], fpaths=[], clust_fpaths=[]))
                mod_d["genes"].append(tgnames[j])
                mod_d["coefs"].append(coefs[j][k])
                mod_d["fpaths"].append(f)
                mod_d["clust_fpaths"].append(inp["filename"])
                lin_modules[mod] = mod_d
    return modules, lin_modules
def setNet(**kwargs):
    '''Parse a tf/target/weight network file into per-target and per-tf dicts.

    keyword inputs:
      net_name: 'unsup' (default) or 'logistic'; selects the network file.
      reset:    passed (modulo 2) to getTC and getCL as their reset flag.

    Only edges whose tf is a key of TC and whose target is a key of CL are
    kept.

    outputs:
      trg_d: target -> {'color': array, 'tfs': list, 'weights': floats}.
      tf_d:  tf -> {'targets': list, 'weights': floats}.

    raises:
      Exception: if net_name is not one of the two known names.
    '''
    net_name = kwargs.get('net_name', 'unsup')
    if net_name == 'unsup':
        netfile = 'unsup_patrick.txt'
    elif net_name == 'logistic':
        netfile = 'logistic_0.6.txt'
    else:
        raise Exception()
    fpath = config.dataPath('network/patrick/{0}'.format(netfile))

    reset_flag = mod(kwargs.get('reset', 0), 2)
    TC = getTC(reset=reset_flag)
    CL = getCL(reset=reset_flag)
    nwdata = open(fpath).read()

    # Field accessors for the [tf, target, weight] triples built below.
    def tffun(edge):
        return edge[0]

    def trgfun(edge):
        return edge[1]

    def wtfun(edge):
        return float(edge[2])

    r = re.compile(r'^[ ]*(?P<tf>\S+)\s+(?P<target>\S+)\s+(?P<weight>\S+)',
                   re.M)
    matches = list(re.finditer(r, nwdata))

    # Unsorted, parallel lists of the three columns.
    targets = [m.group('target') for m in matches]
    tfs = [m.group('tf') for m in matches]
    weights = [m.group('weight') for m in matches]

    # Edges ordered by tf name, restricted to known tfs and targets.
    cat = []
    for i in np.argsort(tfs):
        if tfs[i] in TC and targets[i] in CL:
            cat.append([tfs[i], targets[i], weights[i]])

    # Per-target information; the running count is stored in the colour.
    trg_d = {}
    count = 0.0
    for k, g in it.groupby(sorted(cat, key=trgfun), key=trgfun):
        edges = list(g)
        count += 1.0
        trg_d[k] = {'color': np.array([count, 0, 0]),
                    'tfs': [tffun(e) for e in edges],
                    'weights': [wtfun(e) for e in edges]}

    # Per-tf information; cat is already ordered by tf.
    tf_d = {}
    for k, g in it.groupby(cat, key=tffun):
        edges = list(g)
        tf_d[k] = {'targets': [trgfun(e) for e in edges],
                   'weights': [wtfun(e) for e in edges]}
    return (trg_d, tf_d)
def view2():
    '''Scatter-plot "improve_ratio" against every input parameter for a
    subsample (every 10th) of the mcmc runs found in batch/outputs.

    The figure is saved to 'figs/soheil/broad_run0_psplits.ps'.

    NOTE(review): the function raises a bare Exception right after saving
    the figure, so the final `return inps` is never reached -- this looks
    like a leftover debugging stop; confirm before relying on the return.
    '''
    files = [l for l in os.listdir(cfg.dataPath("batch/outputs")) if "mcmc" in l]
    # The first 10 characters of a file name are used as the run id.
    ids = [l[0:10] for l in files]
    ids = ids[::10]
    inps = [butils.load_data(i, "input") for i in ids]
    outs = [butils.load_data(i, "output") for i in ids]
    # Filtering by improve_ratio is disabled; every run is kept.
    # idxs_good = nonzero(greater([elt.get('improve_ratio') for elt in outs],, .2 )[0]
    idxs_good = range(len(outs))
    outs = [o for idx, o in enumerate(outs) if idx in idxs_good]
    inps = [i for idx, i in enumerate(inps) if idx in idxs_good]
    params = inps[0].keys()
    f = myplots.fignum(1, (8, 8))
    params = params
    for i, p in enumerate(params):
        # One horizontal strip of the figure per parameter.
        ax = f.add_axes([0.05, i * (1.0 / len(params)), 0.9, 1.0 / len(params)], title=p)
        # ax.set_yticks([])
        # ax.set_xticks([])
        xvals = [elt.get(p) for elt in inps]
        # String-valued parameters cannot be plotted.
        if type(xvals[0]) == str:
            continue
        yvals = [elt.get("improve_ratio") for elt in outs]
        yvals2 = [elt.get("stay_same") for elt in outs]
        # NOTE(review): xvals/yvals/yvals2 are plain lists here. If
        # random.rand returns an array, `+=` extends the list with the
        # noise values instead of adding jitter element-wise -- confirm.
        yvals += random.rand(*shape(yvals)) * (max(yvals) - min(yvals)) / 50
        yvals2 += random.rand(*shape(yvals)) * (max(yvals) - min(yvals)) / 50
        xvals += random.rand(*shape(xvals)) * (max(xvals) - min(xvals)) / 50
        ax.scatter(xvals, yvals)
        # ax.scatter(xvals , yvals + yvals2, 25, color = 'red')
        ax.annotate(p, [0, 0], xycoords="axes fraction", ha="left", va="bottom")
    f.savefig(cfg.dataPath("figs/soheil/broad_run0_psplits.ps"))
    raise Exception()
    return inps
def enzyme_link(name):
    '''Return the href of the first link in 'zhang/neb_products.html' whose
    text contains `name` (case-insensitive).

    inputs:
      name: substring to look for in the link text.

    outputs:
      product_link: the 'href' attribute of the first matching <a> element.

    raises:
      IndexError: if no link text contains `name`.
    '''
    with open(cfg.dataPath('zhang/neb_products.html')) as page:
        index = page.read()
    d = pq(index)
    print(d('a'))
    # `this` is not defined in this module: presumably pyquery supplies it
    # to the filter callback as the current element -- TODO confirm.
    named_elt = d('a').filter(
        lambda x: name.lower() in pq(this).text().lower())[0]
    return named_elt.attrib['href']
def get_leaf_16s(clade):
    '''Gather 16s rRNA sequences for the leaves near the first terminal of
    `clade`, renaming those leaves SEQ0, SEQ1, ...

    NOTE(review): a bare `raise Exception()` follows the gathering loop, so
    as written nothing after it (alignment and tree export) can run -- this
    looks like a leftover debugging stop; confirm before removing it.

    inputs:
      clade: object providing get_terminals() and get_path().
    '''
    cltree = Phylo.BaseTree.Tree(clade)
    leaves = clade.get_terminals()
    l0 = leaves[0]
    # Third-from-last node on the path to the first leaf.
    p0 = clade.get_path(l0)[-3]
    siblings = p0.get_terminals()
    rrnas = []
    random.seed(5)
    t0 = Phylo.BaseTree.Tree(p0)
    ct = 0
    names = []
    for l in t0.get_terminals():
        gbacc= clade_gbacc(l)
        gbid = gbl.search_sorted(gbl.prefix(gbacc), gbacc)
        rrna = rna4gbid(gbid, dbname = '16s')
        # Store each sequence as a list of character codes.
        rrnas.append(list(map(lambda x: ord(x),rrna)))
        l.name = 'SEQ%i '%(ct)
        names.append(l.name)
        ct += 1
    raise Exception()
    arr = array([list(x) for x in rrnas])
    # Per-column count of non-gap characters; columns that are gaps in
    # every sequence are dropped.
    letters = sum( not_equal(arr, ord('-')), 0)
    ungapped_arr = arr[:,nonzero(letters)[0]]
    seq_letters = [''.join([chr(x) for x in y]).replace('-','-') for y in ungapped_arr]
    # There are about a thousand nonzero elements and really quite few
    # gaps in the sequence that we get out of this method.
    align = Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for i in range(len(seq_letters)):
        align.add_sequence(names[i], seq_letters[i])
    AlignIO.write(align,open(config.dataPath('alignments/halo_16s.phylip'),'w'), 'phylip')
    AlignIO.write(align,open(config.dataPath('alignments/halo_16s.fasta'),'w'), 'fasta')
    AlignIO.write(align,open(config.dataPath('alignments/halo_16s.nexus'),'w'), 'nexus')
    #t0 = Phylo.BaseTree.Tree(p0)
    Phylo.write(t0,open(config.dataPath('trees/halo_16s.newick'), 'w'), 'newick')
def mat_tmp_fnames(run_id, num):
    '''Return `num` temporary file paths under batch/tmp ending in '.mat',
    named '<run_id>_tmp000.mat', '<run_id>_tmp001.mat', ...

    The extension is there for matlab, which does not like loading save
    files that lack it.
    '''
    tmp_dir = cfg.dataPath('batch/tmp')
    names = []
    for idx in range(num):
        suffix = '_tmp{0:03d}.mat'.format(idx)
        names.append(os.path.join(tmp_dir, run_id + suffix))
    return names
def profiles(seq, structs, run_id):
    '''Build one covariance model per structure with `cmbuild --rsearch`
    from a single sequence and fixed secondary structures.

    Handling several structures in one call lets the temporary file names
    be generated per structure, which avoids collisions.

    input:
      seq:     a biopython SeqRecord object (its id is used in file names).
      structs: a sequence of structures, each a collection of index pairs.
      run_id:  a run id to avoid collisions of temporary files.

    output:
      paths to the files containing the cm profile for each struct.
    '''
    # First pass: render every structure as bracket notation and format it
    # as stockholm text.
    exemplar_stks = []
    for pairs in structs:
        brackets = ['.'] * len(seq)
        for p in pairs:
            brackets[p[0]], brackets[p[1]] = '(', ')'
        exemplar_stks.append(rutils.stk_format(seq, ''.join(brackets)))

    # Second pass: write each stockholm file and run cmbuild on it.
    matrix = 'infernal/matrices/RIBOSUM85-60.mat'
    cm_paths = []
    for idx, stktext in enumerate(exemplar_stks):
        stkfile = cfg.dataPath('infernal/temp/{0}_{1:03}_{2}.stk'.format(
            seq.id, idx, run_id))
        cmfile = cfg.dataPath('infernal/temp/{0}_{1:03}_{2}.cm'.format(
            seq.id, idx, run_id))
        with open(stkfile, 'w') as fopen:
            fopen.write(stktext)
        cstr = 'cmbuild -F --rsearch {0} {1} {2}'.format(
            cfg.dataPath(matrix), cmfile, stkfile)
        ispc = spc.Popen(cstr, shell=True, stdout=spc.PIPE)
        ispc.communicate()
        cm_paths.append(cmfile)
    return cm_paths
def set_motifs(**kwargs):
    '''Run motif-match on the promoter sequences and group hits by name.

    The file 'CRE/<promoter_type>_for_motifs.txt' is piped into
    `motif-match`; every output line is split on single spaces.

    outputs:
      seqs: dict mapping field 1 of each output line to a list of hit
            dicts with keys 'motif' (field 0), 'start' (2), 'end' (3),
            'strand' (4) and 'score' (6).
    '''
    mfpath = cfg.dataPath('motifs/all_vert_motifs.txt')
    fpath = cfg.dataPath('CRE/{0}_for_motifs.txt'.format(promoter_type))
    cmd = 'motif-match -n 1 -m {0} -V 1'.format(mfpath)
    prc = spc.Popen(cmd, shell = True, stdin = spc.PIPE, stdout = spc.PIPE)
    with open(fpath) as handle:
        stdin_text = handle.read()
    stdout_text = prc.communicate(input = stdin_text)[0]

    seqs = {}
    for line in stdout_text.splitlines():
        fields = line.split(' ')
        name = fields[1]
        hit = {'motif': fields[0],
               'start': int(fields[2]),
               'end': int(fields[3]),
               'strand': fields[4],
               'score': float(fields[6])}
        seqs.setdefault(name, []).append(hit)
    return seqs
def sort_prefixes(volume_name="cb"):
    '''Sort the lines of every file in the genbank/prefixes directory in
    place, printing each file name once it has been rewritten.

    inputs:
      volume_name: accepted but not used by this function.
    '''
    prefix_path = config.dataPath(config.dataURL("genbank/prefixes"))
    for p in os.listdir(prefix_path):
        f = os.path.join(prefix_path, p)
        with open(f) as src:
            lsort = sorted(src.readlines())
        with open(f, "w") as dst:
            dst.writelines(lsort)
        print(p)
def load_TS(reset = 0):
    '''Return the time series data, from the mem cache or from the pickle.

    inputs:
      reset: if false (default) the value is read from the mem cache and an
             Exception is raised when the read does not succeed; if true
             the value is unpickled from 'network/TC.pickle' and written
             to the cache.
    '''
    hardcopy = True
    if reset:
        with open(config.dataPath('network/TC.pickle')) as handle:
            out = pickle.load(handle)
        mem.write(default_name, out, hardcopy = hardcopy, np = False)
        return out

    # No reason to vary the cache name: only one entry is available.
    out, sxs = mem.read(default_name, hardcopy = hardcopy, np = False)
    if not sxs:
        raise Exception()
    return out
def select_exemplars_from_clustering(structs,struct_counts,seq, draw = False):
    '''Pick exemplar structures by clustering, by count and by energy.

    inputs:
      structs:       list of structures.
      struct_counts: per-structure counts, parallel to structs.
      seq:           the sequence the structures belong to.
      draw:          if true, draw the clusters and save the figure.

    outputs:
      final_structs, final_energies: parallel tuples for the union of the
      three exemplar sets.

    NOTE(review): `savename`, `n_countsorted` and `n_esorted` are neither
    parameters nor locals here; unless they exist as module globals the
    function fails with NameError -- confirm where they are meant to come
    from.
    '''
    # Keep structures seen at least twice; if that leaves fewer than 10,
    # fall back to keeping everything seen at least once.
    min_count = 2
    freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count]
    if len(freq_structs) < 10:
        min_count = 1
        freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count]
    # Filter the counts with the same threshold so they stay parallel.
    struct_counts= [s for i, s in enumerate(struct_counts) if s >= min_count]
    structs = freq_structs
    struct_energies = [struct_energy(seq, s) for s in structs]
    # Cap the working set at the 225 entries with the largest energy value.
    if len(structs) > 225:
        high_e = argsort(struct_energies)[::-1][:225]
        structs =[ structs[i] for i in high_e]
        struct_counts =[ struct_counts[i] for i in high_e]
        struct_energies =[ struct_energies[i] for i in high_e]
    clusters = cluster_2(structs, struct_counts, seq, ptype = 'full_pairs')
    if draw:
        print 'DRAWING Clusters'
        verts = struct_verts(structs, seq, 'tempname')
        cluster_2_show(clusters, verts)
        f = plt.gcf()
        f.savefig(cfg.dataPath('figs/RNAfoldz/clusters_{0}.ps'.format(savename)))
    exemplars = set(clusters)
    cluster_exemplars = []
    for e in exemplars:
        # (index, energy) rows for the members of cluster e; despite its
        # name, min_rep is the member with the largest energy value.
        reps =array([ (i, eng) for i, eng in enumerate(struct_energies) if clusters[i] == e])
        min_rep = reps[:,0][argmax(reps[:,1])]
        cluster_exemplars.append(min_rep)
    cluster_exemplars = set([int(e) for e in cluster_exemplars])
    # Top entries by count and by energy value respectively.
    sorted_exemplars = set(argsort(struct_counts)[::-1][:n_countsorted])
    energy_exemplars = set(argsort(struct_energies)[::-1][:n_esorted])
    final_exemplars = cluster_exemplars.union(sorted_exemplars).union(energy_exemplars)
    print '''Structural exemplars found: Clustering: {0} {4} Count sorting: {1} {5} Energy sorting: {2} {6} Total unique: {3}'''.format(len(cluster_exemplars),len(sorted_exemplars), len(energy_exemplars),len(final_exemplars), mean([struct_energies[i] for i in cluster_exemplars]), mean([struct_energies[i] for i in sorted_exemplars]), mean([struct_energies[i] for i in energy_exemplars]))
    final_structs, final_energies = zip(*[(structs[i],struct_energies[i]) for i in final_exemplars])
    return final_structs, final_energies
def get_run_num():
    '''Return the next free run number.

    Every run of digits found in the file names under batch/inputs,
    batch/outputs and batch/logs is read as an integer; the result is one
    more than the largest of them, or 0 when there are none.
    '''
    listing = []
    for d in ['batch/inputs', 'batch/outputs', 'batch/logs']:
        listing.extend(os.listdir(cfg.dataPath(d)))
    digit_runs = re.findall(re.compile('([0-9]+)'), ' '.join(listing))
    # The -1 sentinel makes an empty listing yield run number 0.
    candidates = [int(e) for e in digit_runs] + [-1]
    return max(candidates) + 1
def write_seqs_to_motifs():
    '''Write the mutant sequences to 'CRE/<promoter_type>_for_motifs.txt'.

    Each sequence from get_mutants() becomes a record of the form

        A <key> 1 <len(cons)>
        ><promoter_type>
        <sequence, lower-cased>
        <blank line>

    where cons is the value returned by get_cons().
    '''
    seqs, rnd, keys = get_mutants()
    cons = get_cons()
    records = []
    for i, c in enumerate(seqs):
        k = keys[i]
        records.append('\n'.join(['A {0} 1 {1}'.format(k, len(cons)),
                                  '>{0}'.format(promoter_type),
                                  ''.join(c).lower(),
                                  '\n']))
    # Close the file explicitly so the data is on disk when this returns.
    path = cfg.dataPath('CRE/{0}_for_motifs.txt'.format(promoter_type))
    with open(path, 'w') as outfile:
        outfile.write(''.join(records))
def errors():
    '''Collect error statistics from every mcmc result file in batch/tmp.

    Only files whose input record has "out_iter_num" greater than -1 are
    read.

    outputs:
      errors:    list of the "error" entries, one per file.
      staysames: list of the "stay_same" entries.
      improves:  list of the "improve_ratio" entries.
      gnames:    the "gene_names" entry of the last file read, or None if
                 no file was read.
    '''
    tmp_dir = cfg.dataPath("batch/tmp")
    files = [l for l in os.listdir(tmp_dir) if "mcmc" in l]
    fpaths = [os.path.join(tmp_dir, f) for f in files]
    # The first 10 characters of a file name are used as the run id.
    ids = [l[0:10] for l in files]
    inps = [butils.load_data(i, "input") for i in ids]
    idxs_good = nonzero(greater([elt.get("out_iter_num") for elt in inps], -1))[0]
    fpaths = [fpaths[i] for i in idxs_good]

    error_vals, staysames, improves = [], [], []
    # Bound up front so an empty selection returns None instead of failing
    # on an unbound name at the return statement.
    gnames = None
    for f in fpaths:
        data = sio.loadmat(f)
        error_vals.append(data["error"])
        staysames.append(data["stay_same"])
        improves.append(data["improve_ratio"])
        gnames = data["gene_names"]
    return error_vals, staysames, improves, gnames
def _write_rna(run_id, struct, seqs, seqnames):
    '''Write a datafile for the mcmc tree builder in phase.

    The file starts with a line giving the number of sequences, their
    length and the sequence type, e.g. '16 3571 STRUCT'. After a blank line
    comes the structure in bracket notation (wrapped), another blank line,
    and then for each sequence its name on one line followed by the wrapped
    sequence and a blank line.

    inputs:
      run_id:   names the directory 'phase/<run_id>/' receiving
                'datafile.rna'.
      struct:   the structure, specified as pairs.
      seqs:     list of sequence strings; the first one's length is used
                as the common length.
      seqnames: list of names, parallel to seqs.
    '''
    rutils = RNAfoldz.utils
    l = len(seqs[0])
    n = len(seqs)
    dtype = 'STRUCT'

    stk = rutils.pairs_stk(struct, l)
    parts = ['{0} {1} {2}\n'.format(n, l, dtype),
             '\n',
             '\n'.join(tw.wrap(stk)) + '\n\n']
    for seq, name in zip(seqs, seqnames):
        parts.append(name + '\n')
        parts.append('\n'.join(tw.wrap(seq)) + '\n')
        parts.append('\n')

    datafile = cfg.dataPath('phase/{0}/datafile.rna'.format(run_id))
    # Close the file explicitly so the data is on disk when this returns.
    with open(datafile, 'w') as fopen:
        fopen.write(''.join(parts))
    return
def draw_remote_runs(show = 'conservation'):
    '''Unpickle and display every 'ra2_' output file in batch/outputs.

    The first matching file (in listing order) is skipped. Each remaining
    file is unpickled and handed to rplots.show_output.

    inputs:
      show: accepted but not used by this function.

    outputs:
      outs: list of the unpickled objects, in the order they were shown.
    '''
    outdir = cfg.dataPath('batch/outputs')
    files = [os.path.join(outdir, f)
             for f in os.listdir(outdir) if 'ra2_' in f][1:]
    # Collect what is shown: the function previously returned a name that
    # was never assigned. The former `transform` switch is gone because
    # both of its branches passed the output through unchanged.
    outs = []
    for idx, f in enumerate(files):
        print('{0} of {1} files'.format(idx, len(files)))
        print(f[-100::])
        with open(f) as fopen:
            out = pickle.load(fopen)
        rplots.show_output(out)
        outs.append(out)
    return outs
def family_clustered_suboptimals(rfid, plots = True, num = 5000, min_count = 2, n_countsorted = 10, n_esorted = 10, draw = False, cluster_type = 'just_list', savename = None):
    '''Sample suboptimal structures for an Rfam family's reference sequence
    and select exemplars from them.

    inputs:
      rfid:         rfam family id.
      num:          number of structures requested from suboptimals().
      draw:         if true, draw the final exemplars and save the figure.
      cluster_type: 'full_clustering' or 'just_list'; selects the exemplar
                    selection routine.
      savename:     figure name suffix; defaults to rfid.
      plots, min_count, n_countsorted, n_esorted: accepted but not used in
                    this function body.

    outputs:
      None. NOTE(review): nothing is returned on any path, and the
      'full_clustering' branch returns before the drawing step -- confirm
      this is intended.
    '''
    if savename == None:
        savename = rfid
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]
    # Attach to every tree terminal the alignment row whose name matches
    # the text between the first pair of underscores in the terminal name.
    for i, n in enumerate(tree.get_terminals()):
        match = re.compile('_([^_]*)_').search(n.name)
        if not match or not '/' in match.group(1):
            this_seq = []
        else:
            term_id = match.group(1)
            this_seq = ali[ali_ids.index(term_id)]
        n.m = {'seq':this_seq, 'probs':[1 for j in range(len(this_seq))]}
    big_refnode, big_refseq = \
        subtree_refseq(tree)
    ungapped_ref = ungapped_seq(big_refseq, rfid)
    seq = ungapped_ref
    structs = suboptimals(ungapped_ref, sp_method = 'sample',name = rfid, n = num)
    # Deduplicate the structures via their bracket strings, keeping for
    # each distinct string the index of one representative and its count.
    stks = [pairs_stk(s,len(seq)) for s in structs]
    stk_srt = sorted([ (i,s) for i,s in enumerate(stks)], key = lambda x: x[1])
    stk_groups = [ list(g) for k, g in it.groupby(stk_srt,key =lambda x: x[1])]
    stk_unq, struct_counts = zip(*[( g[0][0] , len(g)) for g in stk_groups])
    structs = [structs[elt] for elt in stk_unq ]
    if cluster_type == 'full_clustering':
        final_structs, final_energies = select_exemplars_from_clustering(structs,struct_counts,seq, draw = draw)
        return
    elif cluster_type == 'just_list':
        final_structs, final_energies = select_exemplars_from_list(structs,struct_counts,seq, draw = draw)
    # NOTE(review): any other cluster_type leaves final_structs unbound;
    # with draw set, the resulting error is swallowed by the except below.
    if draw:
        try:
            print 'DRAWING final subopts'
            verts = struct_verts(final_structs, seq, rfid )
            show_subopts(final_structs, verts, final_energies)
            f = plt.gcf()
            f.savefig(cfg.dataPath('figs/RNAfoldz/exemplars_{0}.ps'.format(savename)))
        except Exception, e:
            # Drawing is best effort: failures are reported and ignored.
            print "EXCEPTION!"
            pass
def tree_similarity(dist1, dist2, run_id,criterion = 'knn', k = 6):
    '''
    Compare two pairwise distance matrices through their k nearest
    neighbours and save a summary figure.

    dist1, dist2: distance matrices indexed as dist[i, j]; assumed square
                  and of equal size -- TODO confirm against callers.
    run_id:       inserted into the saved figure's file name.
    criterion:    only 'knn' is handled; any other value does nothing.
    k:            number of neighbours taken per row.

    Nothing is returned; the figure is written under 'figs/gpm2/'.
    '''
    if criterion == 'knn':
        nq = len(dist1)
        # Column 0 of each sorted row is skipped (presumably the point itself
        # at distance zero -- confirm); the next k indices are the neighbours.
        nb1 = argsort(dist1, 1)[:,1:k+1]
        nb2 = argsort(dist2, 1)[:,1:k+1]

        all_nbs = [set(n1).union(set(n2)) for n1, n2 in zip(nb1, nb2)]
        nb_intersection = [set(n1).intersection(set(n2)) for n1, n2 in zip(nb1, nb2)]
        # For each row i: a (dist1, dist2) pair for every neighbour in the union.
        nb_dists = [ array([[dist1[i, n], dist2[i,n]]for n in nbs ]) for i,nbs in enumerate(all_nbs)]
        #take the first k distances.
        n_disagreements = [len(nbd) - k for nbd in nb_dists]
        nb_dists = array([ sorted(nbd, key = lambda x: min(x))[:k] for nbd in nb_dists])

        # Per-neighbour difference between the two distances, relative and absolute.
        frac_diffs = [abs(diff(elt, 1).flatten()) / mean(elt,1) for elt in nb_dists]
        abs_diffs = [abs(diff(elt, 1).flatten()) for elt in nb_dists]

        ct = mycolors.getct(nq)
        f = myplots.fignum(4, (10,8))
        ax = f.add_axes([.05,.08,.25,.87])
        seismic.seismic(abs_diffs, ax = ax, colors = ct)

        # Mean over rows of |intersection| / |union| of the two neighbour sets.
        jaccard = mean([float(len(nb_intersection[i])) / float(len(all_nbs[i])) for i in range(nq)])

        ax2 = f.add_axes([.34,.08,.6,.87])
        for i,d in enumerate(nb_dists):
            ax2.scatter(d[:,0], d[:,1], 20, alpha = .5,color =ct[i])

        # Element 2 of the linregress result is squared to report R-squared.
        lin = linregress(nb_dists[:,:,0].flatten(),nb_dists[:,:,1].flatten())
        rsquared = lin[2]**2

        ax2.annotate('NN dists for multi/struct-aligned trees.\nK = {0}'.format(k),
                     [0,1], xycoords = 'axes fraction', va = 'top',
                     xytext = [10,-10],textcoords = 'offset pixels')
        # jaccard is already a scalar mean; the extra mean() leaves it unchanged.
        ax2.annotate('R-Squared: {0:3.3}\nJaccard Index: {1:3.3}'.format(rsquared, mean(jaccard)),
                     [1,0], xycoords = 'axes fraction', ha = 'right',
                     xytext = [-10,10],textcoords = 'offset pixels')
        ax2.set_xlabel('Muscle aligned tree distances')
        ax2.set_ylabel('Struct algined tree distances')

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_dists_{0}_k{1}.tiff'.format(run_id, k))
        f.savefig(datafile)
import networkx as nx
from numpy import *
import subprocess as spc
import compbio.config as cfg
import os, re

# Directory in which flow problem input and output files are kept.
default_flow_dir = cfg.dataPath('graph_flows')
def run_flow(g, gid):
    '''
    Write graph g as a flow problem, run the solver step, and return the
    parsed result.

    g:   graph object exposing nodes() and edges().
    gid: identifier used in the per-problem file names.

    The default flow directory is created if it does not exist.
    '''
    if not os.path.isdir(default_flow_dir): os.mkdir(default_flow_dir)
    write_flow(default_flow_dir, g, gid)
    compute_flow(default_flow_dir, gid)
    return parse_flow(default_flow_dir, gid)
def flow_inp_file(flow_dir, gid):
    '''Return the path of the input file for problem gid inside flow_dir.'''
    return os.path.join(flow_dir, 'flow_{0}.inp'.format(gid))
def flow_out_file(flow_dir, gid):
    '''Return the path of the output file for problem gid inside flow_dir.'''
    return os.path.join(flow_dir, 'flow_{0}.out'.format(gid))
def write_flow(flow_dir, g, gid):
    '''
    Build the text lines describing graph g as a flow problem.

    NOTE(review): only the first two header lines are assembled in the code
    visible here and nothing is written to disk yet -- confirm the rest of
    the body.
    '''
    nn = len(g.nodes())
    ne = len(g.edges())
    lines = []
    # Problem line carrying node and edge counts (looks like the DIMACS
    # min-cost-flow header -- confirm).
    lines.append('p min {0} {1}'.format(nn, ne))
    lines.append('n 1 0')
from numpy import *
import numpy as np, itertools as it
import matplotlib.pyplot as plt
import compbio.utils.plots as myplots
import compbio.utils.colors as mycolors
import compbio.utils.memo as mem
import compbio.config as cfg
import pickle

figsize = (8,8)
figtype = 'ps'
# '{{0}}' survives this format() call as a literal '{0}', so figfile is itself
# a template with one slot left to fill; only the extension is fixed here.
figfile = cfg.dataPath('figs/gpm2/pt3_fana/{{0}}.{0}'.format(figtype))

do_make_figs = True
if do_make_figs:
    do_make_subopts = False

#flist = [50,311,140,143,495,637,1304]
flist = [311,1304]
def setFamData(rfid = None, ftype = None,**kwargs):
    '''
    Load the stored outputs for one family.

    rfid:  family id; required (asserted truthy).
    ftype: required (asserted truthy); 'all' selects the 'FA' file prefix,
           anything else selects 'RS'.

    NOTE(review): bsu is not among the imports visible here, and the code
    shown loads sdat/tdat without returning them -- confirm the rest of the
    body.
    '''
    assert rfid; assert ftype;
    fprefix = 'FA' if ftype == 'all' else 'RS'
    sdat = bsu.load_data('{1}_{0}'.format(rfid,fprefix), 'output')
    tdat = bsu.load_data('{1}_tree_{0}'.format(rfid,fprefix), 'output')
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    '''
    Compare trees built from muscle alignments with trees built from
    structural (cmalign) alignments of the same sequences.

    For each pair of alignments (one 'muscle', one 'struct') a tree is built
    with phyml, terminal-to-terminal distance matrices are filled in,
    tree_similarity() is run twice (all neighbours, then k = 6), and both
    trees are drawn as graph embeddings into one .ps figure.

    seqs, profiles: forwarded unchanged to setAlignments through the memo
                    layer.
    run_id:         used in memo register names, tree runs and file names.
    reset:          forwarded to mem.getOrSet.

    Returns None; figures are written under 'figs/gpm2/'.
    '''
    import networkx

    # Single-argument print(...) behaves identically under Python 2.
    print('computing alignments...')
    print(' ...using muscle')
    malis, mrefs, mpairs = mem.getOrSet(
        setAlignments,
        **mem.rc({},
                 seqs = seqs, profiles = profiles,
                 run_id = run_id, ali_type = 'muscle',
                 reset = reset,
                 on_fail = 'compute',
                 register = 'tuali_musc_{0}'.format(run_id)))
    print(' ...using cmalign.')
    salis, srefs, spairs = mem.getOrSet(
        setAlignments,
        **mem.rc({},
                 seqs = seqs, profiles = profiles,
                 run_id = run_id, ali_type = 'struct',
                 reset = reset,
                 on_fail = 'compute',
                 register = 'tuali__struct_{0}'.format(run_id)))

    print(' ...making trees.')
    for idx, (m, s) in enumerate(zip(malis, salis)):
        mtree = phyml.tree(m, run_id, bionj = True)
        stree = phyml.tree(s, run_id, bionj = True)

        # Row/column index of every sequence id, taken from the muscle alignment.
        maps = dict([(elt.id, i) for i, elt in enumerate(m)])
        n_seqs = len(maps)
        mdists = zeros((n_seqs, n_seqs))
        sdists = zeros((n_seqs, n_seqs))
        for dists, t in ((mdists, mtree), (sdists, stree)):
            terminals = t.get_terminals()
            for n1 in terminals:
                for n2 in terminals:
                    dists[maps[n1.name], maps[n2.name]] = t.distance(n1, n2)

        run_label = '{0}_struct_{1}'.format(run_id, idx)
        # All other sequences as neighbours. The original wrote
        # len(sdists - 1), which subtracts 1 from every matrix element and
        # then takes the length, i.e. n rather than the intended n - 1.
        tree_similarity(sdists, mdists, run_label, k = len(sdists) - 1)
        tree_similarity(sdists, mdists, run_label, k = 6)

        f = myplots.fignum(4, (8,10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        for t, sp, ttype in zip([mtree, stree],
                                [211, 212],
                                ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            # Lay out an integer-labelled copy, then map positions back onto
            # the original node objects.
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            # Only nodes whose name is a known sequence id get a visible marker.
            networkx.draw(G, posn,
                          labels = dict([(n, '') for n in G.nodes()]),
                          node_size = [100 if n.name in maps else 0
                                       for n in G.nodes()],
                          width = 1, edge_color = 'black',
                          ax = a,
                          node_color = [ct[maps.get(n.name, -1)]
                                        for n in G.nodes()])

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                       [0,1], xycoords = 'axes fraction', va = 'top',
                       xytext = [10,0], textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                       [1,0], xycoords = 'axes fraction', ha = 'right',
                       xytext = [-10,10], textcoords = 'offset pixels')

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
#!/usr/bin/env python ''' nt.py Contains a few utilities for looking up nucleotide level info for zhang lab sequences of interest. ''' import compbio.config as cfg from Bio import SeqIO ntfiles = { 'nrx':cfg.dataPath('sequences/zhang/nt/nrx1_human_nt.gb'), 'nlg':cfg.dataPath('sequences/zhang/nt/nlg1_human_nt.gb') } aafiles = { 'nrx':cfg.dataPath('sequences/zhang/aa/nrx1_human_aa.gb'), 'nlg':cfg.dataPath('sequences/zhang/aa/nlg1_human_aa.gb') } def get_seq( name, aa = True): seq = SeqIO.parse(open(aafiles[name]), 'genbank') if aa \ else SeqIO.parse(open(ntfiles[name]),'genbank') return seq
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
                   draw_distances = draw_all_easy,
                   draw_clusters = draw_all_easy,
                   draw_single_cluster = draw_all_hard):
    '''
    Cluster the sequences of an Rfam family by pairwise distance and return
    the sequences of every cluster.

    Steps performed by this body:
      1) Optionally (draw_distances) plot tree-based distances against
         Levenshtein distances and save the figure.
      2) Fetch the distance matrix through the memo layer (setDistances)
         and cut it into clusters with maxclust_dists(k = 5,
         method = 'complete').
      3) Optionally (draw_clusters) plot the clusters in the first two
         principal components and save the figure.
      4) Group alignment rows by cluster label.

    kwds:
      rfid            family id passed to rfam.get_fam.
      reset           forwarded to mem.getOrSet.
      tree            NOTE(review): immediately overwritten by the tree
                      returned from rfam.get_fam, so the argument has no
                      effect here.
      draw_distances, draw_clusters
                      enable the two optional figures.
      draw_single_cluster
                      not read in this body.

    The original notes also mention a 'struct_align' keyword (structural
    alignment vs MUSCLE); no such parameter exists on this function.

    Returns a list with one list of alignment rows per cluster.
    '''
    rutils = utils
    ali, tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        # Element 2 of the linregress result is squared to report R-squared.
        lin = linregress(dtf, dlf)
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')
        ax.scatter(dtf, dlf, 100)
        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)

    dists = mem.getOrSet(setDistances, ali = ali, tree = tree, run_id = rfid,
                         register = rfid,
                         on_fail = 'compute',
                         reset = reset)
    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # Shift labels down by one (presumably 1-based to 0-based so they can
    # index the colour table below -- confirm).
    clusters -= 1

    if draw_clusters:
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')
        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)
        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)

    #now take the largest cluster and do the analysis.
    # cgrps maps cluster label -> list of (sequence index, label) pairs.
    cgrps = dict([ (k, list(g)) for k , g in it.groupby(\
        sorted( list(enumerate(clusters)),key = lambda x: x[1]), key = lambda x: x[1])])
    cbig = argmax([len(x) for x in cgrps.values()])
    cluster_seqs = [ elt[0] for elt in cgrps.values()[cbig] ]
    csize = len(cluster_seqs)
    seqs =[ali[c] for c in cluster_seqs]

    # Disabled diagnostic plot for the largest cluster; never executed.
    if 0:
        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists)
        colors =[ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))]
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')
        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0],dists[s,:] ,200 *exp(-(dists[s,:] / .5) **2), color = colors, alpha = .2)
        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)

    # One list of sequence indices per cluster, then the matching alignment rows.
    clusters_final = [ [ elt[0] for elt in cgrps.values()[i] ] for i in range(len(cgrps.values()))]
    seqs_final = [ [ ali[idx] for idx in clust ] for clust in clusters_final]
    return seqs_final
def save_muts_structs(out, out_tree):
    '''
    Pickle out and out_tree to their fixed locations under 'RNAfoldz/'.

    out:      object written to 'RNAfoldz/out.pickle'.
    out_tree: object written to 'RNAfoldz/out_tree.pickle'.

    Each file is closed even if pickling raises. The open mode is kept as
    'w' so files stay readable by code that opens them in text mode.
    '''
    with open(cfg.dataPath('RNAfoldz/out.pickle'), 'w') as ofile:
        pickle.dump(out, ofile)
    with open(cfg.dataPath('RNAfoldz/out_tree.pickle'), 'w') as otfile:
        pickle.dump(out_tree, otfile)
def _clear_folder(run_id):
    '''
    Remove every entry listed in the 'phase/<run_id>/' data directory.

    Entries are removed with os.remove, so the directory itself is kept.
    '''
    run_dir = cfg.dataPath('phase/{0}/'.format(run_id))
    entries = os.listdir(run_dir)
    targets = [os.path.join(run_dir, entry) for entry in entries]
    for path in targets:
        os.remove(path)
import align as ali
import compbio.config as cfg
import os
import Bio.SeqIO as sio

# Default sequencing directory; expected to hold 'refs' and 'results' subtrees.
default = cfg.dataPath('zhang/sequencing/piggybac')
def run_directory(directory = default):
    '''
    Align every sequencing result under directory against every reference
    and print the outcome.

    directory: root containing a 'refs' subtree (fasta references) and a
               'results' subtree (sequencing result files).

    Only the first record of each reference fasta file is used. Nothing is
    returned; results are printed.
    '''
    # Every file anywhere below <directory>/refs.
    refs = [os.path.join(root, f)
            for root, dirs, files in os.walk(os.path.join(directory, 'refs'))
            for f in files]
    all_refs = [sio.parse( f, format = 'fasta').next() for f in refs]
    # Every file anywhere below <directory>/results.
    results = [os.path.join(root, f)
               for root, dirs, files in os.walk(os.path.join(directory, 'results'))
               for f in files]
    result_sequences = load_genewiz_seqs(results)
    for r in all_refs:
        print 'Aligning to reference: {0} '.format(r)
        out = align_seqs(r, result_sequences)
        for k,v in out.iteritems():
            print 'result {0}: {1}'.format(k,v)
def load_genewiz_seqs(filenames):
    '''
    Read the given sequencing result files.

    NOTE(review): the code visible here only opens each file and reads its
    lines; seqs is never filled or returned and the handles are not closed
    -- confirm the rest of the body.
    '''
    seqs = {}
    for f in filenames:
        fopen = open(f)
        lines = fopen.readlines()
def get_consensus(rfid = 'RF00', mweight = .5, refseq_method = 'root',
                  sp_method = 'sample', aff_type = 'pairs', reset = True,
                  do_plot = False, run_id = 'CONS_TEST'):
    '''
    Annotate the family tree with alignment rows, derive the ungapped
    reference sequence, and return family_exemplar_structs(...) for rfid.

    rfid:           family id passed to rfam.get_fam.
    refseq_method:  forwarded to subtree_refseq and family_exemplar_structs.
    sp_method, aff_type:
                    forwarded to family_exemplar_structs.
    run_id:         only used by the unreachable code below.

    mweight, reset and do_plot are not read in this body.

    NOTE(review): the return statement after the reference sequence is
    computed makes everything below it unreachable. That dead code reads
    names never assigned in this function (exemplar_structs, inds, pca_vecs,
    title), so the early return cannot simply be deleted.
    '''
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    # Attach the matching alignment row to every terminal of the tree; the id
    # is the token between two underscores in the terminal name.
    for i, n in enumerate(tree.get_terminals()):
        term_id = re.compile('_([^_]*)_').search(n.name).group(1)
        this_seq = ali[ali_ids.index(term_id)]
        n.m = {'seq':this_seq, 'probs':[1 for j in range(len(this_seq))]}

    #if do_plot : rplots.plot_clusters(inds,{'pca embedding':pca_vecs},title = title,plot3d = True)

    big_refnode, big_refseq = \
        subtree_refseq(tree, method = refseq_method)
    ungapped_ref = rutils.ungapped_seq(big_refseq, rfid)

    #pca_vecs,exemplar_structs =
    return family_exemplar_structs(rfid,
                                   sp_method = sp_method,
                                   refseq_method = refseq_method,
                                   aff_type = aff_type,
                                   )

    # ---- unreachable from here on (see NOTE in the docstring) ----
    struct_profiles = infernal.profiles(ungapped_ref,exemplar_structs, run_id)
    clades = split_tree(tree)

    # One empty slot per (clade, structure profile) for each weighting scheme.
    all_vecs = {'all_time':[ [ [] for i in range(len(struct_profiles))] for j in range(len(clades)) ],
                'all_mut':[ [ [] for i in range(len(struct_profiles))] for j in range(len(clades)) ],
                'fiftyfifty':[ [ [] for i in range(len(struct_profiles))] for j in range(len(clades)) ]}

    aamuts, aatimes, aairr, aagaps = [], [], [], []
    for idx_clade, c in enumerate(clades):
        if len(c.get_terminals()) < 3:
            print 'SKIPPPING CUZ SUBTREE TOO SMALL'
            continue
        c_ids = [ n.m['seq'].name for n in c.get_terminals() ]
        # Skip clades in which any sequence name occurs more than once.
        if len(nonzero(greater([len(list(g)) for k, g in it.groupby(sorted(c_ids))],1))[0])>0:
            print 'SKIPPING CUZ THERE ARE TWO COPIES OF SOME F*****G SEQUENCE IN TREE'
            continue

        all_muts, all_times , all_gaps, all_irr = [], [], [], []
        print
        print 'Clade: {0}'.format(idx_clade)
        for idx_struct, struct_info in enumerate( zip( struct_profiles, exemplar_structs)):
            struct_profile, ex_struct = struct_info
            ngaps = 0

            #OLD ALIGNMENTS
            calis = ba.MultipleSeqAlignment(\
                [n.m['seq'] for n in c.get_terminals() ])
            #NEW ALIGNMENTS AND REF STRUCTURE
            c_new_ali , stk, struct = infernal.alignment(calis, struct_profile, rfid)
            #REF STRUCTURE PAIRS
            pairs = rutils.stk_pairs(struct)

            # The realigned structure must have as many pairs as the exemplar.
            if len(pairs) != len(ex_struct): raise Exception()

            # Rename terminals N0, N1, ... and attach the realigned rows.
            cterms = c.get_terminals()
            for i2, ct in enumerate(cterms):
                lilid = 'N{0}'.format(i2)
                ct.name = lilid
                ct.m['str_seq'] = c_new_ali[i2]
                ct.m['str_seq'].id = lilid
                ct.m['probs'] = ones(len(c_new_ali[i2]))

            #BUILD A TREE
            tr = phy.BaseTree.Tree(c)

            #RUN PAML
            # NOTE(review): both format fields use index 0, so idx_struct never
            # appears in the id and every structure of a clade shares one run
            # id -- the second field was probably meant to be {1:03}.
            paml_run_id = 'ali_anc_c{0:04}_s{0:03}'.format(idx_clade,idx_struct)
            rstfile= paml.run_paml(tr, c_new_ali, run_id = paml_run_id)
            anc_tree = paml.rst_parser(rstfile)

            #Label extent and internal nodes with sequences.
            for term in anc_tree.get_terminals():
                #Terminals have old (rfam) alis and new (infernal) alis
                term.m = filter( lambda x: x.name == term.name, cterms)[0].m
            for node in anc_tree.get_nonterminals():
                #Internals only have new alis. m['seq'] = m['str_seq']
                node.m['str_seq'] = node.m['seq']
                node.m['str_seq'].seq = node.m['str_seq'].seq.replace('T', 'U')
            subtree = anc_tree

            #Evaluate all of the structs on the first pass
            #to have access to mean frequencies of different
            #mutational types in the final score computation
            refnode, refseq = subtree_refseq(subtree, method = refseq_method)
            muts, times, gaps, irresolvables = subtree_count_struct(subtree, pairs)
            all_muts.append(muts)
            all_times.append(times)
            all_gaps.append(gaps)
            all_irr.append(irresolvables)

        compute_signatures(all_vecs,idx_clade,
                           all_muts,all_times,
                           exemplar_structs,ungapped_ref )

        aamuts.append(all_muts)
        aatimes.append(all_times)
        aairr.append(all_irr)
        aagaps.append(all_gaps)

    outputs = {
        'all_vecs':all_vecs,
        'all_muts':aamuts,
        'all_times':aatimes,
        'exemplar_structs':exemplar_structs,
        'reference_seq':ungapped_ref,
        'thermo_ex_inds':inds,
        'thermo_embedding':pca_vecs,
        'title':title,
        'thermo_aff_type':aff_type,
        'tree':tree,
        'run_id':run_id
        }

    pickle.dump(outputs, open(cfg.dataPath('cs874/runs/{0}.pickle'.format(run_id)),'w'))
    return(outputs)
def _write_ml_ctl(run_id, outgroup_name):
    '''
    Write the control file 'phase/<run_id>/control.ml' for an ML tree run.

    run_id:        names the per-run directory holding datafile.rna,
                   outfile.phylip and the control file.
    outgroup_name: substituted into the TREE block as the outgroup.

    The file is assembled from four text sections (data, model, tree, run
    settings) joined by newlines. Closing tags such as {\\DATAFILE} contain
    a literal backslash, written here as an explicit '\\\\' escape.
    '''
    datafile = cfg.dataPath('phase/{0}/datafile.rna'.format(run_id))
    outfile = cfg.dataPath('phase/{0}/outfile.phylip'.format(run_id))
    cfgfile = cfg.dataPath('phase/{0}/control.ml'.format(run_id))

    data = '''
#Phylogenetic tree reconstruction in the ML framework with mlphase
#The dataset in this example is small and mlphase can be used.
{DATAFILE}
Data file = %(datafile)s
Interleaved data file = no
#Use the "automatic method" to analyse this dataset:
#unpaired nucleotides ('.' in the secondary structure) are
#handled by the MODEL1 of the MIXED model (see below).
#pairs (corresponding parenthesis in the secondary structure)
#are handled by the MODEL2 of the MIXED model (see balow)
Heterogeneous data models = auto
{\\DATAFILE}
''' % {
        'datafile': datafile
        }

    model = '''#Set up a MIXED model with REV for loops and 7D for stems
{MODEL}
Model = MIXED
Number of models = 2
{MODEL1}
Model = REV
Discrete gamma distribution of rates = yes
Number of gamma categories = 6
Invariant sites = no
{\\MODEL1}
{MODEL2}
Model = RNA7D
Discrete gamma distribution of rates = yes
Number of gamma categories = 6
Invariant sites = no
{\\MODEL2}
{\\MODEL}'''

    tree = '''
#A TREE block
{TREE}
#You must specify an outgroup although it is used for representation
#purpose only and it does not affect the results.
#This outgroup must be the name of a species in your datafile or the name
#of a clade in your clusters file (see below).
Outgroup = %(outgroup)s
#See manual for the available heuristic/exhaustive search method.
Search algorithm = Stepwise addition
#Optional: we specify a file that contains monophyletic clades. Tree topologies
#that do not match these constraints are not evaluated.
#Clusters file = sequence-data/hiv6.cls
{\\TREE}
''' % {
        'outgroup': outgroup_name
        }

    run_cfg = '''
Random seed=9
Output file = %(outfile)s
''' % {
        'outfile': outfile
        }

    all_text = '\n'.join([data, model, tree, run_cfg])
    # The context manager flushes and closes the file; the original left the
    # handle open.
    with open(cfgfile, 'w') as fopen:
        fopen.write(all_text)
def get_muts_structs():
    '''
    Load the two objects pickled under 'RNAfoldz/'.

    Returns (out, out_tree) read from 'RNAfoldz/out.pickle' and
    'RNAfoldz/out_tree.pickle'.

    The original opened both files with mode 'w', which truncates them,
    closed them before reading, and called pickle.load with undefined names.
    The files are now opened for reading in the default text mode, matching
    a writer that uses mode 'w'.
    '''
    # pickle is only appropriate here because these are locally written cache
    # files; never point this at untrusted data.
    with open(cfg.dataPath('RNAfoldz/out.pickle')) as ofile:
        out = pickle.load(ofile)
    with open(cfg.dataPath('RNAfoldz/out_tree.pickle')) as otfile:
        out_tree = pickle.load(otfile)
    return out, out_tree
def get_remote_runs(run_range):
    '''
    Unpickle up to the first 20 'ra2_' batch outputs.

    run_range: accepted for interface compatibility; it is not used, and the
               slice is fixed at the first 20 entries of the listing.

    Files are taken in os.listdir order from the 'batch/outputs' data
    directory. Returns the list of unpickled objects.
    '''
    outdir = cfg.dataPath('batch/outputs')
    files = [os.path.join(outdir, f)
             for f in os.listdir(outdir)
             if 'ra2_' in f]

    outs = []
    for f in files[0:20]:
        # Close each handle as soon as it is read instead of leaving up to 20
        # files open until garbage collection.
        with open(f) as fopen:
            outs.append(pickle.load(fopen))
    return outs
def _write_mcmc_ctl(run_id, outgroup_name):
    '''
    Write a control file for the mcmc tree builder in phase.

    The file is written to 'phase/<run_id>/control.mcmc'.

    run_id:        names the per-run directory holding datafile.rna,
                   outfile.phylip and the control file.
    outgroup_name: substituted into the TREE block as the outgroup.

    The file is assembled from five text sections (data, model, tree,
    perturbation, run settings) joined by newlines. Closing tags such as
    {\\DATAFILE} contain a literal backslash, written here as an explicit
    '\\\\' escape.
    '''
    datafile = cfg.dataPath('phase/{0}/datafile.rna'.format(run_id))
    outfile = cfg.dataPath('phase/{0}/outfile.phylip'.format(run_id))
    cfgfile = cfg.dataPath('phase/{0}/control.mcmc'.format(run_id))

    data = '''
#A standard DATAFILE block for RNA sequences having a secondary structure.
#see also the sequence file sequence-data/mammals69.rna
{DATAFILE}
Data file = %(datafile)s
Interleaved data file = no
#Use the "automatic method" to analyse this dataset:
#unpaired nucleotides ('.' in the secondary structure) are
#handled by the MODEL1 of the MIXED model (see below).
#pairs (corresponding parenthesis in the secondary structure)
#are handled by the MODEL2 of the MIXED model (see balow)
Heterogeneous data models = auto
{\\DATAFILE}
''' % {
        'datafile': datafile
        }

    model = '''
#Set up a MIXED model with REV for loops and 7D for stems
{MODEL}
Model = MIXED
Number of models = 2
{MODEL1}
Model = REV
Discrete gamma distribution of rates = yes
Number of gamma categories = 6
Invariant sites = no
{\\MODEL1}
{MODEL2}
Model = RNA7D
Discrete gamma distribution of rates = yes
Number of gamma categories = 6
Invariant sites = no
{\\MODEL2}
{\\MODEL}
'''

    tree = '''
#Use a standard unrooted tree. The outgroup is compulsory but do not affect the results.
{TREE}
Tree = Unrooted MCMC tree
Outgroup = %(outgroup)s
{\\TREE}
''' % {
        'outgroup': outgroup_name
        }

    perturbation = '''
#Tuning parameters for the MCMC runs.
{PERTURBATION}
#relative proposals probabilities between the tree and the substitution model
Tree, proposal priority = 8
Model, proposal priority = 1
{PERTURBATION_TREE}
#We use 10/40 for topology change vs branch length changes.
#It is not exactly equivalent to 1/4 because this is also given relative to the
#proposal priority for hyperparameters that are introduced with the
#the prior on branch lengths (Hyperpriors, proposal priority)
Topology changes, proposal priority = 10
Branch lengths, proposal priority = 40
Hyperpriors, proposal priority = 1
#We use a vague prior exp(lambda) on branch lengths rather than the default exp(10)
Branch lengths, prior = exponential(uniform(0,100))
#A lambda hyperparameter has been introduced. It needs a "proposal priority"
#but this is not used because it is the only hyperparameter
Branch lengths exponential hyperparameter, proposal priority = 1
{\\PERTURBATION_TREE}
{PERTURBATION_MODEL}
#relative probabilities for the proposals on the two models and the average substitution rate of MODEL2
Model 1, proposal priority = 10
Model 2, proposal priority = 10
Average rates, proposal priority = 1
{PERTURBATION_MODEL1}
Frequencies, proposal priority = 2
Rate ratios, proposal priority = 1
Gamma parameter, proposal priority = 1
{\\PERTURBATION_MODEL1}
{PERTURBATION_MODEL2}
Frequencies, proposal priority = 2
Rate ratios, proposal priority = 1
Gamma parameter, proposal priority = 1
{\\PERTURBATION_MODEL2}
{\\PERTURBATION_MODEL}
{\\PERTURBATION}
'''

    run_cfg = '''
Random seed = 11
Burnin iterations = 750
Sampling iterations = 150
Sampling period = 150
Output file = %(outfile)s
Output format = phylip
''' % {
        'outfile': outfile
        }
    #OLD VALUES:
    #Burnin iterations = 750000
    #Sampling iterations = 1500000
    #Sampling period = 150

    all_text = '\n'.join([data, model, tree, perturbation, run_cfg])
    # The context manager flushes and closes the file; the original left the
    # handle open.
    with open(cfgfile, 'w') as fopen:
        fopen.write(all_text)
    return
return elif cluster_type == 'just_list': final_structs, final_energies = select_exemplars_from_list(structs,struct_counts,seq, draw = draw) if draw: try: print 'DRAWING final subopts' verts = struct_verts(final_structs, seq, rfid ) show_subopts(final_structs, verts, final_energies) f = plt.gcf() f.savefig(cfg.dataPath('figs/RNAfoldz/exemplars_{0}.ps'.format(savename))) except Exception, e: print "EXCEPTION!" pass fopen = open(cfg.dataPath('RNAfoldz/subopts_{0}.pickle'.format(savename)),'w') return final_structs,final_energies, seq pickle.dump({'structs':final_structs, 'energies':final_energies, 'seq':seq}, fopen) def select_exemplars_from_clustering(structs,struct_counts,seq, draw = False): min_count = 2 freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count] if len(freq_structs) < 10: min_count = 1 freq_structs = [s for i, s in enumerate(structs) if struct_counts[i] >= min_count] struct_counts= [s for i, s in enumerate(struct_counts) if s >= min_count] structs = freq_structs