def launch_many(run_id): ''' Generate script paramaters and launch a bunch of bsub jobs. Designed to be run on the cluster via an interactive shell. Note: If this is not run on cluster, since it does not look up a remote url for files, it won't be able to find expression data. ''' print 'Launching all jobs!' #MAKE INPUTS expr_filenames = ['soheil/expression_c4d_n4_tt_{0}.mat'.format(ttnum) for ttnum in range(70)] + ['soheil/expression_c4d_n4_intercluster.mat'] urls = [ cfg.dataURL(f) for f in expr_filenames ] remote_exprnames =[ cfg.dataPath(url) for url in urls ] inp_dicts = [dict(out_iter_num = out_iter_num, in_iter_num = in_iter_num, k = k, beta = beta, f_mix = f_mix, f_sim = f_sim, f_en_in = f_en_in, f_en_out = f_en_out, th_cor = th_cor, trunc_value = trunc_value, degree_bound = degree_bound, filename = filename) for out_iter_num in array([25],double) for in_iter_num in array([100],double) for k in array([6],double) for beta in array([4],double) for f_mix in array([2],double) for f_sim in array([.8],double) for f_en_in in array([1.],double) for f_en_out in array([1.],double) for th_cor in array([.6],double) for trunc_value in array([3],double) for degree_bound in array([3],double) for filename in remote_exprnames ] #MAKE EYEBALL eyeball = bsub.eyeball(run_id, os.path.abspath(inspect.stack()[0][1]), inp_dicts, func = 'run_single', name = 'mcmc_', mem = 3) #LAUNCH EYEBALL JOBS eyeball.launch() #RETURN A LIST OF LAUNCHED JOBS return dict(cmds=eyeball.cmds, inputs = inp_dicts)
def sort_prefixes(volume_name="cb"): prefix_path = config.dataPath(config.dataURL("genbank/prefixes")) for p in os.listdir(prefix_path): f = os.path.join(prefix_path, p) fopen = open(f) lines = fopen.readlines() lsort = sorted(lines) fopen.close() fopen = open(f, "w") fopen.writelines(lsort) fopen.close() print p
def split_prefixes(volume_name="cb"): """splits the massive genebank accession list up by prefixes. takes a volume name as a parameter in case the accesion list is stored in an atypical location""" path = config.dataPath(config.dataURL("genbank/gb_acclist.genbank", volume_name=volume_name)) path_home = os.path.dirname(path) fopen = open(path) prefixes = {} count = 0 long_count = 0 for l in fopen.xreadlines(): if l[1].isdigit(): pend = 1 elif l[2].isdigit(): pend = 2 elif l[3].isdigit(): pend = 3 elif l[4].isdigit(): pend = 4 elif l[5].isdigit(): pend = 5 elif l[6].isdigit(): pend = 6 else: raise Exception() prefix = l[0:pend] if not prefixes.has_key(prefix): print "getting pre" prefixes[prefix] = open(os.path.join(path_home, "prefixes/" + prefix), "a") f = prefixes[prefix] f.write(l) count += 1 if count > 100000: count = 0 long_count += 1 print prefix, l if long_count > 10: print prefixes long_count = 0 while prefixes: f = prefixes.pop(prefixes.keys()[0]) f.close() for k, p in prefixes.iteritems(): p.close()
def fill_all_rdb16s(reset = True): paths = [] for r, ds, fs in os.walk(config.dataPath('alignments/16s')): for f in fs: if '.gbk' in f: paths.append(os.path.join(r,f)) cbdb = compbio.projects.cbdb dbi = cbdb.getName('16s', tables = get_tables(), reset = np.mod(reset, 2)) last_ofs = 0 for p in paths: fopen = open(p) a = dbi.Alignment(file_name =config.dataURL(p)) dbi.Session.add(a) dbi.Session.commit() count = 0 for rec in SeqIO.parse(fopen, 'genbank'): try: src_taxon = rec.features[0].qualifiers['db_xref'][0][6:] except Exception, e: src_taxon = None ann = sjson.dumps(rec.annotations, default = lambda x: x.__str__()) seq = dbi.Sequence(name = rec.name, file_name = p, file_offset = last_ofs, sequence = rec.seq.__str__(), gb_accession = rec.id, gb_accession_version = 1, gb_id = None, annotations = ann, alignment = a, source_taxon = src_taxon ) dbi.Session.add(seq) last_ofs = fopen.tell() if np.mod(count, 1000) == 0: print count, p, seq.source_organism dbi.Session.commit() count += 1 dbi.Session.commit()
def search_sorted(prefix_name, query, volume_name="cb"):
    """Performs a binary search within a sorted per-prefix file to find
    the genbank id for a given query accession.

    Returns the third comma-separated column of the matching line,
    stripped.  Raises Exception when the query is not present.
    """
    prefix_file = os.path.join(
        config.dataPath(config.dataURL("genbank", volume_name=volume_name)),
        "prefixes/" + prefix_name)
    size = os.path.getsize(prefix_file)
    fopen = open(prefix_file)
    try:
        start = 0
        stop = size
        hplast = 0
        while 1:
            halfpt = (start + stop) // 2  # explicit floor division
            fopen.seek(halfpt)
            if halfpt == 0:
                line = fopen.readline()
            else:
                # Seeking lands mid-line; discard the partial line and
                # read the next complete one.
                fopen.readline()
                line = fopen.readline()
            c0 = line.split(",")[0]
            if c0 == query:
                return line.split(",")[2].strip()
            elif c0 < query:
                start = halfpt
            else:
                stop = halfpt
            # No progress between iterations means the query is absent.
            # BUGFIX: the original message read "Query not for:".
            if halfpt == hplast:
                raise Exception("Query not found: %s" % query)
            hplast = halfpt
            if start == stop:
                raise Exception("Query not found: %s" % query)
    finally:
        fopen.close()  # original leaked the handle
#!/usr/bin/env python import compbio.config as cfg import sys def usage(): print ''' Usage: pydatapath.py path volume ''' if __name__ == '__main__': path = sys.argv[1] if len(sys.argv) > 1 else '' volume = sys.argv[2] if len(sys.argv) > 2 else '' host = '' if path == 'usage': usage() exit(0) localpath = sys.argv[1] if len(sys.argv) > 1 else '' path = cfg.dataPath(cfg.dataURL(path, volume_name = volume)) sys.stdout.write(path) exit(0)
import os import compbio.config as config for r, d, fs in os.walk(config.dataPath(config.dataURL("unseen_data"))): for f in fs: if ".stk" in f: print f
def fill_db( name = 'bacterial_genomes', reset = False, postgres = False, host = 'broad'): dbi = cbdb.getName( name, postgres = postgres, tables = get_tables(), reset = np.mod(reset, 2), host = host) paths = [] for r,ds, fs in os.walk('/Volumes/ganymede/all.gbk/'): for f in fs: if 'gbk' in f: paths.append(os.path.join(r, f)) count = 0 for p in paths: if count < 1668: count += 1 continue count += 1 fopen = open(p) for rec in SeqIO.parse(fopen, 'genbank'): f0 = rec.features[0] if f0.type == 'source': source_taxon = f0.qualifiers['db_xref'][0][6:] source_organism=f0.qualifiers['organism'][0] else: source_taxon = None source_organism = None fa_seqpath = 'genomes/'+rec.id+'.fa' fa_sequrl = config.dataURL(fa_seqpath) fa_seqfile = config.dataPath(fa_sequrl) fopen = open(fa_seqfile,'w') SeqIO.write(rec,fopen, 'fasta') fopen.close() adds = [] genome = dbi.Genome(name = rec.name, seq_url =fa_sequrl, source_taxon = source_taxon, source_organism = source_organism, gb_accession = rec.id, annotations = rec.annotations.__str__()) #adds.append(genome) print 'adding genome ' + source_organism dbi.Session.add(genome) print 'commiting update ' dbi.Session.commit() print 'genome added! 
' for f in rec.features: feature = dbi.Feature(type = f.type, start = f.location.start.position, start_ext = f.location.start.extension, end = f.location.end.position, end_ext = f.location.end.extension, strand = f.strand, genomeobj = genome) #print 'adding feature ' + f.type #dbi.Session.add(feature) adds.append(feature) for k,v in f.qualifiers.iteritems(): q = dbi.Qualifier(key = k, value = v.__str__(), featureobj = feature) #dbi.Session.add(q) adds.append(q) for sf in f.sub_features: sub = dbi.SubFeature(type = sf.type, start = sf.location.start.position, start_ext = sf.location.start.extension, end =sf.location.end.position, end_ext = sf.location.end.extension, strand = sf.strand, featureobj = feature) adds.append(sub) #dbi.Session.add(sub) for k,v in sf.qualifiers.iteritems(): q = dbi.Qualifier(key = k, value = v.__str__(), subfeatureobj = sf) #Session.add(q) adds.append(q) dbi.Session.add_all(adds) if np.mod(count, 2) == 0: print count #print count, p , seq.source_organism print 'committing update' dbi.Session.commit() print 'update commited!' dbi.Session.commit()