def fetchUniprotEntries(fasta, newaccs=None):
    """Fetch UniProt XML entries in batches and append them to ``<fasta>.uniprot``.

    Accessions are requested 50 at a time through
    ``FetchUtils.returnFetchBatch('uniprot', ..., 'xml', 'default')``.  Each
    response is appended to the output file so that the file holds a single
    XML document: the XML declaration / ``<uniprot ...>`` opening line is kept
    only for the first text written to a new file, and the closing
    ``</uniprot>`` tag is kept only for the last chunk.

    If the output file already exists and ends with ``</uniprot>``, that line
    is removed before appending.  Whenever the run ends without a closing tag
    having been written (nothing to fetch, or the final chunk failed), the tag
    is appended again so the file is never left unterminated by this call.

    Args:
        fasta: Path of the FASTA file; the output path is ``fasta + ".uniprot"``.
        newaccs: Accessions to fetch.  When ``None`` they are obtained from
            ``getACCsFromFastaFile(fasta)``.

    Returns:
        The accessions that were requested (``newaccs``, or the value returned
        by ``getACCsFromFastaFile``).
    """
    print("Fetching uniprot entries..")
    uniprot_file = fasta + ".uniprot"

    # True once the output file is known to contain (or is assumed to already
    # contain) the XML header, so later chunks must have theirs stripped.
    header_written = False
    # Text to append at the very end if the document is left without its
    # closing root tag; None means the file is currently terminated.
    pending_close = None

    if os.path.isfile(uniprot_file):
        header_written = True
        # NOTE(review): tail() is defined elsewhere; it is assumed to return a
        # sequence holding the last line(s) of the file - confirm.
        lastline = tail(uniprot_file, 1)[0].rstrip()
        if lastline == "</uniprot>":
            # Re-open the document: drop the closing tag so new entries can be
            # appended, and remember it so it can be put back.
            with open(uniprot_file, 'r') as handle:
                lines = handle.readlines()
            pending_close = lines.pop()
            with open(uniprot_file, 'w') as handle:
                handle.writelines(lines)

    fasta_ids = newaccs
    if newaccs is None:
        fasta_ids = getACCsFromFastaFile(fasta)
    print(fasta_ids)

    if fasta_ids:
        chunks = FetchUtils.getChunks(list(range(len(fasta_ids))), 50)
        last_index = len(chunks) - 1
        for i, chunk in enumerate(chunks):
            print("Fetching " + str(i) + " chunk out of " + str(len(chunks)))
            sys.stdout.flush()
            accs = [fasta_ids[j] for j in chunk]
            # fetchBatch <dbName> <id1,id2,...> [formatName [styleName]] [options...]
            try:
                uniprots = FetchUtils.returnFetchBatch('uniprot', ','.join(accs), 'xml', 'default')
                if header_written:
                    uniprots = re.sub(r"\<uniprot.+", '', uniprots)
                    uniprots = re.sub(r"\<\?xml version=.+\>", '', uniprots)
                if i != last_index:
                    uniprots = re.sub(r"\</uniprot.*>", '', uniprots)
                with open(uniprot_file, "a") as f:
                    f.write(uniprots)
            except Exception as exc:
                # Best effort: a failed chunk is reported and skipped.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                print("Coundnot fetch this chunk")
                print(exc)
                continue

            print("succeeded in fetching chunk")
            if uniprots:
                header_written = True
                if "</uniprot>" in uniprots:
                    pending_close = None
                elif uniprots.endswith("\n"):
                    pending_close = "</uniprot>\n"
                else:
                    # Keep the tag on its own line so the tail check at the
                    # top of this function recognises it on the next run.
                    pending_close = "\n</uniprot>\n"

    if pending_close is not None:
        with open(uniprot_file, "a") as handle:
            handle.write(pending_close)

    return fasta_ids
# REST endpoints used for the two Ensembl-style reference types.
_ENSEMBL_CDS_URLS = {
    'ensembltranscript': 'http://rest.ensembl.org/sequence/id?object_type=transcript&type=cds',
    'ensemblgenomestranscript': 'http://rest.ensemblgenomes.org/sequence/id?object_type=transcript&type=cds',
}


def _fetchDbfetchChunks(reftype, pool, fname):
    """Append the batch-fetch result for every id in *pool* to *fname*, 100 ids per request."""
    chunks = FetchUtils.getChunks(list(range(len(pool))), 100)
    for i, chunk in enumerate(chunks):
        print(i)
        ids = [pool[j] for j in chunk]
        # fetchBatch <dbName> <id1,id2,...> [formatName [styleName]] [options...]
        print(','.join(ids))
        try:
            entries = FetchUtils.returnFetchBatch(reftype, str(','.join(ids)), 'default', 'default')
            with open(fname, "a") as f:
                if len(entries) > 0:
                    f.write(''.join(str(v) for v in entries))
                else:
                    print("Nothing to write")
        except suds.WebFault as detail:
            # A faulting chunk is reported and skipped; remaining chunks are still fetched.
            print("MY DETAILS!!!!!!!!!!!!!!!!!!!!:")
            print(detail)


def _cleanRefseqFile(fname):
    """Strip HTML anchors from *fname* in place and make sure it ends with a ``//`` line."""
    if not os.path.isfile(fname):
        # Nothing was fetched (empty pool or every request failed).
        return
    tmp_name = fname + "2"
    with open(fname, "r") as src, open(tmp_name, "w") as dst:
        lastline = None
        for line in src:
            match = re.match(r"(.+)<a href=.+>(\w+)</a>(.+)", line)
            if match:
                # Keep only the anchor text, dropping the surrounding <a ...> markup.
                line = match.group(1) + match.group(2) + match.group(3) + "\n"
            dst.write(line)
            lastline = line
        if lastline is not None and not lastline.endswith("\n"):
            dst.write("\n")
        # Compare without the line ending; otherwise "//\n" never equals "//"
        # and a duplicate terminator is appended on every run.
        if lastline is None or lastline.strip() != "//":
            dst.write("//\n")
        dst.flush()
        os.fsync(dst.fileno())
    if os.path.isfile(tmp_name):
        os.remove(fname)
        os.rename(tmp_name, fname)


def _fetchEnsemblCds(pool, fname, url):
    """POST the ids of *pool* to *url* with curl, 50 per request, appending FASTA records to *fname*."""
    chunks = FetchUtils.getChunks(list(range(len(pool))), 50)
    tmp_name = fname + "tmp.dnld2"
    with open(os.devnull, 'wb') as devnull:
        for i, chunk in enumerate(chunks):
            print("retrieveing: " + str(i) + "\n")
            ids = [pool[j] for j in chunk]
            # An argument list (no shell) needs no platform-specific quoting
            # and cannot be broken by shell metacharacters inside an id.
            curl_cmd = [
                "curl",
                "-H", "Accept: application/json",
                "-H", "Content-type: application/json",
                "-XPOST",
                "--data", json.dumps({"ids": ids}),
                url,
            ]
            print(' '.join(curl_cmd))
            with open(tmp_name, "w") as f:
                ret_code = subprocess.Popen(curl_cmd, stdout=f, stderr=devnull).wait()
            if ret_code != 0:
                print("curl failed with exit code " + str(ret_code) + "; skipping this chunk")
                continue
            with open(tmp_name, "r") as fo2:
                myres = json.loads(fo2.read())
            print(myres)
            if not isinstance(myres, list):
                # Anything other than a list of records cannot be indexed as
                # one (e.g. an error object); it was printed above, skip it.
                print("Unexpected response; skipping this chunk")
                continue
            if myres:
                with open(fname, "a") as f:
                    for record in myres:
                        f.write(">" + record["id"] + "\n" + record["seq"] + "\n")


def fetchEntries(reftype, pool, fname, add=True):
    """Download the entries of *pool* for *reftype* into *fname* and read them back.

    Supported values of *reftype*:

    * ``'refseqn'`` and ``'emblcds'`` - fetched through
      ``FetchUtils.returnFetchBatch`` in chunks of 100 ids.  For
      ``'refseqn'`` the file is then rewritten once with ``<a href=...>``
      anchors reduced to their text and a terminating ``//`` line ensured.
    * ``'ensembltranscript'`` and ``'ensemblgenomestranscript'`` - fetched by
      POSTing chunks of 50 ids to the matching REST endpoint with ``curl``
      and written as ``>id`` / sequence pairs.

    Any other value only prints a message.  Downloads are always appended to
    *fname*.

    NOTE(review): the original source had lost its indentation; the refseqn
    clean-up is assumed to run once after all chunks were fetched - confirm.

    Args:
        reftype: Name of the reference database (see above).
        pool: Sequence of id strings to fetch.
        fname: Output file path.  ``fname + "2"`` and ``fname + "tmp.dnld2"``
            are used as scratch files.
        add: When ``False`` and *fname* already exists, nothing is fetched.

    Returns:
        ``0`` when the existing file is kept untouched (``add == False``),
        otherwise the result of ``read_cdss(fname)``.

    Raises:
        OSError: If ``curl`` cannot be started for the Ensembl reference types.
    """
    # `== False` (not `not add`) is kept on purpose so that add=None still fetches.
    if os.path.isfile(fname) and add == False:
        print("File: " + fname + " data already exists. Reading sequences from file..")
        return 0  # read_cdss(fname)

    print("File: " + fname + " data does not exist or partial.\nRetrieving relevant data from EBI..")
    print(reftype)
    if reftype == 'refseqn':
        _fetchDbfetchChunks(reftype, pool, fname)
        _cleanRefseqFile(fname)
    elif reftype == 'emblcds':
        _fetchDbfetchChunks(reftype, pool, fname)
    elif reftype in _ENSEMBL_CDS_URLS:
        _fetchEnsemblCds(pool, fname, _ENSEMBL_CDS_URLS[reftype])
    else:
        print(reftype + " No such reftype.")
    return read_cdss(fname)