def getDataFromUniprot(fasta, accs):
    """Load the UniProt XML cache that sits next to *fasta* into a dict.

    The cache file is ``fasta + ".uniprot"``.  If its last line is not
    ``</uniprot>`` the closing tag is appended first, so that a cache left
    unterminated can still be parsed.  Every record produced by
    ``UniprotIterator`` is wrapped in a ``Protein`` and stored under
    ``record.name``.

    When *accs* is not None, the accessions that are not keys of the
    resulting dict are printed.  If the module-level flag ``fetchme`` is
    truthy they are fetched with ``fetchUniprotEntries`` and the cache is
    re-read once with ``accs=None`` (so a second fetch is never attempted).

    NOTE(review): membership is tested against ``record.name`` keys; confirm
    that the values in *accs* live in the same namespace as ``record.name``.

    Args:
        fasta: path prefix of the FASTA file; ".uniprot" is appended to it.
        accs: iterable of accessions expected in the cache, or None to skip
            the missing-entry check.

    Returns:
        dict mapping ``record.name`` to ``Protein`` instances.
    """
    global fetchme  # module-level switch, only read here
    uniprot_file = fasta + ".uniprot"
    print(uniprot_file)

    # tail() is a helper defined elsewhere; guard against it returning an
    # empty list (or an empty/None entry) instead of indexing blindly.
    tail_lines = tail(uniprot_file, 1)
    lastline = ""
    if tail_lines and tail_lines[0]:
        lastline = tail_lines[0].rstrip()
    print("::::::::::: " + lastline)

    # Repair a cache whose root element was never closed.
    if lastline != "</uniprot>":
        with open(uniprot_file, "a") as f:
            f.write("</uniprot>\n")

    proteins = {}
    # Use a context manager so the handle is closed even if parsing fails.
    with open(uniprot_file) as handle:
        for record in UniprotIterator(handle):
            print("%s %s" % (record.id, record.name))
            proteins[record.name] = Protein(xmlrecord=record)

    if accs is not None:
        # Dict membership is O(1); scanning proteins.keys() is O(n) per test
        # on Python 2.
        accs2get = [acc for acc in accs if acc not in proteins]
        if accs2get:
            print("fetching: ")
            print(accs2get)
            if fetchme:
                fetchUniprotEntries(fasta, accs2get)
                # Re-read the enlarged cache; accs=None stops the recursion.
                return getDataFromUniprot(fasta, None)

    print("There are total of " + str(len(proteins)) + " entries mapped in uniprot")
    return proteins
def fetchUniprotEntries(fasta, newaccs=None):
    """Fetch UniProt XML entries in batches and append them to the cache.

    The cache file is ``fasta + ".uniprot"``.  If it already exists and ends
    with ``</uniprot>``, that closing line is removed so new entries can be
    appended inside the root element.  Accessions are fetched 50 at a time
    through ``FetchUtils.returnFetchBatch``.  The XML declaration and the
    opening ``<uniprot ...>`` tag are kept only for the first chunk actually
    written to an empty/new cache, and the closing tag is kept only for the
    last chunk, so the concatenation forms a single document.

    A chunk that fails is reported and skipped; the remaining chunks are
    still attempted.  If the last chunk fails the file is left without a
    closing tag (``getDataFromUniprot`` appends a missing one when reading).

    Args:
        fasta: path prefix of the FASTA file; ".uniprot" is appended to it.
        newaccs: accessions to fetch.  When None, they are obtained from
            ``getACCsFromFastaFile(fasta)``.

    Returns:
        The accessions that were requested (``newaccs`` or the ones read
        from the FASTA file).
    """
    print("Fetching uniprot entries..")
    uniprot_file = fasta + ".uniprot"

    if os.path.isfile(uniprot_file):
        # tail() is a helper defined elsewhere; guard against an empty result.
        tail_lines = tail(uniprot_file, 1)
        if tail_lines and tail_lines[0] and tail_lines[0].rstrip() == "</uniprot>":
            # Drop the closing root tag so new entries land inside <uniprot>.
            with open(uniprot_file, "r") as f:
                lines = f.readlines()
            with open(uniprot_file, "w") as f:
                f.writelines(lines[:-1])

    # The header is needed exactly once per file: track whether the cache
    # already holds content instead of relying on the chunk index, otherwise
    # a failed first chunk would leave a new file without its XML header.
    header_written = (os.path.isfile(uniprot_file)
                      and os.path.getsize(uniprot_file) > 0)

    fasta_ids = newaccs
    if newaccs is None:
        fasta_ids = getACCsFromFastaFile(fasta)
    print(fasta_ids)

    if fasta_ids:
        open_tag = re.compile(r"\<uniprot.+")
        xml_decl = re.compile(r"\<\?xml version=.+\>")
        close_tag = re.compile(r"\</uniprot.*>")

        # getChunks() is handed index positions, as in the original code.
        indices = list(range(len(fasta_ids)))
        chunks = FetchUtils.getChunks(indices, 50)
        n_chunks = len(chunks)

        for i, chunk in enumerate(chunks):
            print("Fetching " + str(i) + " chunk out of " + str(n_chunks))
            sys.stdout.flush()
            accs = [fasta_ids[j] for j in chunk]
            # fetchBatch <dbName> <id1,id2,...> [formatName [styleName]] [options...]
            try:
                uniprots = FetchUtils.returnFetchBatch(
                    'uniprot', ','.join(accs), 'xml', 'default')
                if header_written:
                    uniprots = open_tag.sub('', uniprots)
                    uniprots = xml_decl.sub('', uniprots)
                if i != n_chunks - 1:
                    uniprots = close_tag.sub('', uniprots)
                with open(uniprot_file, "a") as f:
                    f.write(uniprots)
                header_written = True
            except Exception as err:
                # Best effort: report and move on to the next chunk, but let
                # KeyboardInterrupt/SystemExit propagate.
                print("Coundnot fetch this chunk")
                print(err)
            else:
                print("succeeded in fetching chunk")

    return fasta_ids