def generateSomeSpeciesLists(ftmp, fnontmp, species='MYCPN'):
    """Build and save the TMP / non-TMP accession lists.

    TMP entries carry keyword ``KW-0812`` and have an ``_id`` matching
    ``species``. Non-TMP entries have keywords, none of which is
    ``KW-0812``, and an ``_id`` matching ``'HUMAN'``.

    :param ftmp: target handed to ``saveList`` for the TMP accessions
    :param fnontmp: target handed to ``saveList`` for the non-TMP accessions
    :param species: regex applied to ``_id`` in the TMP query
    :return: tuple ``(tmplist, nontmplist)``
    """

    def first_accession(doc):
        # 'accession' is either a single value or a list; keep the first.
        acc = doc['accession']
        return acc[0] if isinstance(acc, list) else acc

    dao = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': False, 'accession': True}

    tmp_query = {
        "keyword.@id": 'KW-0812',
        '_id': {'$regex': species},
    }
    tmplist = [
        first_accession(doc)
        for doc in dao.QueryObj(tmp_query, projection=fields)
    ]

    # NOTE(review): this query is pinned to 'HUMAN' and ignores ``species``
    # -- confirm that the asymmetry is intentional.
    nontmp_query = {
        "keyword.@id": {'$ne': 'KW-0812'},
        'keyword': {'$exists': True},
        '_id': {'$regex': 'HUMAN'},
    }
    nontmplist = [
        first_accession(doc)
        for doc in dao.QueryObj(nontmp_query, projection=fields)
    ]

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
def save(aclist, dbnam, fnotSave=None):
    """Upsert proteins from ``uniprot.seqtmppi_positive`` into ``uniprot.<dbnam>``.

    Each accession is looked up with ``queryProtein`` and upserted into the
    target collection keyed on ``accession``.

    :param aclist: iterable of accessions, e.g. ``['P06685', 'P06686']``
    :param dbnam: name of the target collection, e.g. ``'seqtmppi_test1'``
    :param fnotSave: optional target handed to ``saveList`` for the
        accessions reported as not saved
    :return: list of accessions whose upsert matched an existing document
    """
    do_all = DataOperation('uniprot', 'seqtmppi_positive')
    do_new = DataOperation('uniprot', dbnam)
    notsaveList = []
    for ac in aclist:
        protein = queryProtein(ac, do_all)
        if protein is None:
            # Unknown accession: reported, but not added to the returned list.
            print('not save protein %s' % ac)
            continue
        result = do_new.UpSertOne({'accession': ac}, protein)
        # NOTE(review): a non-zero matched_count presumably means the
        # accession was already present in the target -- confirm against
        # DataOperation.UpSertOne.
        if result.matched_count != 0:
            print('not save %s' % ac)
            notsaveList.append(ac)
    print('not save %d' % len(notsaveList))
    if fnotSave:
        saveList(notsaveList, fnotSave)
    return notsaveList
def generateCoQLists(fcoQ):
    """Collect the accessions of all entries tagged ``KW-0830`` and save them.

    :param fcoQ: target handed to ``saveList``
    :return: list with one accession per matching entry
    """
    dao = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': True, 'accession': True}
    coQlist = []
    for doc in dao.QueryObj({"keyword.@id": 'KW-0830'}, projection=fields):
        acc = doc['accession']
        # 'accession' is either a single value or a list; keep the first.
        coQlist.append(acc[0] if isinstance(acc, list) else acc)
    print('query %d coQ' % (len(coQlist)))
    saveList(coQlist, fcoQ)
    return coQlist
def generateCoQInfo(finfo):
    """Collect ``(accession, id-suffix)`` pairs for entries tagged ``KW-0830``.

    The second tuple element is the part of ``_id`` after the first
    underscore (``_id.split('_')[1]``).

    :param finfo: target handed to ``saveList``
    :return: list of ``(accession, id-suffix)`` tuples
    """
    dao = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': True, 'accession': True}
    coQinfo = []
    for doc in dao.QueryObj({"keyword.@id": 'KW-0830'}, projection=fields):
        acc = doc['accession']
        if isinstance(acc, list):
            # Several accessions for one entry: keep the first.
            acc = acc[0]
        suffix = doc['_id'].split('_')[1]
        coQinfo.append((acc, suffix))
    print('query %d coQ info' % (len(coQinfo)))
    saveList(coQinfo, finfo)
    return coQinfo
def findKeyProtein(fin, fout, keyword):
    """Keep the accessions from ``fin`` that carry ``keyword`` and save them.

    An accession is appended once per matching document, so it can appear
    more than once in the output.

    :param fin: source read with ``readIDlist``
    :param fout: target handed to ``saveList``
    :param keyword: keyword id to match, e.g. ``'KW-0297'``
    """
    candidates = readIDlist(fin)
    hits = []
    dao = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': True}
    for ac in candidates:
        query = {'accession': ac, "keyword.@id": keyword}
        for doc in dao.QueryObj(query, projection=fields):
            hits.append(ac)
            print(doc)
    saveList(hits, fout)
def getALlGprotein(fout):
    """Find every entry whose protein name looks like a G protein and save it.

    An entry matches when its recommended or alternative full name matches
    the regex ``'G protein '`` or ``'Guanine nucleotide-binding protein'``.
    Each result is stored (and printed) as ``(_id, [names...])``, where the
    names are whatever ``handleSubcelluarLeaf`` yields for the entry's
    ``protein`` field.

    :param fout: target handed to ``saveList``
    """
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {
        '_id': True,
        'protein.recommendedName.fullName': True,
        'protein.alternativeName.fullName': True,
    }
    name_fields = (
        "protein.recommendedName.fullName",
        "protein.alternativeName.fullName",
    )
    patterns = ('G protein ', 'Guanine nucleotide-binding protein')
    # Equivalent Mongo shell query:
    #   {$or: [{<field>: {$regex: <pattern>}}, ...]}
    # for each name field and each pattern.
    dic = {
        '$or': [
            {field: {'$regex': pattern}}
            for field in name_fields
            for pattern in patterns
        ]
    }

    GPCRlist = []
    for r in do.QueryObj(dic, projection=projection):
        # Build the entry once and reuse it for both the list and the log.
        names = list(handleSubcelluarLeaf(
            r['protein'],
            keys=['alternativeName', 'recommendedName', 'fullName']))
        entry = (r['_id'], names)
        GPCRlist.append(entry)
        print(entry)
    saveList(GPCRlist, fout)
def getallProtein():
    """Yield every id stored in the ``UNIPROID`` field of the target table.

    Reads all documents of ``table_target`` in ``db_name``, skips those with
    an empty ``UNIPROID`` and splits the rest with ``multiSplit``.

    :return: generator of whitespace-stripped id strings
    """
    dao = DataOperation(db_name, table_target)
    fields = {'_id': True, 'UNIPROID': True}
    for record in dao.GetALL(projection=fields, limit=0):
        raw_ids = record['UNIPROID']
        if raw_ids != '':
            for piece in multiSplit(raw_ids):
                yield piece.strip()
def funcPfam(x):
    """Fill ``x[1]`` and ``x[2]`` with the top Pfam hit for ``x[0]``.

    :param x: mutable row-like object; ``x[0]`` is passed to ``queryPfam``
        (presumably an accession such as ``'P03372'``)
    :return: the same ``x``, modified in place
    """
    dao = DataOperation('uniprot', 'uniprot_sprot')
    top_hit = queryPfam(x[0], dao, tophit=True)
    # Copy the first two fields of the hit into columns 1 and 2, in order.
    for position in (0, 1):
        x[position + 1] = top_hit[position]
    return x
def getPairInfo_TMP_nonTMP(fin, fout, sep='\t', checkTMP=True, keepOne=False):
    '''
    Write one info row per protein pair: TMP fields, then non-TMP fields.

    Pairs for which ``getTaN`` returns ``None`` are skipped. Every value is
    followed by a tab, so each row ends with a trailing tab before the
    newline.

    :param fin: pair file read with ``getPairs`` (no title row), e.g.
        Q7BCK4  B6JN06
        E7QG89  B2FN41
    :param fout: output file. Per the original author's note each protein
        contributes the fields
        ['accession', 'name', 'length', 'noX', 'inlenRange',
         'subcellularLocations', 'seq'], TMP first and non-TMP second, e.g.
        Q7BCK4  ICSA_SHIFL  1102  True  True
            ['Periplasm', 'Secreted', 'Cell surface', 'Cell outer membrane']
            MNQIHKFFCNMTQ...  B6JN06  G6PI_HELP2  545  True  True
            ['Cytoplasm']  MLTQLKTYPKLLK...
    :param sep: column separator of ``fin``
    :param checkTMP: forwarded to ``getTaN``
    :param keepOne: forwarded to ``getTaN``
    :return: None

    Example:
        fin = 'file/_1pair.txt'
        fout = 'file/_2pair_info.txt'
        getPairInfo_TMP_nonTMP(fin, fout)
    '''
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(fout, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = getTaN(pa, pb, do, checkTMP=checkTMP, keepOne=keepOne)
            if result is None:
                continue
            tmp = ensomblePortein(result[0])
            nontmp = ensomblePortein(result[1])
            for info in (tmp, nontmp):
                for v in info.values():
                    fo.write(str(v) + '\t')
            fo.write('\n')
            # Flush per row so partial output survives an interrupted run.
            fo.flush()
def getSingleInfo(fin, fout, fin_type='single', col=None):
    """Write one tab-separated info row per protein listed in ``fin``.

    Proteins that cannot be found, or whose sequence fails
    ``Protein.checkProtein(seq, 50, 2000, uncomm=True)``, are skipped.
    Every value is followed by a tab, so rows end with a trailing tab.

    :param fin: input file; an id list (``fin_type='single'``) or a
        headerless table of pairs (``fin_type='pair'``)
    :param fout: output file
    :param fin_type: ``'single'`` or ``'pair'``
    :param col: for ``'pair'`` input, the column labels holding the ids;
        defaults to ``[0, 1]``
    :raises ValueError: if ``fin_type`` is neither ``'single'`` nor ``'pair'``
    """
    if fin_type == 'pair':
        columns = [0, 1] if col is None else col
        df = pd.read_table(fin, header=None)[columns]
        # Flatten the selected columns into one set of unique ids.
        proteins = set(df.to_numpy().ravel())
    elif fin_type == 'single':
        proteins = readIDlist(fin)
    else:
        # Previously fell through and failed later with a NameError.
        raise ValueError(
            "fin_type must be 'single' or 'pair', got %r" % (fin_type,))

    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True,
    }
    prod = Protein()
    with open(fout, 'w') as fo:
        for AC in proteins:
            pro = queryProtein(AC, do, projection=projection)
            if not pro:
                # Unknown accession: skip instead of crashing on None.
                continue
            pro['accession'] = AC
            if not prod.checkProtein(
                    pro['sequence']['#text'], 50, 2000, uncomm=True):
                continue
            proinfo = ensomblePortein(pro)
            for v in proinfo.values():
                fo.write(str(v) + '\t')
            fo.write('\n')
            fo.flush()
def getallProtein(fout):
    """Save the accession of every document in ``DrugKB.protein``.

    :param fout: target handed to ``saveList``
    :return: None
    """
    dao = DataOperation('DrugKB', 'protein')
    fields = {'_id': True, 'accession': True}
    proteinlist = []
    for doc in dao.GetALL(projection=fields, limit=0):
        acc = doc['accession']
        # 'accession' is either a single value or a list; keep the first.
        proteinlist.append(acc[0] if isinstance(acc, list) else acc)
    saveList(proteinlist, fout)
def getPairTag(fin, f1pairWithTag_info, sep='\t'):
    '''
    Write one row per protein pair: both proteins' info plus the pair tag.

    Pairs for which ``tagPair`` returns ``None`` are skipped. Each row holds
    the values of protein A, then of protein B (every value followed by a
    tab), then the tag.

    :param fin: pair file read with ``getPairs`` (no title row)
    :param f1pairWithTag_info: output file
    :param sep: column separator of ``fin``
    :return: None

    Tag values, per the original author's note:
        0, TMP_SP
        1, TMP_TMP
        2, SP_SP
    '''
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(f1pairWithTag_info, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = tagPair(pa, pb, do)
            if result is None:
                continue
            proA = ensomblePortein(result[0])
            proB = ensomblePortein(result[1])
            for info in (proA, proB):
                for v in info.values():
                    fo.write(str(v) + '\t')
            fo.write(str(result[2]))
            fo.write('\n')
            # Flush per row so partial output survives an interrupted run.
            fo.flush()
def getTmp(fin, fout):
    '''
    Filter a protein list down to the entries that ``checkTaN`` accepts.

    Accessions that are not found, or whose entry has no ``keyword`` field,
    are skipped.

    :param fin: protein list, read with ``readIDlist``
    :param fout: output file, one accepted (TMP) accession per line
    :return: None
    '''
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'_id': True, 'keyword.@id': True}
    aclist = readIDlist(fin)
    with open(fout, 'w') as fo:
        for AC in aclist:
            qa = do.QueryOne('accession', AC, projection=projection)
            if qa is None:
                continue
            # Check for the field before indexing it: entries without
            # keywords used to raise KeyError here.
            if 'keyword' not in qa or not checkTaN(qa['keyword']):
                continue
            print('%s TMP' % AC)
            fo.write('%s\n' % AC)
            fo.flush()
def test_queryProtein(self):
    """Query accession ``P34397`` and pass the hit to ``ensomblePortein``."""
    dao = DataOperation('uniprot', 'uniprot_sprot')
    AC = 'P34397'
    protein = None
    fields = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True,
    }
    # One accession can map to several protein entries; the last one is kept.
    hits = dao.Query('accession', AC, projection=fields)
    for seen, doc in enumerate(hits, start=1):
        if seen > 1:
            # One accession matched more than one protein
            # (original note: save this list).
            print('%s is more than one entry' % AC)
        protein = doc
    protein['accession'] = AC
    ensomblePortein(protein)
def findGProtein(fin, fout):
    """Keep the accessions from ``fin`` whose name looks like a G protein.

    An entry matches when its recommended or alternative full name matches
    the regex ``'G protein '`` or ``'Guanine nucleotide-binding protein'``.
    An accession is appended once per matching document.

    :param fin: source read with ``readIDlist``
    :param fout: target handed to ``saveList``
    """
    candidates = readIDlist(fin)
    hits = []
    dao = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': True}
    name_clauses = [
        {field: {'$regex': pattern}}
        for field in ("protein.recommendedName.fullName",
                      "protein.alternativeName.fullName")
        for pattern in ('G protein ', 'Guanine nucleotide-binding protein')
    ]
    matched = 0
    for ac in candidates:
        query = {'accession': ac, '$or': name_clauses}
        for doc in dao.QueryObj(query, projection=fields):
            matched += 1
            hits.append(ac)
            print(matched, doc)
    saveList(hits, fout)
def generateCriterLists(ftmp, fnontmp):
    """Build and save TMP / non-TMP accession lists with known locations.

    Both queries require ``comment.subcellularLocation.location.#text`` to
    exist. TMP entries carry keyword ``KW-0812``; non-TMP entries have
    keywords, none of which is ``KW-0812``.

    :param ftmp: target handed to ``saveList`` for the TMP accessions
    :param fnontmp: target handed to ``saveList`` for the non-TMP accessions
    :return: tuple ``(tmplist, nontmplist)``
    """

    def first_accession(doc):
        # 'accession' is either a single value or a list; keep the first.
        acc = doc['accession']
        return acc[0] if isinstance(acc, list) else acc

    location_key = 'comment.subcellularLocation.location.#text'
    dao = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': False, 'accession': True}

    tmp_query = {
        "keyword.@id": 'KW-0812',
        location_key: {'$exists': True},
    }
    tmplist = [
        first_accession(doc)
        for doc in dao.QueryObj(tmp_query, projection=fields)
    ]

    nontmp_query = {
        "keyword.@id": {'$ne': 'KW-0812'},
        'keyword': {'$exists': True},
        location_key: {'$exists': True},
    }
    nontmplist = [
        first_accession(doc)
        for doc in dao.QueryObj(nontmp_query, projection=fields)
    ]

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
def getFasta(fin_all_protein, fout):
    """Write the sequences of the listed proteins to a FASTA file.

    Accessions for which ``queryProtein`` returns nothing are skipped.
    Each record uses the accession as its id and an empty description.

    :param fin_all_protein: protein list, read with ``readIDlist``
    :param fout: path of the FASTA file to write
    """
    # Imported locally, as in the original, so the module loads without
    # Biopython installed.
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    proteins = readIDlist(fin_all_protein)
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'sequence.#text': True}

    records = []
    for AC in proteins:
        pro = queryProtein(AC, do, projection=projection)
        if not pro:
            continue
        records.append(
            SeqRecord(Seq(pro['sequence']['#text']), id=AC, description=''))

    # Write through one handle; the file used to be opened twice (once
    # here and once more inside SeqIO.write via the path).
    with open(fout, 'w') as fo:
        SeqIO.write(records, fo, 'fasta')
import time

from DatabaseOperation2 import DataOperation
from ProteinDealer import Protein
from dao import queryProtein, ensomblePortein

if __name__ == '__main__':
    # Ad-hoc smoke run: one Pfam cross-reference query and one protein
    # summary, with wall-clock timing.
    print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()

    do = DataOperation('uniprot', 'uniprot_sprot')

    pa = 'Q14118'
    pfam_projection = {'dbReference.text': True, '_id': False}
    dic = {'accession': pa, 'dbReference.@type': 'pfam'}
    # The keyword was misspelled ``projcetion=``.
    # NOTE(review): assumes QueryObj here takes ``projection=`` and returns
    # an iterable of documents -- confirm against DatabaseOperation2.
    qa = do.QueryObj(dic, projection=pfam_projection)
    for q in qa:
        dbReference = q['dbReference']

    projection = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True,
    }
    AC = 'P82432'
    pro = queryProtein(AC, do, projection=projection)
    pro['accession'] = AC
    proinfo = ensomblePortein(pro)

    print('stop', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    print('time', time.time() - start)
def proteinPfam(fin, fout, tophit=True, item='Pfam'):
    """Annotate every row of a headerless CSV via ``funcPfam`` and save as TSV.

    :param fin: input CSV without a header row
    :param fout: output file, tab-separated, without header or index
    :param tophit: forwarded to ``funcPfam``
    :param item: forwarded to ``funcPfam``
    """
    dao = DataOperation('uniprot', 'uniprot_sprot')

    def annotate(row):
        # Runs once per row (axis=1).
        return funcPfam(row, dao, tophit=tophit, item=item)

    table = pd.DataFrame(pd.read_csv(fin, header=None))
    annotated = table.apply(annotate, axis=1)
    annotated.to_csv(fout, sep='\t', header=None, index=None)
for q in qa: dbReference = q['dbReference'] # {'@type': 'Pfam', '@id': 'PF18424', 'property': [{'@type': 'entry name', '@value': 'a_DG1_N2'} # {'@type': 'Pfam', '@id': 'PF12743', 'property': [{'@type': 'entry name', '@value': 'ESR1_C'}, {'@type': 'match status', '@value': '1'}]} result = getPfam(dbReference) return result if not tophit else result[0] def addProteinPfam(): pass def funcPfam(x): return x if __name__ == '__main__': do = DataOperation('uniprot', 'uniprot_sprot') projection = { '_id': True, 'sequence.@length': True, 'sequence.#text': True, 'keyword.@id': True, 'comment.subcellularLocation.location': True } AC = 'Q13206' pro = queryProtein(AC, do, projection=projection) pro['accession'] = AC proinfo = ensomblePortein(pro)