def save(aclist, dbnam, fnotSave=None):
    """
    Copy proteins looked up via ``DataOperation('uniprot', 'seqtmppi_positive')``
    into ``DataOperation('uniprot', dbnam)``.

    Every accession is fetched with ``queryProtein`` and upserted into the
    target under the filter ``{'accession': ac}``.

    :param aclist: iterable of accessions, e.g. ``['P06685', 'P06686']``
    :param dbnam: name of the target store, e.g. ``'seqtmppi_test1'``
    :param fnotSave: optional path; when truthy the returned list is also
        written there with ``saveList``
    :return: accessions whose upsert reported ``matched_count != 0``
        (presumably: a document with that accession already existed --
        confirm against ``DataOperation.UpSertOne``). Accessions for which
        ``queryProtein`` returned ``None`` are reported on stdout only and
        are not part of this list.
    """
    do_all = DataOperation('uniprot', 'seqtmppi_positive')
    do_new = DataOperation('uniprot', dbnam)
    notsaveList = []
    for ac in aclist:
        protein = queryProtein(ac, do_all)
        # Identity test: a returned object with a custom __eq__ must not be
        # mistaken for "nothing found".
        if protein is None:
            print('not save protein %s' % ac)
            continue
        result = do_new.UpSertOne({'accession': ac}, protein)
        if result.matched_count != 0:
            print('not save %s' % ac)
            notsaveList.append(ac)
    # The summary count is simply the size of the collected list.
    print('not save %d' % len(notsaveList))
    if fnotSave:
        saveList(notsaveList, fnotSave)
    return notsaveList
def pairToproteinList(fin, fout=''):
    """
    Get the list of non-repeating proteins from a protein-pair file.

    Every non-blank line of ``fin`` is split on tabs and its first two
    columns are taken as the pair; any further columns are ignored.

    :param fin: path of the tab-separated pair file
    :param fout: optional output path; when non-empty the resulting list is
        written there with ``saveList``
    :return: proteins in order of first appearance, without duplicates
    :raises IndexError: if a non-blank line has fewer than two columns
    """
    proteinlist = []
    seen = set()  # O(1) membership test instead of scanning the list
    with open(fin, 'r') as fi:
        for line in fi:
            # Strip only the line terminator, so a last line without a
            # trailing newline does not lose a real character.
            line = line.rstrip('\n')
            if not line.strip():
                continue
            pair = line.split('\t')
            for protein in (pair[0], pair[1]):
                if protein not in seen:
                    seen.add(protein)
                    proteinlist.append(protein)
    if fout:
        saveList(proteinlist, fout)
    print('pairToproteinList end')
    return proteinlist
def generateSomeSpeciesLists(ftmp, fnontmp, species='MYCPN'):
    """
    Query ``uniprot_sprot`` for two accession lists and save both.

    The first list holds entries carrying keyword ``KW-0812`` whose ``_id``
    matches the regex ``species``; the second holds entries that have
    keywords but not ``KW-0812`` and whose ``_id`` matches ``'HUMAN'``.

    :param ftmp: output path for the ``KW-0812`` list
    :param fnontmp: output path for the non-``KW-0812`` list
    :param species: regex applied to ``_id`` in the first query
    :return: tuple ``(tmplist, nontmplist)``
    """
    def first_accession(doc):
        # 'accession' may be a list or a single value; keep the first item.
        accession = doc['accession']
        if isinstance(accession, list):
            return accession[0]
        return accession

    uniprot = DataOperation('uniprot', 'uniprot_sprot')
    accession_only = {'_id': False, 'accession': True}

    tmp_filter = {"keyword.@id": 'KW-0812', '_id': {'$regex': species}}
    tmp_docs = uniprot.QueryObj(tmp_filter, projection=accession_only)
    tmplist = [first_accession(doc) for doc in tmp_docs]

    # NOTE(review): this query hard-codes 'HUMAN' instead of using
    # ``species`` -- confirm whether that is intentional.
    nontmp_filter = {
        "keyword.@id": {'$ne': 'KW-0812'},
        'keyword': {'$exists': True},
        '_id': {'$regex': 'HUMAN'},
    }
    nontmp_docs = uniprot.QueryObj(nontmp_filter, projection=accession_only)
    nontmplist = [first_accession(doc) for doc in nontmp_docs]

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
def queryPathway_Gene(foutPathway_Gene, fpathway=None, fpathwayInfo=None,
                      hsa='hsa', start_idx=65):
    '''
    Append the genes of every pathway to ``foutPathway_Gene``.

    The pathway list is read from ``fpathway`` when that file exists,
    otherwise it is fetched with ``queryAllPathway``. One batch of rows is
    appended per pathway, so an interrupted run keeps what was written.

    :param foutPathway_Gene: output path; each saved row is the tuple
        0 pathway id      e.g. hsa00010
        1 gene id         e.g. 3101
        2 gene symbol     e.g. HK3
        3 KO id           e.g. [K00844]
        4 EC number       e.g. [2.7.1.1]
    :param fpathway: optional path of a pathway-id list; passed on to
        ``queryAllPathway`` when the file does not exist
    :param fpathwayInfo: passed on to ``queryAllPathway``
    :param hsa: passed on to ``queryAllPathway``
    :param start_idx: pathways whose index is below this value are skipped.
        The default keeps the previously hard-coded resume point of 65;
        pass 0 to process every pathway.
    :return: None
    '''
    # os.access(None, ...) raises TypeError, so guard the default value.
    if fpathway and os.access(fpathway, os.F_OK):
        repair_pathways = readIDlist(fpathway)
    else:
        repair_pathways = queryAllPathway(fpathway=fpathway,
                                          fpathwayInfo=fpathwayInfo,
                                          hsa=hsa)
    for idx, pathway in enumerate(repair_pathways):
        print(idx, end='.')
        if idx < start_idx:
            continue
        geneID_geneName_KO_EC = [
            (pathway, gene_id, gene_symbol, KO, EC)
            for gene_id, gene_symbol, KO, EC in extractGeneFromPathway(pathway)
        ]
        saveList(geneID_geneName_KO_EC, foutPathway_Gene, file_mode='a')
def queryKid_pathway(fTmpKEGGCount, outListFile):
    """
    Look up pathways for every row of a header-less table and append them
    to ``outListFile``.

    :param fTmpKEGGCount: path read with ``pd.read_table(..., header=None)``;
        column 0 is passed to ``queryPathwayByKid``, column 1 is only printed
    :param outListFile: output path; non-empty results are appended with
        ``saveList(..., file_mode='a')``
    :return: None
    """
    table = pd.read_table(fTmpKEGGCount, header=None)
    for row_label, row in table.iterrows():
        print(row_label, 'query for ', row[0], row[1])
        pathways = queryPathwayByKid(row[0])
        # Nothing to write for an empty result.
        if pathways == []:
            continue
        saveList(pathways, outListFile, file_mode='a')
def queryAllPathway(fpathway=None, fpathwayInfo=None, hsa='hsa'):
    """
    Fetch the pathway list via ``REST.kegg_list("pathway", hsa)``.

    Each response line is expected to be ``<entry>\\t<description>``. A
    ``db:`` prefix on the entry (e.g. ``path:hsa00010``) is removed when
    present; entries without a prefix are kept as they are.

    :param fpathway: optional path; when truthy the pathway ids are saved
        there with ``saveList``
    :param fpathwayInfo: optional path; when truthy the
        ``(entry, description)`` tuples are saved there with ``saveList``
    :param hsa: second argument handed to ``REST.kegg_list``
    :return: list of pathway ids
    :raises ValueError: if a non-blank response line contains no tab
    """
    human_pathways = REST.kegg_list("pathway", hsa).read()
    repair_pathways = []
    repair_pathways_info = []
    for line in human_pathways.rstrip().split("\n"):
        # An empty response yields a single blank line; skip it.
        if not line.strip():
            continue
        # Split on the first tab only, so a description with tabs survives.
        entry, description = line.split("\t", 1)
        parts = entry.split(':')
        # Same element as before when a prefix exists, otherwise the bare id
        # (plain indexing with [1] raises IndexError on prefix-less entries).
        entry = parts[1] if len(parts) > 1 else parts[0]
        repair_pathways.append(entry)
        repair_pathways_info.append((entry, description))
    if fpathway:
        saveList(repair_pathways, fpathway)
    if fpathwayInfo:
        saveList(repair_pathways_info, fpathwayInfo)
    return repair_pathways
def findKeyProtein(fin, fout, keyword):
    """
    Keep the accessions from ``fin`` that carry a given keyword id.

    For every accession read from ``fin``, ``uniprot_sprot`` is queried for
    documents with that accession and ``keyword.@id == keyword``. The
    accession is recorded once per returned document, and each returned
    document is printed.

    :param fin: path of the accession list, read with ``readIDlist``
    :param fout: output path for the matching accessions
    :param keyword: keyword id to match, e.g. ``'KW-0297'``
    :return: None
    """
    accessions = readIDlist(fin)
    matched = []
    uniprot = DataOperation('uniprot', 'uniprot_sprot')
    id_only = {'_id': True}
    for accession in accessions:
        query = {'accession': accession, "keyword.@id": keyword}
        for hit in uniprot.QueryObj(query, projection=id_only):
            matched.append(accession)
            print(hit)
    saveList(matched, fout)
def getALlGprotein(fout):
    """
    Save every ``uniprot_sprot`` entry whose name looks like a G protein.

    An entry matches when its recommended or alternative full name matches
    the regex ``'G protein '`` or ``'Guanine nucleotide-binding protein'``.
    For each match the tuple ``(_id, names)`` is printed and collected,
    where ``names`` is the list produced by ``handleSubcelluarLeaf`` over
    the entry's ``protein`` field.

    :param fout: output path for the collected tuples
    :return: None
    """
    name_fields = ("protein.recommendedName.fullName",
                   "protein.alternativeName.fullName")
    name_patterns = ('G protein ', 'Guanine nucleotide-binding protein')
    dic = {
        '$or': [{field: {'$regex': pattern}}
                for field in name_fields
                for pattern in name_patterns]
    }
    projection = {
        '_id': True,
        'protein.recommendedName.fullName': True,
        'protein.alternativeName.fullName': True
    }
    do = DataOperation('uniprot', 'uniprot_sprot')
    GPCRlist = []
    for r in do.QueryObj(dic, projection=projection):
        # Build the entry once and reuse it for both the list and the log
        # line, rather than walking the name tree twice.
        entry = (r['_id'], list(handleSubcelluarLeaf(
            r['protein'],
            keys=['alternativeName', 'recommendedName', 'fullName'])))
        GPCRlist.append(entry)
        print(entry)
    saveList(GPCRlist, fout)
def getallProtein(fout):
    '''
    Save one accession per document returned by ``GetALL`` on
    ``DataOperation('DrugKB', 'protein')``.

    :param fout: output path for the accession list
    :return: None
    '''
    protein_dao = DataOperation('DrugKB', 'protein')
    wanted_fields = {'_id': True, 'accession': True}
    accessions = []
    for doc in protein_dao.GetALL(projection=wanted_fields, limit=0):
        accession = doc['accession']
        # Original note: one accession may map to several protein sequences.
        # When the field is a list only its first item is kept.
        if isinstance(accession, list):
            accession = accession[0]
        accessions.append(accession)
    saveList(accessions, fout)
def findGProtein(fin, fout):
    """
    Keep the accessions from ``fin`` whose name looks like a G protein.

    For every accession read from ``fin``, ``uniprot_sprot`` is queried for
    documents with that accession whose recommended or alternative full name
    matches the regex ``'G protein '`` or
    ``'Guanine nucleotide-binding protein'``. The accession is recorded once
    per returned document; a running hit count and the document are printed.

    :param fin: path of the accession list, read with ``readIDlist``
    :param fout: output path for the matching accessions
    :return: None
    """
    accessions = readIDlist(fin)
    matched = []
    uniprot = DataOperation('uniprot', 'uniprot_sprot')
    id_only = {'_id': True}
    # Same four name clauses for every accession, in the original order.
    name_clauses = [
        {field: {'$regex': pattern}}
        for field in ("protein.recommendedName.fullName",
                      "protein.alternativeName.fullName")
        for pattern in ('G protein ', 'Guanine nucleotide-binding protein')
    ]
    hit_count = 0
    for accession in accessions:
        query = {'accession': accession, '$or': name_clauses}
        for hit in uniprot.QueryObj(query, projection=id_only):
            hit_count += 1
            matched.append(accession)
            print(hit_count, hit)
    saveList(matched, fout)
def generateCriterLists(ftmp, fnontmp):
    """
    Query ``uniprot_sprot`` for two accession lists and save both.

    Both queries require ``comment.subcellularLocation.location.#text`` to
    exist. The first selects entries carrying keyword ``KW-0812``; the
    second selects entries that have keywords but not ``KW-0812``.

    :param ftmp: output path for the ``KW-0812`` list
    :param fnontmp: output path for the non-``KW-0812`` list
    :return: tuple ``(tmplist, nontmplist)``
    """
    def first_accession(doc):
        # 'accession' may be a list or a single value; keep the first item.
        accession = doc['accession']
        if isinstance(accession, list):
            return accession[0]
        return accession

    uniprot = DataOperation('uniprot', 'uniprot_sprot')
    accession_only = {'_id': False, 'accession': True}

    tmp_filter = {
        "keyword.@id": 'KW-0812',
        'comment.subcellularLocation.location.#text': {'$exists': True},
    }
    tmp_docs = uniprot.QueryObj(tmp_filter, projection=accession_only)
    tmplist = [first_accession(doc) for doc in tmp_docs]

    nontmp_filter = {
        "keyword.@id": {'$ne': 'KW-0812'},
        'keyword': {'$exists': True},
        'comment.subcellularLocation.location.#text': {'$exists': True},
    }
    nontmp_docs = uniprot.QueryObj(nontmp_filter, projection=accession_only)
    nontmplist = [first_accession(doc) for doc in nontmp_docs]

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
def writeProtins(fout):
    """
    Flatten the groups returned by ``getallProtein()`` and save them.

    :param fout: output path for the flattened protein list
    :return: None
    """
    # NOTE(review): the ``getallProtein`` visible in this file requires a
    # ``fout`` argument and returns nothing, so this call only works against
    # a zero-argument variant that returns an iterable of iterables --
    # confirm which function is meant here.
    proteins = []
    for protein in getallProtein():
        proteins.extend(protein)
    # Save the flattened list built above; previously it was discarded and
    # a second getallProtein() call was written out instead.
    saveList(proteins, fout)