def composeTMP_nonTMP(ftmp, fnontmp, fpair, num):
    '''
    Write ``num`` randomly combined (TMP, nonTMP) ID pairs to ``fpair``.

    When both ``ftmp`` and ``fnontmp`` exist they are read with
    ``readIDlist``; otherwise the two lists come from
    ``generateCriterLists(ftmp, fnontmp)`` (per the original notes this
    queries the database -- confirm against that helper).

    :param ftmp: path of the TMP ID list
    :param fnontmp: path of the nonTMP ID list
    :param fpair: output path; one tab-separated pair per line
    :param num: total number of pairs to write
    :return: None
    :raises ValueError: if ``num`` is positive and either ID list is empty
    '''
    if os.access(ftmp, os.F_OK) and os.access(fnontmp, os.F_OK):
        tmplist, nontmplist = readIDlist(ftmp), readIDlist(fnontmp)
    else:
        tmplist, nontmplist = generateCriterLists(ftmp, fnontmp)

    # An empty list would make the sample size 0 and the modulo below would
    # raise ZeroDivisionError *after* fpair had already been truncated.
    if num > 0 and (len(tmplist) == 0 or len(nontmplist) == 0):
        raise ValueError(
            'cannot compose %d pairs: %d TMP ids and %d nonTMP ids available'
            % (num, len(tmplist), len(nontmplist)))

    sampL1 = min(len(tmplist), num)
    sampL2 = min(len(nontmplist), num)
    # Shuffled index subsets; when a list is shorter than num its indices are
    # reused through the modulo below.
    L1 = random.sample(range(0, len(tmplist)), sampL1)
    L2 = random.sample(range(0, len(nontmplist)), sampL2)
    with open(fpair, 'w') as fo:
        for idx in range(num):
            # The random offsets keep row idx from always pairing the same
            # two sampled positions.
            tmp_id = tmplist[L1[(idx + random.randint(2, 9)) % sampL1]]
            nontmp_id = nontmplist[L2[(idx + random.randint(1, 5)) % sampL2]]
            fo.write('%s\t%s\n' % (tmp_id, nontmp_id))
            fo.flush()
            print(idx)
def getSingleInfo(fin, fout, fin_type='single', col=[0, 1]):
    '''
    Query UniProt for every protein named in ``fin`` and write one
    tab-separated info row per qualifying protein to ``fout``.

    :param fin: input file; an ID list (``fin_type='single'``) or a
        header-less table of pairs (``fin_type='pair'``)
    :param fout: output path; each row is the values of
        ``ensomblePortein(pro)``, every value followed by a tab
    :param fin_type: ``'single'`` or ``'pair'``
    :param col: columns of the pair table that hold protein IDs
        (only used when ``fin_type='pair'``); never modified
    :return: None
    :raises ValueError: if ``fin_type`` is neither ``'single'`` nor ``'pair'``
    '''
    if fin_type == 'pair':
        # list(col) hands pandas a private copy of the shared default list.
        df = pd.read_table(fin, header=None)[list(col)]
        # Flatten all selected columns into one set of unique IDs.
        proteins = set(df.to_numpy().reshape(1, -1)[0])
    elif fin_type == 'single':
        proteins = readIDlist(fin)
    else:
        # Previously fell through and failed later on an unbound `proteins`.
        raise ValueError(
            "fin_type must be 'single' or 'pair', got %r" % (fin_type,))

    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True
    }
    prod = Protein()
    with open(fout, 'w') as fo:
        for AC in proteins:
            pro = queryProtein(AC, do, projection=projection)
            if not pro:
                # Same convention as getFasta: skip IDs the query cannot resolve
                # instead of crashing on the item assignment below.
                continue
            pro['accession'] = AC
            # 50 / 2000 are passed to checkProtein together with the sequence
            # (presumably length bounds -- confirm against Protein.checkProtein).
            if not prod.checkProtein(
                    pro['sequence']['#text'], 50, 2000, uncomm=True):
                continue
            proinfo = ensomblePortein(pro)
            for v in proinfo.values():
                fo.write(str(v))
                fo.write('\t')
            fo.write('\n')
            fo.flush()
def queryPathway_Gene(foutPathway_Gene, fpathway=None, fpathwayInfo=None, hsa='hsa', start_idx=65):
    '''
    Append the genes of each pathway to ``foutPathway_Gene``.

    :param foutPathway_Gene: output file, appended to once per pathway.
        Columns (examples from the original notes):
        0  hsa00010   pathway id
        1  3101       gene id
        2  HK3        gene symbol
        3  [K00844]   KO id
        4  [2.7.1.1]  EC number
    :param fpathway: pathway ID list; read with ``readIDlist`` when the file
        exists, otherwise forwarded to ``queryAllPathway``
    :param fpathwayInfo: forwarded to ``queryAllPathway``
    :param hsa: forwarded to ``queryAllPathway``
    :param start_idx: index of the first pathway to process; earlier ones are
        skipped. Defaults to 65, the value that used to be hard-coded
        (NOTE(review): looks like a left-over resume point -- pass 0 to
        process every pathway).
    :return: None
    '''
    # os.access(None, ...) raises TypeError, so guard the documented default.
    if fpathway is not None and os.access(fpathway, os.F_OK):
        repair_pathways = readIDlist(fpathway)
    else:
        repair_pathways = queryAllPathway(fpathway=fpathway,
                                          fpathwayInfo=fpathwayInfo,
                                          hsa=hsa)
    for idx, pathway in enumerate(repair_pathways):
        print(idx, end='.')
        if idx < start_idx:
            continue
        geneID_geneName_KO_EC = [
            (pathway, gene_id, gene_symbol, KO, EC)
            for gene_id, gene_symbol, KO, EC in extractGeneFromPathway(pathway)
        ]
        # Saved per pathway in append mode so an interrupted run keeps its progress.
        saveList(geneID_geneName_KO_EC, foutPathway_Gene, file_mode='a')
def extractFasta(self, fin_fasta, fin_idlist, fout_fasta, in_multi=True, out_multi=True):
    """
    Copy the entries listed in ``fin_idlist`` from ``fin_fasta`` to ``fout_fasta``.

    :param fin_fasta: source FASTA, loaded with ``self.getDict``
    :param fin_idlist: file of IDs to keep, read with ``readIDlist``
    :param fout_fasta: destination FASTA, written with ``self.dict2fasta``
    :param in_multi: passed to ``self.getDict`` as ``multi``
    :param out_multi: passed to ``self.dict2fasta`` as ``multi``
    :raises KeyError: if a listed ID is not present in the source FASTA
    """
    source_entries = self.getDict(fin_fasta, multi=in_multi)
    wanted_ids = readIDlist(fin_idlist)
    selected = {acc: source_entries[acc] for acc in wanted_ids}
    self.dict2fasta(selected, fout_fasta, multi=out_multi)
def fullyComposeTMP_nonTMP(ftmp, fnontmp, fpair):
    '''
    Write every (TMP, nonTMP) combination to ``fpair``.

    When both ``ftmp`` and ``fnontmp`` exist they are read with
    ``readIDlist``; otherwise the two lists come from
    ``generateCriterLists(ftmp, fnontmp)``.

    :param ftmp: path of the TMP ID list
    :param fnontmp: path of the nonTMP ID list
    :param fpair: output path; one tab-separated pair per line
    :return: None
    '''
    lists_on_disk = os.access(ftmp, os.F_OK) and os.access(fnontmp, os.F_OK)
    if lists_on_disk:
        tmp_ids = readIDlist(ftmp)
        nontmp_ids = readIDlist(fnontmp)
    else:
        print('Cretira not found, qury from Mongodb...')
        tmp_ids, nontmp_ids = generateCriterLists(ftmp, fnontmp)

    with open(fpair, 'w') as pair_file:
        for tmp_id in tmp_ids:
            for nontmp_id in nontmp_ids:
                print(tmp_id, nontmp_id)
                row = '%s\t%s\n' % (tmp_id, nontmp_id)
                pair_file.write(row)
                pair_file.flush()
def findKeyProtein(fin, fout, keyword):
    """
    Save the accessions from ``fin`` that carry the UniProt keyword ``keyword``.

    Each accession is queried together with ``keyword.@id``; it is appended
    to the result once per document the query returns, and every returned
    document is printed.

    :param fin: ID list file, read with ``readIDlist``
    :param fout: output path, written with ``saveList``
    :param keyword: keyword id to match, e.g. 'KW-0297'
    """
    candidates = readIDlist(fin)
    hits = []
    uniprot = DataOperation('uniprot', 'uniprot_sprot')
    only_id = {'_id': True}
    for accession in candidates:
        query = {'accession': accession, "keyword.@id": keyword}
        for doc in uniprot.QueryObj(query, projection=only_id):
            hits.append(accession)
            print(doc)
    saveList(hits, fout)
def getFasta(fin_all_protein, fout):
    """
    Write the sequences of the proteins listed in ``fin_all_protein`` to
    ``fout`` in FASTA format.

    IDs for which ``queryProtein`` returns nothing are skipped.

    :param fin_all_protein: ID list file, read with ``readIDlist``
    :param fout: output FASTA path (truncated before the queries start)
    :return: None
    """
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    proteins = readIDlist(fin_all_protein)
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'sequence.#text': True}
    with open(fout, 'w') as fo:
        records = []
        for AC in proteins:
            pro = queryProtein(AC, do, projection=projection)
            if not pro:
                continue
            records.append(
                SeqRecord(Seq(pro['sequence']['#text']), id=AC, description=''))
        # Write through the handle that is already open; passing the path
        # again made SeqIO open the same file a second time while `fo` was
        # still holding it.
        SeqIO.write(records, fo, 'fasta')
def findGProtein(fin, fout):
    """
    Save the accessions from ``fin`` whose recommended or alternative full
    name matches 'G protein ' or 'Guanine nucleotide-binding protein'.

    Each accession is appended once per document the query returns; a
    running match count and the document are printed for every hit.

    :param fin: ID list file, read with ``readIDlist``
    :param fout: output path, written with ``saveList``
    """
    name_fields = ("protein.recommendedName.fullName",
                   "protein.alternativeName.fullName")
    name_patterns = ('G protein ', 'Guanine nucleotide-binding protein')

    candidates = readIDlist(fin)
    hits = []
    uniprot = DataOperation('uniprot', 'uniprot_sprot')
    only_id = {'_id': True}
    matched = 0
    for accession in candidates:
        # Fresh clause dicts per query: every field paired with every pattern,
        # field-major order.
        name_clauses = [{field: {'$regex': pattern}}
                        for field in name_fields
                        for pattern in name_patterns]
        query = {'accession': accession, '$or': name_clauses}
        for doc in uniprot.QueryObj(query, projection=only_id):
            matched += 1
            hits.append(accession)
            print(matched, doc)
    saveList(hits, fout)
def _write_pair(handle, first, second):
    # Write one tab-separated pair and flush, so partial results are on disk
    # if a long run is interrupted.
    handle.write(first + '\t' + second + '\n')
    handle.flush()


def getTmp_SpPair(tmpf,spf,finPair,foutPair,type1='TMP',type2='SP',crossover = False):
    """
    Split the pairs in ``finPair`` into category files by list membership.

    ``tmpf`` and ``spf`` are ID lists (type1 and type2 respectively). Every
    line of ``finPair`` holds two tab-separated IDs A and B. Output files are
    created next to ``foutPair``:

    * ``foutPair``                 one member type1, the other type2 (type1 first)
    * ``<type2>_<type2>.txt``      both type2 (and not already written above)
    * ``<type1>_<type1>.txt``      both type1 (and not already written above)
    * ``non<type1>_<type2>.txt``   A not type1 and B type2
    * ``<type1>_non<type2>.txt``   A type1 and B not type2
    * ``<type1>_non<type1>.txt``   exactly one member type1 (type1 first)
    * ``non<type1>_non<type1>.txt`` neither member type1
    * ``non<type2>_<type2>.txt``   exactly one member type2 (non-type2 first)
    * ``non<type2>_non<type2>.txt`` neither member type2
    * ``drop.txt``                 created but never written (kept for
      compatibility with the previous output layout)

    Lines without a second tab-separated column are printed and skipped.
    Finally ``handledir(<output dir>, countpair)`` is called.

    Example (criteria lists from the original notes, 2020-07-01)::

        tmpf = '/home/jjhnenu/data/PPI/release/criteria/allcession_KW-0812_131609.list'
        spf = '/home/jjhnenu/data/PPI/release/criteria/allcession_soluble_614454.list'
        getTmp_SpPair(tmpf, spf, finPair, foutPair)

    :param tmpf: path of the type1 (TMP) ID list
    :param spf: path of the type2 (SP) ID list
    :param finPair: input pair file, two tab-separated IDs per line
    :param foutPair: main output file; must contain a '/' or a backslash
        because its directory part is taken by string slicing
    :param type1: label used for the first category in file names and logs
    :param type2: label used for the second category in file names and logs
    :param crossover: currently unused (the logic that read it is disabled)
    :return: None
    :raises ValueError: if ``foutPair`` contains no path separator
    """
    tmplist = readIDlist(tmpf)
    splist = readIDlist(spf)
    print(len(tmplist),len(splist))
    # Sets make each membership test O(1); the lists can hold hundreds of
    # thousands of IDs and are probed four times per input line.
    # (assumes the IDs are hashable strings)
    tmp_ids = set(tmplist)
    sp_ids = set(splist)

    splitmark = '/' if '/' in foutPair else '\\'
    mypath = foutPair[:foutPair.rindex(splitmark)+1]

    fSP_SP = mypath+'%s_%s.txt'%(type2,type2)
    fTMP_TMP = mypath+'%s_%s.txt'%(type1,type1)
    fnonTMP_SP = mypath + 'non%s_%s.txt' % (type1, type2)
    fTmp_nonSP = mypath + '%s_non%s.txt' % (type1, type2)
    fTMP_nonTmp = mypath+'%s_non%s.txt'%(type1,type1)
    fnonTmp_nonTmp = mypath+'non%s_non%s.txt'%(type1,type1)
    fnonsp_sp = mypath+'non%s_%s.txt'%(type2,type2)
    fnonsp_nonsp = mypath+'non%s_non%s.txt'%(type2,type2)
    fdrop = mypath+'drop.txt'

    with open(finPair, 'r') as fin, \
            open(foutPair, 'w') as fout, \
            open(fSP_SP, 'w') as foutsp, \
            open(fTMP_TMP, 'w') as fouttmp, \
            open(fnonTMP_SP, 'w') as foutnonTMP_SP, \
            open(fTmp_nonSP, 'w') as foutTmp_nonSP, \
            open(fTMP_nonTmp, 'w') as foutTMP_nonTmp, \
            open(fnonTmp_nonTmp, 'w') as foutnonTmp_nonTmp, \
            open(fnonsp_sp, 'w') as fout_nonsp_sp, \
            open(fnonsp_nonsp, 'w') as fout_nonsp_nonsp, \
            open(fdrop, 'w'):
        for line in fin:
            # Strip the line terminator before splitting: slicing the last
            # character off column 2 truncated the ID whenever the line had
            # no trailing newline or carried extra columns.
            pair = line.rstrip('\r\n').split('\t')
            if len(pair) < 2:
                # Malformed line: report it and skip, instead of classifying
                # it with the IDs left over from the previous line.
                print(pair)
                continue
            a, b = pair[0], pair[1]

            Atmp = a in tmp_ids
            Asp = a in sp_ids
            Btmp = b in tmp_ids
            Bsp = b in sp_ids

            # --- type1/type2 cross pairs and same-type pairs ---------------
            if Atmp and Bsp:
                _write_pair(fout, a, b)
                print('%s %s save this pair' % (a, b))
            elif Asp and Btmp:
                # Swap so the type1 member is always written first.
                _write_pair(fout, b, a)
                print('%s %s save this pair' % (a, b))
            elif Asp and Bsp:
                _write_pair(foutsp, a, b)
                print('%s %s save this pair %s_%s' % (a, b,type2,type2))
            elif Atmp and Btmp:
                _write_pair(fouttmp, a, b)
                print('%s\t%s save this pair %s_%s' % (a, b,type1,type1))

            # --- first member vs. type1, second member vs. type2 -----------
            if not Atmp and Bsp:
                _write_pair(foutnonTMP_SP, a, b)
                # Label fixed: this branch writes the non<type1>_<type2> file.
                print('%s %s save this pair non%s_%s' % (a, b, type1, type2))
            elif Atmp and not Bsp:
                _write_pair(foutTmp_nonSP, a, b)
                print('%s %s save this pair %s_non%s' % (a, b, type1, type2))

            # --- type1 membership only --------------------------------------
            if Atmp and not Btmp:
                _write_pair(foutTMP_nonTmp, a, b)
                print('%s %s save this pair %s_non%s' % (a, b,type1,type1))
            elif not Atmp and Btmp:
                _write_pair(foutTMP_nonTmp, b, a)
                print('%s %s save this pair %s_non%s' % (a, b,type1,type1))
            elif not Atmp and not Btmp:
                _write_pair(foutnonTmp_nonTmp, a, b)
                print('%s %s save this pair non%s_non%s' % (a, b,type1,type1))

            # --- type2 membership only --------------------------------------
            if not Asp and Bsp:
                _write_pair(fout_nonsp_sp, a, b)
                print('%s %s save this pair non%s_%s' % (a, b,type2,type2))
            elif Asp and not Bsp:
                _write_pair(fout_nonsp_sp, b, a)
                print('%s %s save this pair non%s_%s' % (a, b,type2,type2))
            elif not Asp and not Bsp:
                _write_pair(fout_nonsp_nonsp, a, b)
                print('%s %s save this pair non%s_non%s' % (a, b,type2,type2))

    # All output files are closed at this point.
    handledir(mypath, countpair)
    print('get%s_%sPair end'%(type1,type2))
# Title     : tesKEGGAPI.py
# Created by: [email protected]
# Created on: 2021/2/4 15:23
# des : download the KEGG flat-file record of every listed pathway
import os

from common import readIDlist

if __name__ == '__main__':
    from Bio.KEGG import REST

    dirout = 'file/6bioAnalysis/keggDB/pathwayInfo'
    # Create the output directory up front so the first write cannot fail
    # with FileNotFoundError.
    os.makedirs(dirout, exist_ok=True)

    # The pathway IDs come from the local list; the former
    # REST.kegg_list("pathway", "hsa") request was dropped because its
    # result was never used.
    repair_pathways = readIDlist('file/6bioAnalysis/keggDB/1pathway_human.tsv')

    for idx, pathway in enumerate(repair_pathways):
        print(idx, pathway)
        # Query and read each pathway record.
        pathway_file = REST.kegg_get(pathway).read()
        with open(os.path.join(dirout, '%s.txt' % pathway), 'w') as fo:
            fo.write(pathway_file)
def handlePair(foutdir, sep=',', dbname=None, checkTMP=True, jumpStep=None, fin=None, f2tmp_nonTtmp_info_qualified=None, keepOne=False):
    '''
    Run the TMP / nonTMP pair pipeline and write all results into ``foutdir``.

    Original note: the data set is small, so proteins are queried row by
    row and many of them are queried more than once.

    :param foutdir: directory that receives every output file
    :param sep: separator of the ``fin`` file
    :param dbname: name of the mongodb used in step 4
    :param jumpStep: steps to skip, e.g. [1, 2, 3, 4]; None runs everything
    :param fin: input pair file; ignored when 1 is in ``jumpStep``
    :param f2tmp_nonTtmp_info_qualified: path of the qualified-pair table;
        defaults to '2tmp_nontmp_info_qualified.tsv' inside ``foutdir``
    :param keepOne: forwarded to ``getPairInfo_TMP_nonTMP``
    :return: None

    Example::

        fin = 'file/1intAct_pair_norepeat.txt'
        foutdir = 'file/1positive'
        handlePair(foutdir, fin=fin)
    '''
    def in_outdir(filename):
        return os.path.join(foutdir, filename)

    def should_run(step):
        return jumpStep is None or step not in jumpStep

    # ---- output paths ------------------------------------------------------
    f1tmp_nontmp_info = in_outdir('1tmp_nontmp_info.tsv')
    f1TMP_nonTMP = in_outdir('1tmp_nontmp.tsv')
    if not f2tmp_nonTtmp_info_qualified:
        f2tmp_nonTtmp_info_qualified = in_outdir('2tmp_nontmp_info_qualified.tsv')
    fout_fasta = in_outdir('2pair.fasta')
    fout_tmp_fasta = in_outdir('2tmp.fasta')
    fout_nontmp_fasta = in_outdir('2nontmp.fasta')
    f2positive = in_outdir('2pair.tsv')
    f2tmp = in_outdir('2tmp.list')
    f2nontmp = in_outdir('2nontmp.list')
    f2all = in_outdir('2all.list')
    f2tmp_info = in_outdir('2tmp_info.tsv')
    f2nontmp_info = in_outdir('2nontmp_info.tsv')
    f2all_info = in_outdir('2all_info.tsv')
    f3subcell = in_outdir('3subcellular.tsv')

    # ---- 1. get tmp / nontmp pairs (original note: time 1766.4131457805634)
    if should_run(1):
        getPairInfo_TMP_nonTMP(fin,
                               f1tmp_nontmp_info,
                               sep=sep,
                               checkTMP=checkTMP,
                               keepOne=keepOne)
        simplifyTable(f1tmp_nontmp_info, f1TMP_nonTMP)

    # ---- 2. keep the qualified tmp / nontmp pairs
    if should_run(2):
        saveQualified(f1tmp_nontmp_info, f2tmp_nonTtmp_info_qualified)

    # ---- 3. derive the related lists, fasta files and pair table
    if should_run(3):
        simplifyTable(f2tmp_nonTtmp_info_qualified, f2positive)
        extractPairAndFasta(f2tmp_nonTtmp_info_qualified,
                            fout_fasta,
                            fout_tmp_fasta=fout_tmp_fasta,
                            fout_nontmp_fasta=fout_nontmp_fasta)
        getproteinlist(f2tmp_nonTtmp_info_qualified,
                       ftmp=f2tmp,
                       fnontmp=f2nontmp,
                       fall=f2all,
                       ftmp_info=f2tmp_info,
                       ftmp_nontmp_info=f2nontmp_info,
                       fall_info=f2all_info)

    # ---- 4. save to mongodb
    # NOTE(review): dbname only gates this step when jumpStep is given; with
    # jumpStep=None the step runs even if dbname is None -- confirm intent.
    if jumpStep is None or (4 not in jumpStep and dbname):
        notsvaelist = save(readIDlist(f2all), dbname)
        print('those protein not save in the mongodb', notsvaelist)

    # ---- 5. compute subcellular information
    if should_run(5):
        handleRow(f2tmp_nonTtmp_info_qualified, f3subcell, calcuSubcell)