def getSingleInfo(fin, fout, fin_type='single', col=None):
    """Query every protein listed in ``fin`` and write one summary line each.

    For each accession the record is fetched with ``queryProtein``; records
    whose sequence fails ``Protein.checkProtein(seq, 50, 2000, uncomm=True)``
    are skipped. The remaining ones are passed to ``ensomblePortein`` and the
    values of the returned dict are written to ``fout`` in dict order, each
    value followed by a tab, one record per line.

    :param fin: input file. For ``fin_type='single'`` it is read with
        ``readIDlist``; for ``fin_type='pair'`` it is read with
        ``pandas.read_table`` (no header row).
    :param fout: path of the output text file (overwritten).
    :param fin_type: ``'single'`` or ``'pair'``.
    :param col: column labels holding the accessions when
        ``fin_type='pair'``; defaults to ``[0, 1]``.
    :raises ValueError: if ``fin_type`` is neither ``'single'`` nor ``'pair'``.
    """
    # Validate before any I/O. Previously an unknown fin_type fell through an
    # ``else: pass`` and crashed later on the unbound ``proteins`` name.
    if fin_type not in ('pair', 'single'):
        raise ValueError(
            "fin_type must be 'single' or 'pair', got %r" % (fin_type,))

    # ``None`` sentinel instead of a shared mutable default list.
    if col is None:
        col = [0, 1]

    if fin_type == 'pair':
        df = pd.read_table(fin, header=None)[col]
        # Flatten the selected columns and de-duplicate the accessions.
        dat = df.to_numpy().reshape(1, -1)
        proteins = set(dat[0])
    else:
        proteins = readIDlist(fin)

    # NOTE(review): arguments look like a database / collection name pair;
    # confirm against DataOperation.
    do = DataOperation('uniprot', 'uniprot_sprot')
    # Fields requested from the stored protein record.
    projection = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True,
    }
    prod = Protein()

    with open(fout, 'w') as fo:
        for AC in proteins:
            pro = queryProtein(AC, do, projection=projection)
            pro['accession'] = AC
            if not prod.checkProtein(
                    pro['sequence']['#text'], 50, 2000, uncomm=True):
                continue
            proinfo = ensomblePortein(pro)
            # Every value is followed by a tab (including the last one),
            # matching the original output format.
            fo.write(''.join(str(v) + '\t' for v in proinfo.values()))
            fo.write('\n')
            # Flush per record so the output file grows as queries complete.
            fo.flush()
def loaddata(ftmp_fa, fnontmp_fa):
    """Lazily yield a padded feature matrix for every accepted record pair.

    Each record of ``ftmp_fa`` is combined with each record of
    ``fnontmp_fa``. Records whose sequence fails
    ``Protein.checkProtein(seq, 50, 2000, uncomm=True)`` are skipped on
    either side. The second file is re-parsed for every accepted record of
    the first file.

    :param ftmp_fa: tmp fasta file,
        e.g. ``'file/8humanPredict/3tmp.fasta'``
    :param fnontmp_fa: nontmp fasta file,
        e.g. ``'file/8humanPredict/3nontmp.fasta'``
    :return: generator of ``(idpair, feature)`` where ``idpair`` is
        ``'<tmp id>-<nontmp id>'`` and ``feature`` is the result of
        ``BaseFeature.padding_PSSM`` called with ``shape=(2000, 25)``
    """
    encoder = FastaDealer()
    padder = BaseFeature()
    checker = Protein()

    def accepted(path):
        # Stream the records of one FASTA file that pass the protein check.
        for record in SeqIO.parse(path, 'fasta'):
            if checker.checkProtein(record.seq, 50, 2000, uncomm=True):
                yield record

    # NOTE: to resume after an interrupted run, skip records here until the
    # ID at which prediction should restart has been reached.
    for rec_a in accepted(ftmp_fa):
        feat_a = encoder.phsi_blos(rec_a.seq)
        for rec_b in accepted(fnontmp_fa):
            feat_b = encoder.phsi_blos(rec_b.seq)
            stacked = padder.padding_PSSM(
                feat_a, feat_b, vstack=True, shape=(2000, 25))
            yield '%s-%s' % (rec_a.id, rec_b.id), stacked
def getNpy(self, fin_fasta, out_dir, multi=True, checkprotein=True):
    """Encode each sequence of a FASTA file and save it as ``<ID>.npy``.

    :param fin_fasta: input FASTA file, iterated through ``self.getYield``
    :param out_dir: directory receiving the ``.npy`` files; it is passed to
        ``check_path`` first (presumably to make sure it exists -- confirm)
    :param multi: forwarded to ``self.getYield``
    :param checkprotein: when true, sequences failing
        ``Protein.checkProtein(seq, 50, 2000, uncomm=True)`` are skipped
    """
    check_path(out_dir)
    validator = Protein()
    for seq_id, sequence in self.getYield(fin_fasta, multi=multi):
        # The check is only evaluated when filtering was requested.
        if checkprotein and not validator.checkProtein(
                sequence, 50, 2000, uncomm=True):
            continue
        target = os.path.join(out_dir, "%s.npy" % seq_id)
        encoded = self.seq2num(sequence)
        # Empty encodings are not written to disk.
        if len(encoded) == 0:
            continue
        np.save(target, encoded)
def ensomblePortein(pro):
    """Flatten a queried protein record into an ordered summary dict.

    :param pro: mapping with the keys ``'accession'``, ``'_id'`` and
        ``'sequence'`` (itself holding ``'@length'`` and ``'#text'``), and
        optionally ``'comment'``
    :return: dict with the keys, in this order: ``accession``, ``name``,
        ``length``, ``noX`` (result of ``Protein.checkUncomm`` on the
        sequence), ``inlenRange`` (result of ``Protein.checkLengthRange``
        for the range 50..2000), ``subcellularLocations`` (empty list when
        the record has no ``'comment'``) and ``seq``
    """
    checker = Protein()
    # Key order matters to callers that serialise ``values()`` positionally.
    return {
        'accession': pro['accession'],
        'name': pro['_id'],
        'length': pro['sequence']['@length'],
        'noX': checker.checkUncomm(pro['sequence']['#text']),
        'inlenRange': checker.checkLengthRange(
            int(pro['sequence']['@length']), min=50, max=2000),
        'subcellularLocations': (
            getSubcelllist(pro['comment']) if 'comment' in pro else []),
        'seq': pro['sequence']['#text'],
    }
"""
@author: [email protected]
@time: 2020/4/17 17:50
@desc: FASTA helpers (FastaDealer).
"""
import os

import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from ProteinDealer import Protein
from common import check_path, getPairs, readIDlist

p = Protein()


class FastaDealer:
    """Helpers that read, filter and write FASTA files."""

    def extractFasta(self, fin_fasta, fin_idlist, fout_fasta,
                     in_multi=True, out_multi=True):
        """Copy the entries named in an ID list into a new FASTA file.

        :param fin_fasta: source FASTA file, loaded via ``self.getDict``
        :param fin_idlist: file with the wanted IDs, read via ``readIDlist``
        :param fout_fasta: destination FASTA file, written via
            ``self.dict2fasta``
        :param in_multi: forwarded to ``self.getDict`` as ``multi``
        :param out_multi: forwarded to ``self.dict2fasta`` as ``multi``
        :raises KeyError: presumably when a listed ID is absent from the
            source file (plain lookup on the loaded mapping)
        """
        source = self.getDict(fin_fasta, multi=in_multi)
        # Repeated IDs collapse to a single entry; first-seen order is kept.
        selected = {
            entry_id: source[entry_id] for entry_id in readIDlist(fin_idlist)
        }
        self.dict2fasta(selected, fout_fasta, multi=out_multi)
# encoding: utf-8 """ @author: [email protected] @time: 2020/4/17 17:52 @desc: """ import os import pandas as pd from ProteinDealer import Protein import numpy as np from common import getPairs, check_path myprotein = Protein() class ComposeData: def save(self, dirout, flist, ratios, limit, labels=None, sep='\t', filename='all.txt', groupcount=1, repeate=True): """ same length of flist,ratios,labels if not None