Example #1
0
def getSingleInfo(fin, fout, fin_type='single', col=[0, 1]):
    """Query every protein accession found in ``fin`` and write one row each.

    :param fin: input file. With ``fin_type='pair'`` it is read by
        ``pd.read_table(fin, header=None)``; with ``fin_type='single'`` it is
        passed to ``readIDlist``.
    :param fout: output path, opened in write mode (existing content is
        replaced).
    :param fin_type: ``'pair'`` or ``'single'``.
    :param col: column labels selected from the pair table; every value in
        those columns is treated as an accession. The default list is only
        read, never mutated.
    :raises ValueError: if ``fin_type`` is neither ``'pair'`` nor ``'single'``.

    Each output line holds the values of ``ensomblePortein(pro)`` in dict
    order, every value followed by a tab, then a newline. Records for which
    ``Protein.checkProtein(seq, 50, 2000, uncomm=True)`` is falsy are skipped.
    """
    if fin_type == 'pair':
        df = pd.read_table(fin, header=None)[col]
        # Flatten the selected columns and de-duplicate the accessions.
        proteins = set(df.to_numpy().ravel())
    elif fin_type == 'single':
        proteins = readIDlist(fin)
    else:
        # Fail before the database handle is created and before ``fout`` is
        # truncated, instead of hitting an unbound ``proteins`` further down.
        raise ValueError(
            "fin_type must be 'pair' or 'single', got %r" % (fin_type,))

    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True
    }
    prod = Protein()
    with open(fout, 'w') as fo:
        for AC in proteins:
            # NOTE(review): queryProtein is defined elsewhere; this assumes it
            # always returns a mapping containing 'sequence' -- confirm.
            pro = queryProtein(AC, do, projection=projection)
            pro['accession'] = AC
            if not prod.checkProtein(
                    pro['sequence']['#text'], 50, 2000, uncomm=True):
                continue
            proinfo = ensomblePortein(pro)
            # Same format as before: each value followed by a tab.
            fo.write(''.join(str(v) + '\t' for v in proinfo.values()) + '\n')
            # Flushed per record, presumably so partial output is on disk if
            # a long run is interrupted.
            fo.flush()
Example #2
0
def loaddata(ftmp_fa, fnontmp_fa):
    '''
    Lazily produce one padded feature per (tmp, nontmp) sequence pair.

    :param ftmp_fa:  tmp fasta file, e.g. 'file/8humanPredict/3tmp.fasta'
    :param fnontmp_fa: nontmp fasta file,
                       e.g. 'file/8humanPredict/3nontmp.fasta'
    :return: generator of (idpair, feature), where idpair is
             '<tmp id>-<nontmp id>' and feature is the result of
             BaseFeature.padding_PSSM(a, b, vstack=True, shape=(2000, 25))

    Records for which Protein.checkProtein(seq, 50, 2000, uncomm=True) is
    falsy are skipped. The nontmp file is parsed again for every accepted
    tmp record.
    '''
    fasta_tool = FastaDealer()
    feature_tool = BaseFeature()
    checker = Protein()

    def usable(record):
        # Same acceptance test for both files.
        return checker.checkProtein(record.seq, 50, 2000, uncomm=True)

    # To resume after an interrupted run, skip records here (and in the inner
    # loop) until the ids where prediction stopped are reached again.
    for tmp_rec in SeqIO.parse(ftmp_fa, 'fasta'):
        if usable(tmp_rec):
            tmp_feat = fasta_tool.phsi_blos(tmp_rec.seq)
            for non_rec in SeqIO.parse(fnontmp_fa, 'fasta'):
                if usable(non_rec):
                    non_feat = fasta_tool.phsi_blos(non_rec.seq)
                    pair_feat = feature_tool.padding_PSSM(
                        tmp_feat, non_feat, vstack=True, shape=(2000, 25))
                    yield '%s-%s' % (tmp_rec.id, non_rec.id), pair_feat
Example #3
0
 def getNpy(self, fin_fasta, out_dir, multi=True, checkprotein=True):
     """Encode each sequence of a fasta file and save it as ``<ID>.npy``.

     :param fin_fasta: fasta file handed to ``self.getYield``.
     :param out_dir: directory receiving the ``.npy`` files; passed to
         ``check_path`` first (presumably to make sure it exists -- confirm).
     :param multi: forwarded to ``self.getYield`` as ``multi``.
     :param checkprotein: when true, sequences failing
         ``Protein.checkProtein(seq, 50, 2000, uncomm=True)`` are skipped.

     Sequences whose ``self.seq2num`` result has length 0 are not saved.
     """
     check_path(out_dir)
     checker = Protein()
     for seq_id, sequence in self.getYield(fin_fasta, multi=multi):
         # Skip only when checking is requested and the sequence fails it.
         if checkprotein and not checker.checkProtein(
                 sequence, 50, 2000, uncomm=True):
             continue
         target = os.path.join(out_dir, "%s.npy" % seq_id)
         encoded = self.seq2num(sequence)
         if len(encoded) == 0:
             continue
         np.save(target, encoded)
Example #4
0
def ensomblePortein(pro):
    """Assemble a flat summary dict from a queried protein record.

    :param pro: mapping with keys 'accession', '_id' and 'sequence' (itself
        holding '@length' and '#text'); 'comment' is optional.
    :return: dict with keys, in this order: 'accession', 'name', 'length',
        'noX', 'inlenRange', 'subcellularLocations', 'seq'.

    'noX' is the result of ``Protein.checkUncomm`` on the sequence text and
    'inlenRange' the result of ``Protein.checkLengthRange`` for the integer
    length with min=50, max=2000. 'subcellularLocations' is an empty list when
    the record has no 'comment', otherwise ``getSubcelllist(pro['comment'])``.
    """
    checker = Protein()
    summary = {'accession': pro['accession'], 'name': pro['_id']}

    seq_entry = pro['sequence']
    length = seq_entry['@length']
    residues = seq_entry['#text']

    summary['length'] = length
    summary['noX'] = checker.checkUncomm(residues)
    summary['inlenRange'] = checker.checkLengthRange(
        int(length), min=50, max=2000)

    if 'comment' in pro.keys():
        locations = getSubcelllist(pro['comment'])
    else:
        locations = []
    summary['subcellularLocations'] = locations

    summary['seq'] = residues
    return summary
Example #5
0
"""
@author: [email protected]
@time: 2020/4/17 17:50
@desc:
"""
import os

from Bio import SeqIO
import numpy as np
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from ProteinDealer import Protein
from common import check_path, getPairs, readIDlist

# Module-level Protein instance, created once when this module is imported.
p = Protein()


class FastaDealer:
    """Helpers for working with fasta files."""

    def extractFasta(self,
                     fin_fasta,
                     fin_idlist,
                     fout_fasta,
                     in_multi=True,
                     out_multi=True):
        """Write the entries of ``fin_fasta`` named in ``fin_idlist``.

        :param fin_fasta: source fasta, loaded with
            ``self.getDict(fin_fasta, multi=in_multi)``.
        :param fin_idlist: file of ids, read with ``readIDlist``.
        :param fout_fasta: destination, written with
            ``self.dict2fasta(..., multi=out_multi)``.
        :param in_multi: forwarded to ``getDict`` as ``multi``.
        :param out_multi: forwarded to ``dict2fasta`` as ``multi``.
        :raises KeyError: if a listed id is missing from the source mapping.
        """
        source_records = self.getDict(fin_fasta, multi=in_multi)
        wanted_ids = readIDlist(fin_idlist)
        # Keep only the requested ids, in the order they are listed.
        selected = {key: source_records[key] for key in wanted_ids}
        self.dict2fasta(selected, fout_fasta, multi=out_multi)
Example #6
0
# encoding: utf-8
"""
@author: [email protected]
@time: 2020/4/17 17:52
@desc:
"""
import os

import pandas as pd

from ProteinDealer import Protein
import numpy as np

from common import getPairs, check_path

# Module-level Protein instance, created once when this module is imported.
myprotein = Protein()


class ComposeData:
    def save(self,
             dirout,
             flist,
             ratios,
             limit,
             labels=None,
             sep='\t',
             filename='all.txt',
             groupcount=1,
             repeate=True):
        """
        same length of flist,ratios,labels if not None