Beispiel #1
0
def getPairTag(fin, f1pairWithTag_info, sep='\t'):
    '''
    Tag every protein pair listed in ``fin`` and write one annotated row per pair.

    For each pair yielded by ``getPairs`` the pair is echoed to stdout and
    passed to ``tagPair``. Pairs for which ``tagPair`` returns ``None`` are
    skipped. Otherwise ``result[0]`` and ``result[1]`` are converted with
    ``ensomblePortein`` and their values are written tab-separated, followed by
    the tag ``result[2]`` and a newline.

    :param fin: path of the input pair file (two columns per row)
    :param f1pairWithTag_info: path of the output file; it is overwritten
    :param sep: column separator handed to ``getPairs`` when reading ``fin``
    :return: None
    tag (as listed by the original author)
    0, TMP_SP
    1, TMP_TMP
    2, SP_SP
    '''
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(f1pairWithTag_info, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = tagPair(pa, pb, do)
            # None means the pair could not be tagged; nothing is written for it.
            if result is None:
                continue
            proA = ensomblePortein(result[0])
            proB = ensomblePortein(result[1])
            # Build the whole row first so it is emitted with a single write.
            fields = [str(v) for v in proA.values()]
            fields.extend(str(v) for v in proB.values())
            fields.append(str(result[2]))
            fo.write('\t'.join(fields) + '\n')
            # Flush per row so partial results survive an interrupted run.
            fo.flush()
Beispiel #2
0
def getPairInfo_TMP_nonTMP(fin, fout, sep='\t', checkTMP=True, keepOne=False):
    '''
    Write the protein details of every usable pair in ``fin`` to ``fout``.

    Each pair yielded by ``getPairs`` is echoed to stdout and passed to
    ``getTaN``. Pairs for which ``getTaN`` returns ``None`` are skipped. For
    the others, ``result[0]`` and ``result[1]`` are converted with
    ``ensomblePortein`` and all their values are written to one row, each
    value followed by a tab (so every row ends with a trailing tab).

    :param fin: input pair file, one pair per row, e.g.
        Q7BCK4	B6JN06
        E7QG89	B2FN41
    :param fout: output file (overwritten). According to the original author
        each row holds the TMP protein followed by the nonTMP protein, each
        described by
        ['accession', 'name', 'length', 'noX', 'inlenRange', 'subcellularLocations', 'seq'],
        e.g. (sequences shortened here)
        Q7BCK4	ICSA_SHIFL	1102	True	True	['Periplasm', 'Secreted', ...]	MNQIHKFF...	B6JN06	G6PI_HELP2	545	True	True	['Cytoplasm']	MLTQLKTY...
    :param sep: column separator handed to ``getPairs`` when reading ``fin``
    :param checkTMP: forwarded unchanged to ``getTaN``
    :param keepOne: forwarded unchanged to ``getTaN``
    :return: None

    Example:
    fin = 'file/_1pair.txt'
    fout = 'file/_2pair_info.txt'
    getPairInfo_TMP_nonTMP(fin,fout)
    '''
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(fout, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = getTaN(pa, pb, do, checkTMP=checkTMP, keepOne=keepOne)
            # None means the pair was rejected; nothing is written for it.
            if result is None:
                continue
            tmp = ensomblePortein(result[0])
            nontmp = ensomblePortein(result[1])
            # Every value keeps its own trailing tab, exactly like the
            # historical output format, but the row is emitted in one write.
            row = ''.join(str(v) + '\t' for v in tmp.values())
            row += ''.join(str(v) + '\t' for v in nontmp.values())
            fo.write(row + '\n')
            # Flush per row so partial results survive an interrupted run.
            fo.flush()
Beispiel #3
0
    def loadTest(self,fin_pair,dir_in,onehot=False,is_shuffle=False,limit=0):
        """
        Load the pairs listed in ``fin_pair`` as a test set.

        Every record yielded by ``getPairs`` is handed to ``self.loadPpair``,
        which supplies one feature element and one label element per pair.

        :param fin_pair: pair file read through ``getPairs`` (no title row)
        :param dir_in: directory passed to ``self.loadPpair``
        :param onehot: forwarded to ``self.subprocess``
        :param is_shuffle: forwarded to ``self.subprocess``
        :param limit: stop after this many pairs; the default 0 never matches
            the 1-based counter, so all pairs are loaded
        :return: whatever ``self.subprocess`` returns for the collected
            data and label arrays
        """
        features = []
        labels = []
        for seen, proteins in enumerate(getPairs(fin_pair, title=False), start=1):
            feature, target = self.loadPpair(dir_in, proteins)
            features.append(feature)
            labels.append(target)
            if seen == limit:
                break
        data = np.array(features)
        label = np.array(labels)
        return self.subprocess(
            data,
            label,
            test_size=0,
            random_state=123,
            onehot=onehot,
            is_shuffle=is_shuffle,
        )
Beispiel #4
0
    def loadTest(self, fin_pair, dir_in, onehot=False, is_shuffle=False):
        """
        Load the pair features listed in ``fin_pair`` as a test set.

        For every record yielded by ``getPairs`` the file
        ``<dir_in>/<A>_<B>.npy`` is loaded. Pairs whose feature file cannot be
        read are reported on stdout and skipped. A record is labelled 1 when
        it has fewer than three columns or its third column is ``'1'``,
        otherwise 0.

        :param fin_pair: pair file read through ``getPairs`` (no title row)
        :param dir_in: directory containing the ``<A>_<B>.npy`` feature files
        :param onehot: forwarded to ``self.subprocess``
        :param is_shuffle: forwarded to ``self.subprocess``
        :return: whatever ``self.subprocess`` returns for the collected
            data and label arrays
        """
        x_test = []
        y_test = []
        for proteins in getPairs(fin_pair, title=False):
            # dir_in is joined exactly once: joining it a second time turned a
            # relative dir_in into dir_in/dir_in/... and made every load fail.
            eachfile = os.path.join(dir_in,
                                    '%s_%s.npy' % (proteins[0], proteins[1]))
            try:
                elem = np.load(eachfile)
            except (OSError, ValueError, EOFError):
                # Missing or unreadable feature file: best effort, skip the pair.
                print('not find feature of this pair', str(proteins))
                continue
            x_test.append(elem)
            # loading test dataset or positive dataset
            if len(proteins) < 3 or proteins[2] == '1':
                y_test.append(1)
            else:
                y_test.append(0)
        data = np.array(x_test)
        label = np.array(y_test)
        return self.subprocess(data,
                               label,
                               test_size=0,
                               random_state=123,
                               onehot=onehot,
                               is_shuffle=is_shuffle)
Beispiel #5
0
 def loadPair(self, fin_pair, dir_in, limit=0):
     """
     Load pair features and store them on ``self.positive`` / ``self.negative``.

     For every record yielded by ``getPairs`` the file
     ``<dir_in>/<A>_<B>.npy`` is loaded. A record counts as positive when it
     has fewer than three columns or its third column is ``'1'``; every other
     record is negative. Pairs whose feature file cannot be read are reported
     on stdout and skipped.

     :param fin_pair: pair file read through ``getPairs`` (no title row)
     :param dir_in: directory containing the ``<A>_<B>.npy`` feature files
     :param limit: when non-zero and smaller than both class sizes, each
         class is truncated to its first ``limit`` elements
     :return: None; ``self.positive`` becomes a stacked array and
         ``self.negative`` a stacked array or ``[]`` when no negatives exist
     :raises ValueError: from ``np.stack`` when no positive pair was loaded
     """
     positive = []
     negative = []
     for proteins in getPairs(fin_pair, title=False):
         # dir_in is joined exactly once: joining it a second time turned a
         # relative dir_in into dir_in/dir_in/... and made every load fail.
         eachfile = os.path.join(dir_in,
                                 '%s_%s.npy' % (proteins[0], proteins[1]))
         try:
             elem = np.load(eachfile)
         except (OSError, ValueError, EOFError):
             # Missing or unreadable feature file: best effort, skip the pair.
             print('not find feature of this pair', str(proteins))
             continue
         # loading test dataset or positive dataset
         if len(proteins) < 3 or proteins[2] == '1':
             positive.append(elem)
         else:
             negative.append(elem)
     # Truncation only applies when BOTH classes hold more than `limit` items.
     if limit != 0 and limit < min(len(positive), len(negative)):
         positive, negative = positive[:limit], negative[:limit]
     print('positive : ', len(positive))
     print('negative : ', len(negative))
     self.positive = np.stack(positive)
     self.negative = np.stack(negative) if negative else []
Beispiel #6
0
 def getPairseq(self,
                fin_fasta,
                fin_ID_pair,
                fout_seq_pair,
                saveID=False,
                num=0,
                multi=True):
     """
     Write the sequences of every ID pair to numbered chunk files.

     Sequences are looked up in the dictionary returned by
     ``self.getDict(fin_fasta, multi=multi)``. Two files are written per
     chunk, named after ``fout_seq_pair`` with the file name cut at its first
     dot: ``<stem>_<k>.txt`` holds the two sequences of each pair on
     consecutive lines, ``<stem>_ID_<k>.txt`` holds ``A<TAB>B<TAB>C`` where C
     is the third input column or an empty string.

     :param fin_fasta: fasta file handed to ``self.getDict``
     :param fin_ID_pair: tab-separated pair file read through ``getPairs``
     :param fout_seq_pair: output path used as the template for chunk names
     :param saveID: when true, a ``>ID`` header line precedes each sequence
     :param num: pairs per chunk; 0 keeps everything in chunk 0
     :param multi: forwarded to ``self.getDict``
     :return: None
     :raises KeyError: when an ID of a pair is missing from the fasta
         dictionary; the open output files are closed before it propagates
     """
     # Local import keeps the method self-contained regardless of the
     # module-level imports of the enclosing file.
     import os

     mydict = self.getDict(fin_fasta, multi=multi)

     # Cut at the first dot of the FILE NAME only. Splitting the whole path
     # broke inputs such as './out/pairs.txt', whose stem came out empty.
     head, tail = os.path.split(fout_seq_pair)
     stem = os.path.join(head, tail.split('.')[0])

     def open_chunk(index):
         # Open the sequence file and the ID file of chunk `index`.
         seq_file = open('%s_%d.txt' % (stem, index), 'w')
         try:
             id_file = open('%s_ID_%d.txt' % (stem, index), 'w')
         except Exception:
             seq_file.close()
             raise
         return seq_file, id_file

     file_num = 0
     count = 0
     fo, fo_ID = open_chunk(file_num)
     try:
         for record in getPairs(fin_ID_pair, sep='\t', title=False):
             a = record[0]
             b = record[1]
             c = record[2] if len(record) == 3 else ''
             if saveID:
                 fo.write('>%s\n' % a)
             fo.write(mydict[a] + '\n')
             if saveID:
                 fo.write('>%s\n' % b)
             fo_ID.write('%s\t%s\t%s\n' % (a, b, c))
             fo.write(mydict[b] + '\n')
             fo.flush()
             count = count + 1
             # Roll over to the next chunk once `num` pairs were written.
             if num != 0 and count == num:
                 file_num = file_num + 1
                 count = 0
                 fo.close()
                 fo_ID.close()
                 fo, fo_ID = open_chunk(file_num)
     finally:
         # Closing an already closed file is a no-op, so this is safe even
         # when the failure happened while rolling over to a new chunk.
         fo.close()
         fo_ID.close()
     print()
Beispiel #7
0
 def base_compose(self,
                  dirout_feature,
                  fin_pair,
                  dir_feature_db,
                  feature_type='V_PSSM',
                  fout_pair=''):
     """
     Combine per-protein features into one padded feature file per pair.

     For every pair yielded by ``getPairs(fin_pair)`` the files
     ``<dir_feature_db>/<A>.npy`` and ``<dir_feature_db>/<B>.npy`` are loaded,
     padded/combined according to ``feature_type`` and saved as
     ``<dirout_feature>/<A>_<B>.npy``. Pairs with a missing feature file, or
     whose features fail the length/value check, are reported and skipped.

     :param dirout_feature: output directory, passed to ``check_path`` first
     :param fin_pair: pair file read through ``getPairs``
     :param dir_feature_db: directory holding the per-protein ``.npy`` files
     :param feature_type: compared against the ``Feature_type`` members
         V_PSSM, H_PSSM, SEQ_1D and SEQ_2D.
         NOTE(review): the default is the plain string 'V_PSSM'; it only
         matches if ``Feature_type.V_PSSM`` compares equal to it - confirm.
     :param fout_pair: optional file that receives ``A<TAB>B`` for every pair
         that passed the checks; '' disables it
     :return: None. Processing stops at the first pair that reaches the
         padding step with an unrecognised ``feature_type``.
     """
     check_path(dirout_feature)
     fo = open(fout_pair, 'w') if fout_pair != '' else None
     try:
         for row, pairs in enumerate(getPairs(fin_pair), start=1):
             a = pairs[0]
             b = pairs[1]
             fa = os.path.join(dir_feature_db, a + '.npy')
             fb = os.path.join(dir_feature_db, b + '.npy')
             print('loading %d th feature pair' % row)
             has_a = os.access(fa, os.F_OK)
             has_b = os.access(fb, os.F_OK)
             if not (has_a and has_b):
                 print(
                     '===============features of pairs not found %s %s================'
                     % (a, b), has_a, has_b)
                 continue
             pa = np.load(fa, allow_pickle=True)
             pb = np.load(fb, allow_pickle=True)
             # Keep only features with 50..2000 entries and no value above 20.
             # Presumably values above 20 encode an unknown residue (the
             # message mentions "x") - TODO confirm.
             if (len(pa) < 50 or len(pa) > 2000
                     or max(pa) > 20) or (len(pb) < 50 or len(pb) > 2000
                                          or max(pb) > 20):
                 print('wrong length or x')
                 continue
             if fo is not None:
                 fo.write('%s\t%s\n' % (a, b))
                 fo.flush()
             # padding
             if feature_type == Feature_type.V_PSSM:
                 pc = self.padding_PSSM(pa, pb, vstack=True)
             elif feature_type == Feature_type.H_PSSM:
                 pc = self.padding_PSSM(pa, pb, vstack=False)
             elif feature_type == Feature_type.SEQ_1D:
                 pc = self.padding_seq1D(pa, pb, vstack=False)
             elif feature_type == Feature_type.SEQ_2D:
                 pc = self.padding_seq2D(pa, pb)
             else:
                 print('incoreect feature_type')
                 return
             # save the padded pair feature
             fout = os.path.join(dirout_feature, "%s_%s.npy" % (a, b))
             np.save(fout, pc)
             # Drop the arrays before the next pair is loaded.
             del pc, pa, pb
     finally:
         # Also reached on the early return above and on any exception, so
         # the pair list is never left open.
         if fo is not None:
             fo.close()