def Decriptor_generator(self, ps):
    """Build one merged descriptor mapping for a single protein sequence.

    A fresh ``PyPro`` object is loaded with ``ps`` through
    ``ReadProteinSequence`` and queried for ten descriptor groups. The
    groups are merged with ``dict.update`` in the order they are computed,
    so on a key clash the later group wins. ``GetDPComp`` and ``GetTPComp``
    are not part of the set.

    Args:
        ps: Protein sequence handed to ``PyPro.ReadProteinSequence``.

    Returns:
        dict: Union of all descriptor groups listed below.
    """
    pypro = PyPro()
    pypro.ReadProteinSequence(ps)

    # Tuple elements are evaluated left to right, which fixes both the
    # order of the PyPro calls and the order of the merge.
    descriptor_groups = (
        pypro.GetAAComp(),
        pypro.GetTriad(),
        pypro.GetPAAC(lamda=5, weight=0.5),  # takes time
        pypro.GetAPAAC(lamda=5, weight=0.5),  # takes time
        pypro.GetCTD(),
        pypro.GetGearyAuto(),
        pypro.GetMoranAuto(),
        pypro.GetMoreauBrotoAuto(),
        pypro.GetQSO(),
        pypro.GetSOCN(),
    )

    merged = {}
    for group in descriptor_groups:
        merged.update(group)
    return merged
# Maps a descriptor-group id to the PyPro method name and the positional
# arguments it is called with. Trailing numbers are the descriptor counts
# noted by the original author.
_PROTEIN_DESCRIPTOR_GETTERS = {
    '0': ('GetALL', ()),              # all descriptors, 2049
    '1': ('GetAAComp', ()),           # amino acid composition, 20
    '2': ('GetDPComp', ()),           # dipeptide composition, 400
    '3': ('GetTPComp', ()),           # tripeptide composition, 8000
    '4': ('GetMoreauBrotoAuto', ()),  # Moreau-Broto autocorrelation, 240
    '5': ('GetMoranAuto', ()),        # Moran autocorrelation, 240
    '6': ('GetGearyAuto', ()),        # Geary autocorrelation, 240
    '7': ('GetCTD', ()),              # composition, transition, distribution, 21+21+105
    '8': ('GetTriad', ()),            # conjoint triad features, 343
    '9': ('GetSOCN', (30,)),          # sequence order coupling number, 60
    '10': ('GetQSO', ()),             # quasi-sequence order descriptors, 100
    '11': ('GetPAAC', (30,)),         # pseudo amino acid composition, 50
}


def Protein_gen(data, Proteingroup):
    """Compute the selected PyPro descriptor groups for every sequence.

    For each sequence in ``data`` the descriptor groups named in
    ``Proteingroup`` are computed in the given order and concatenated into
    one row. The descriptor names of the first sequence become the header.
    Columns are then filtered with ``Descriptors_Selection.VarinceThreshold``
    (variance threshold, std > 0.01 according to the original comment).

    Args:
        data: Sequence of protein sequences; each item is passed to
            ``PyPro.ReadProteinSequence``.
        Proteingroup: Re-iterable collection of group ids, each one of the
            strings ``'0'`` .. ``'11'`` (see ``_PROTEIN_DESCRIPTOR_GETTERS``).

    Returns:
        numpy.ndarray: Result of ``np.append`` stacking a single row of
        retained descriptor names on top of the retained descriptor values
        (one row per sequence).

    Raises:
        ValueError: If ``Proteingroup`` contains an unknown id. Previously
            such an id either raised ``NameError`` or silently re-used the
            previous group's descriptors.
    """
    import numpy as np
    from pydpi.pypro import PyPro

    protein = PyPro()
    header = []
    rows = []
    for index, sequence in enumerate(data):
        protein.ReadProteinSequence(sequence)
        keys, values = [], []
        for group_id in Proteingroup:
            if group_id not in _PROTEIN_DESCRIPTOR_GETTERS:
                raise ValueError(
                    "Unknown protein descriptor group id: %r" % (group_id,)
                )
            method_name, args = _PROTEIN_DESCRIPTOR_GETTERS[group_id]
            res = getattr(protein, method_name)(*args)
            keys.extend(res.keys())
            values.extend(res.values())
        # Only the first sequence defines the column names.
        if index == 0:
            header = keys
        rows.append(values)

    # Pre-sized matrix keeps the (n_sequences, n_descriptors) shape even
    # when ``data`` is empty.
    D_Pro = np.zeros((len(rows), len(header)), dtype=float)
    for row_index, row in enumerate(rows):
        D_Pro[row_index, :] = row

    # Variance threshold std > 0.01
    import Descriptors_Selection as DesSe
    ind_var = DesSe.VarinceThreshold(D_Pro)
    D_Pro = D_Pro[:, ind_var]
    kept_names = np.array(header)[ind_var]

    # Stack the name row on top of the value rows.
    H_Pro = np.reshape(kept_names, (1, len(kept_names)))
    Array_Pro = np.append(H_Pro, D_Pro, axis=0)
    return Array_Pro
def descriptor_generator(self, protein_sequence):
    """Return six PyPro descriptor groups for one sequence, kept separate.

    Args:
        protein_sequence: Protein sequence handed to
            ``PyPro.ReadProteinSequence``.

    Returns:
        list: The results of ``GetAAComp``, ``GetDPComp``,
        ``GetPAAC(lamda=30)``, ``GetCTD``, ``GetQSO`` and ``GetTriad``,
        in that order.
    """
    pypro = PyPro()
    pypro.ReadProteinSequence(protein_sequence)

    # A list is returned instead of one merged dict so that the groups
    # stay in the order they were computed.
    return [
        pypro.GetAAComp(),
        pypro.GetDPComp(),
        pypro.GetPAAC(lamda=30),
        pypro.GetCTD(),
        pypro.GetQSO(),
        pypro.GetTriad(),
    ]
def Decriptor_generator(self, ps):
    """Build one merged descriptor mapping for a single protein sequence.

    A fresh ``PyPro`` object is loaded with ``ps`` and queried for ten
    descriptor groups, which are merged with ``dict.update``.

    Args:
        ps: Protein sequence handed to ``PyPro.ReadProteinSequence``.

    Returns:
        dict: Union of the APAAC, CTD, dipeptide, Geary, Moran,
        Moreau-Broto, QSO, SOCN, tripeptide and PAAC groups.
    """
    pypro = PyPro()
    pypro.ReadProteinSequence(ps)

    # PAAC is computed first but merged last, so its entries win on any
    # key clash with the other groups.
    paac = pypro.GetPAAC(lamda=5, weight=0.5)
    descriptor_groups = [
        pypro.GetAPAAC(lamda=5, weight=0.5),
        pypro.GetCTD(),
        pypro.GetDPComp(),
        pypro.GetGearyAuto(),
        pypro.GetMoranAuto(),
        pypro.GetMoreauBrotoAuto(),
        pypro.GetQSO(),
        pypro.GetSOCN(),
        pypro.GetTPComp(),
        paac,
    ]

    merged = {}
    for group in descriptor_groups:
        merged.update(group)
    return merged
def Decriptor_generator(infile, lamda, weight, maxlag, destype, out_file):
    """Compute one descriptor type for every sequence in a file and save it.

    Every line of ``infile`` that does not contain ``">"`` is treated as one
    sequence; blank lines are skipped. The chosen descriptor is computed per
    sequence, the rows are stacked, and the table is written to ``out_file``
    as tab-separated text without the index.

    Args:
        infile: Path of the input text file (FASTA-like; header lines
            contain ``">"``).
        lamda: Converted with ``int`` and passed as ``lamda`` to
            ``GetPAAC`` / ``GetAPAAC``. Only read when the descriptor
            needs it.
        weight: Converted with ``float`` and passed as ``weight`` to
            ``GetPAAC`` / ``GetAPAAC`` / ``GetQSO``. Only read when needed.
        maxlag: Converted with ``int`` and passed as ``maxlag`` to
            ``GetSOCN`` / ``GetQSO``. Only read when needed.
        destype: Name of a PyPro getter (``"GetAAComp"``, ``"GetDPComp"``,
            ``"GetTPComp"``, ``"GetMoreauBrotoAuto"``, ``"GetMoranAuto"``,
            ``"GetGearyAuto"``, ``"GetCTD"``, ``"GetPAAC"``, ``"GetAPAAC"``,
            ``"GetSOCN"``, ``"GetQSO"``, ``"GetTriad"``), ``"All"`` for the
            union of all of them, or ``"BinaryDescriptor"``.
        out_file: Destination path for the tab-separated output.

    Returns:
        None. The result is written to ``out_file``.

    Raises:
        ValueError: If ``destype`` is not one of the supported names.
            Previously this surfaced as a ``NameError`` inside the loop.
    """

    def _all_descriptors(protein):
        # Compute every group first, then merge in the same order.
        groups = (
            protein.GetAAComp(),
            protein.GetDPComp(),
            protein.GetTPComp(),
            protein.GetMoreauBrotoAuto(),
            protein.GetMoranAuto(),
            protein.GetGearyAuto(),
            protein.GetCTD(),
            protein.GetPAAC(lamda=int(lamda), weight=float(weight)),
            protein.GetAPAAC(lamda=int(lamda), weight=float(weight)),
            protein.GetSOCN(maxlag=int(maxlag)),
            protein.GetQSO(maxlag=int(maxlag), weight=float(weight)),
            protein.GetTriad(),
        )
        merged = {}
        for group in groups:
            # NOTE(review): prints every descriptor group to stdout; kept
            # because it is existing observable behaviour.
            print(group)
            merged.update(group)
        return merged

    # Each entry takes a loaded PyPro object and returns a descriptor dict.
    # Parameter conversion stays inside the lambdas so that unused
    # parameters are never converted.
    getters = {
        "GetAAComp": lambda p: p.GetAAComp(),
        "GetDPComp": lambda p: p.GetDPComp(),
        "GetTPComp": lambda p: p.GetTPComp(),
        "GetMoreauBrotoAuto": lambda p: p.GetMoreauBrotoAuto(),
        "GetMoranAuto": lambda p: p.GetMoranAuto(),
        "GetGearyAuto": lambda p: p.GetGearyAuto(),
        "GetCTD": lambda p: p.GetCTD(),
        "GetPAAC": lambda p: p.GetPAAC(lamda=int(lamda), weight=float(weight)),
        "GetAPAAC": lambda p: p.GetAPAAC(lamda=int(lamda), weight=float(weight)),
        "GetSOCN": lambda p: p.GetSOCN(maxlag=int(maxlag)),
        "GetQSO": lambda p: p.GetQSO(maxlag=int(maxlag), weight=float(weight)),
        "GetTriad": lambda p: p.GetTriad(),
        "All": _all_descriptors,
    }

    # Fail fast, before any I/O, on an unsupported descriptor type.
    if destype != 'BinaryDescriptor' and destype not in getters:
        raise ValueError("Unknown descriptor type: %r" % (destype,))

    # The context manager closes the input file even if reading fails.
    with open(infile) as handle:
        list_pep_name = [
            line.strip()
            for line in handle
            if ">" not in line and line.strip()
        ]

    if destype == 'BinaryDescriptor':
        # BinaryDescriptor works on the whole list, so call it once rather
        # than once per sequence. With no sequences it is not called at all
        # and an empty table is written.
        if list_pep_name:
            out_df = BinaryDescriptor(list_pep_name)
        else:
            out_df = pd.DataFrame()
    else:
        compute = getters[destype]
        frames = []
        for seq in list_pep_name:
            protein = PyPro()
            protein.ReadProteinSequence(seq)
            frames.append(pd.DataFrame(compute(protein), index=[0]))
        # One concat at the end avoids re-copying the table on every row.
        # pd.concat rejects an empty list, hence the fallback.
        out_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    out_df.to_csv(out_file, index=False, sep='\t')