Ejemplo n.º 1
0
def Protein_gen(data, Proteingroup):
    """Build a descriptor table for a collection of protein sequences.

    For every sequence in ``data`` the descriptor groups named in
    ``Proteingroup`` are computed with ``pydpi.pypro.PyPro`` and concatenated
    in the order given. Columns are then filtered with
    ``Descriptors_Selection.VarinceThreshold``.

    Args:
        data: Iterable of protein sequences, each passed to
            ``PyPro.ReadProteinSequence``.
        Proteingroup: Iterable of string keys ``'0'`` .. ``'11'`` selecting
            the descriptor groups (see the table in the body).

    Returns:
        A 2-D NumPy array whose first row holds the retained descriptor
        names and whose remaining rows hold one sequence each. The header is
        textual, so expect the numeric rows to share its string dtype.

    Raises:
        ValueError: If ``Proteingroup`` contains a key that is not in the
            table. Previously this surfaced as a ``NameError`` or silently
            duplicated the preceding group's descriptors.
    """
    # Group key -> (PyPro method name, positional arguments). The trailing
    # figures are the descriptor counts noted by the original author.
    group_methods = {
        '0': ('GetALL', ()),               # all descriptors, 2049
        '1': ('GetAAComp', ()),            # amino acid composition, 20
        '2': ('GetDPComp', ()),            # dipeptide composition, 400
        '3': ('GetTPComp', ()),            # tripeptide composition, 8000
        '4': ('GetMoreauBrotoAuto', ()),   # Moreau-Broto autocorrelation, 240
        '5': ('GetMoranAuto', ()),         # Moran autocorrelation, 240
        '6': ('GetGearyAuto', ()),         # Geary autocorrelation, 240
        '7': ('GetCTD', ()),               # composition/transition/distribution, 21+21+105
        '8': ('GetTriad', ()),             # conjoint triad features, 343
        '9': ('GetSOCN', (30,)),           # sequence order coupling number, 60
        '10': ('GetQSO', ()),              # quasi-sequence order descriptors, 100
        '11': ('GetPAAC', (30,)),          # pseudo amino acid composition, 50
    }

    # Materialise once so the selection can be validated and then reused for
    # every sequence, and fail before any descriptor work is done.
    groups = list(Proteingroup)
    unknown = [group for group in groups if group not in group_methods]
    if unknown:
        raise ValueError(
            "Unknown protein descriptor group(s): %r; expected one of %s"
            % (unknown, ", ".join(group_methods))
        )

    import numpy as np
    from pydpi.pypro import PyPro
    import Descriptors_Selection as DesSe

    protein = PyPro()

    HP_list, D_list = [], []
    for index, sequence in enumerate(data):
        protein.ReadProteinSequence(sequence)
        keys, values = [], []
        for group in groups:
            method_name, args = group_methods[group]
            res = getattr(protein, method_name)(*args)
            keys.extend(res.keys())
            values.extend(res.values())
        # The header row is taken from the first sequence only.
        if index == 0:
            HP_list = keys
        D_list.append(values)

    D_Pro = np.zeros((len(D_list), len(HP_list)), dtype=float)
    for k, row in enumerate(D_list):
        D_Pro[k, :] = row

    # Variance threshold (original note: std > 0.01); the selector returns
    # the column indices to keep.
    ind_var = DesSe.VarinceThreshold(D_Pro)
    D_Pro = D_Pro[:, ind_var]
    HP_list = np.array(HP_list)[ind_var]

    # Stack the header row on top of the data rows.
    H_Pro = np.reshape(HP_list, (1, len(HP_list)))
    Array_Pro = np.append(H_Pro, D_Pro, axis=0)

    return Array_Pro
Ejemplo n.º 2
0
    def Decriptor_generator(self, ps):
        """Return one dictionary merging several PyPro descriptor sets for ``ps``.

        The sequence ``ps`` is loaded into a fresh ``PyPro`` object, ten
        descriptor sets are computed, and their dictionaries are merged with
        ``dict.update``. If two sets share a key, the one merged later wins.
        """
        analyzer = PyPro()
        analyzer.ReadProteinSequence(ps)

        # PAAC is computed before everything else but merged last, so keep it
        # aside to preserve both the call order and the merge order.
        paac = analyzer.GetPAAC(lamda=5, weight=0.5)

        descriptor_sets = [
            analyzer.GetAPAAC(lamda=5, weight=0.5),
            analyzer.GetCTD(),
            analyzer.GetDPComp(),
            analyzer.GetGearyAuto(),
            analyzer.GetMoranAuto(),
            analyzer.GetMoreauBrotoAuto(),
            analyzer.GetQSO(),
            analyzer.GetSOCN(),
            analyzer.GetTPComp(),
            paac,
        ]

        merged = {}
        for descriptor_set in descriptor_sets:
            merged.update(descriptor_set)

        return merged
def Decriptor_generator(infile, lamda, weight, maxlag, destype, out_file):
    """Compute descriptors for each sequence in ``infile`` and write a TSV.

    Every line of ``infile`` containing ``">"`` is treated as a header and
    skipped. Each remaining non-blank line is treated as one sequence, so
    records wrapped over several lines are NOT joined.

    Args:
        infile: Path of the input sequence file.
        lamda: Passed as ``int(lamda)`` to ``GetPAAC`` / ``GetAPAAC``.
        weight: Passed as ``float(weight)`` to ``GetPAAC`` / ``GetAPAAC`` /
            ``GetQSO``.
        maxlag: Passed as ``int(maxlag)`` to ``GetSOCN`` / ``GetQSO``.
        destype: Name of a single PyPro descriptor method (for example
            ``"GetAAComp"``), ``"All"`` to merge all twelve, or
            ``"BinaryDescriptor"``.
        out_file: Path of the tab-separated output file (written without
            the index column).

    Raises:
        ValueError: If ``destype`` is not recognised. Previously this
            surfaced as a ``NameError`` on the first sequence.
    """
    # The numeric parameters are converted inside the lambdas so that they
    # are only touched by the descriptor types that use them. The insertion
    # order below is also the merge order used for "All".
    calculators = {
        "GetAAComp": lambda p: p.GetAAComp(),
        "GetDPComp": lambda p: p.GetDPComp(),
        "GetTPComp": lambda p: p.GetTPComp(),
        "GetMoreauBrotoAuto": lambda p: p.GetMoreauBrotoAuto(),
        "GetMoranAuto": lambda p: p.GetMoranAuto(),
        "GetGearyAuto": lambda p: p.GetGearyAuto(),
        "GetCTD": lambda p: p.GetCTD(),
        "GetPAAC": lambda p: p.GetPAAC(lamda=int(lamda),
                                       weight=float(weight)),
        "GetAPAAC": lambda p: p.GetAPAAC(lamda=int(lamda),
                                         weight=float(weight)),
        "GetSOCN": lambda p: p.GetSOCN(maxlag=int(maxlag)),
        "GetQSO": lambda p: p.GetQSO(maxlag=int(maxlag),
                                     weight=float(weight)),
        "GetTriad": lambda p: p.GetTriad(),
    }

    # Fail before touching any file if the request cannot be served.
    if destype not in calculators and destype not in ("All",
                                                      "BinaryDescriptor"):
        raise ValueError(
            "Unknown descriptor type %r; expected one of: %s, All, "
            "BinaryDescriptor" % (destype, ", ".join(calculators))
        )

    # Read the sequences; the context manager guarantees the handle is
    # closed. Stripping all surrounding whitespace also drops a trailing
    # "\r", and blank lines are not treated as sequences.
    list_pep_name = []
    with open(infile) as handle:
        for line in handle:
            if ">" in line:
                continue
            sequence = line.strip()
            if sequence:
                list_pep_name.append(sequence)

    if destype == 'BinaryDescriptor':
        # BinaryDescriptor is defined elsewhere and receives the whole list,
        # so one call is enough. With no sequences an empty table is
        # written, as before.
        if list_pep_name:
            out_df = BinaryDescriptor(list_pep_name)
        else:
            out_df = pd.DataFrame()
    else:
        if destype == "All":
            selected = list(calculators.values())
        else:
            selected = [calculators[destype]]

        # Collect one dict per sequence and build the frame once, instead of
        # growing it with a concat per row.
        rows = []
        for seq in list_pep_name:
            protein = PyPro()
            protein.ReadProteinSequence(seq)
            row = {}
            for compute in selected:
                row.update(compute(protein))
            rows.append(row)
        out_df = pd.DataFrame(rows)

    out_df.to_csv(out_file, index=False, sep='\t')