Ejemplo n.º 1
0
    def Decriptor_generator(self, ps):
        """Compute a fixed set of PyPro descriptor groups and merge them.

        A fresh ``PyPro`` object is loaded with ``ps`` through
        ``ReadProteinSequence`` and ten descriptor getters are called on it.
        Their results are folded into one dictionary with ``dict.update``,
        so if two groups share a key the later group's value wins.

        The dipeptide (``GetDPComp``) and tripeptide (``GetTPComp``)
        compositions are deliberately not part of the set.

        :param ps: protein sequence handed to ``ReadProteinSequence``
            (presumably a string of amino-acid letters; confirm with caller).
        :return: dict combining every computed descriptor group.
        """
        sequence = PyPro()
        sequence.ReadProteinSequence(ps)

        # Every group is computed before any merging starts; the tuple
        # order is both the call order and the merge order.
        descriptor_groups = (
            sequence.GetAAComp(),
            sequence.GetTriad(),
            sequence.GetPAAC(lamda=5, weight=0.5),  # takes time
            sequence.GetAPAAC(lamda=5, weight=0.5),  # takes time
            sequence.GetCTD(),
            sequence.GetGearyAuto(),
            sequence.GetMoranAuto(),
            sequence.GetMoreauBrotoAuto(),
            sequence.GetQSO(),
            sequence.GetSOCN(),
        )

        merged = {}
        for group in descriptor_groups:
            merged.update(group)
        return merged
Ejemplo n.º 2
0
def Protein_gen(data, Proteingroup):
    """Build a descriptor table for a collection of protein sequences.

    For every sequence in ``data`` the descriptor groups selected by
    ``Proteingroup`` are computed with ``pydpi.pypro.PyPro`` and laid out
    as one row of values. Column names are taken from the first sequence
    only; later rows are assumed to yield their values in the same order.
    Columns are then filtered through
    ``Descriptors_Selection.VarinceThreshold`` and the surviving names are
    stacked on top of the surviving data rows.

    Recognised group codes (strings) and the PyPro call behind each; the
    feature counts are the ones noted by the original author:

        '0'  GetALL()              all descriptors (2049)
        '1'  GetAAComp()           amino acid composition (20)
        '2'  GetDPComp()           dipeptide composition (400)
        '3'  GetTPComp()           tripeptide composition (8000)
        '4'  GetMoreauBrotoAuto()  Moreau-Broto autocorrelation (240)
        '5'  GetMoranAuto()        Moran autocorrelation (240)
        '6'  GetGearyAuto()        Geary autocorrelation (240)
        '7'  GetCTD()              composition, transition, distribution
                                   (21+21+105)
        '8'  GetTriad()            conjoint triad features (343)
        '9'  GetSOCN(30)           sequence order coupling number (60)
        '10' GetQSO()              quasi-sequence order descriptors (100)
        '11' GetPAAC(30)           pseudo amino acid composition (50)

    :param data: sequence of protein sequences, iterated in order.
    :param Proteingroup: iterable of group codes from the table above.
    :return: 2-D array whose first row holds the retained descriptor names
        and whose remaining rows hold one protein each.
        NOTE(review): names and float rows are combined with ``np.append``,
        so the result is presumably string-typed; confirm with callers.
    :raises ValueError: if ``Proteingroup`` contains an unknown code.
        Previously such a code either failed with an unbound ``res`` or
        silently appended the previous group's descriptors a second time.
    """
    import numpy as np
    from pydpi.pypro import PyPro

    # Group code -> (PyPro method name, positional arguments). Methods are
    # looked up by name at call time so only requested groups are touched.
    group_calls = {
        '0': ('GetALL', ()),
        '1': ('GetAAComp', ()),
        '2': ('GetDPComp', ()),
        '3': ('GetTPComp', ()),
        '4': ('GetMoreauBrotoAuto', ()),
        '5': ('GetMoranAuto', ()),
        '6': ('GetGearyAuto', ()),
        '7': ('GetCTD', ()),
        '8': ('GetTriad', ()),
        '9': ('GetSOCN', (30,)),
        '10': ('GetQSO', ()),
        '11': ('GetPAAC', (30,)),
    }

    protein = PyPro()

    HP_list, D_list = [], []
    for ii, sequence in enumerate(data):
        protein.ReadProteinSequence(sequence)
        keys, values = [], []
        for jj in Proteingroup:
            if jj not in group_calls:
                raise ValueError(
                    "Unknown protein descriptor group %r; expected one of %s"
                    % (jj, sorted(group_calls, key=int)))
            method_name, args = group_calls[jj]
            res = getattr(protein, method_name)(*args)

            keys.extend(res.keys())
            values.extend(res.values())
        if ii == 0:
            # The header row comes from the first protein only.
            HP_list = keys
        D_list.append(values)

    D_Pro = np.zeros((len(D_list), len(HP_list)), dtype=float)
    for k, row in enumerate(D_list):
        D_Pro[k, :] = row

    # Variance threshold (std > 0.01 per the original note); ind_var is
    # used to index the columns to keep.
    import Descriptors_Selection as DesSe
    ind_var = DesSe.VarinceThreshold(D_Pro)
    D_Pro = D_Pro[:, ind_var]
    HP_list = np.array(HP_list)[ind_var]

    H_Pro = np.reshape(HP_list, (1, len(HP_list)))
    Array_Pro = np.append(H_Pro, D_Pro, axis=0)

    return Array_Pro
    def descriptor_generator(self, protein_sequence):
        """Compute six PyPro descriptor groups for one protein sequence.

        The groups are returned as separate items of a list instead of
        being merged with ``dict.update()``, so that they stay in the order
        in which they were computed.

        :param protein_sequence: sequence passed to
            ``PyPro.ReadProteinSequence``.
        :return: list with, in this order, the results of ``GetAAComp``,
            ``GetDPComp``, ``GetPAAC(lamda=30)``, ``GetCTD``, ``GetQSO``
            and ``GetTriad``.
        """
        analyzer = PyPro()
        analyzer.ReadProteinSequence(protein_sequence)

        # List items are evaluated left to right, so the getters run in
        # exactly the order in which their results appear.
        return [
            analyzer.GetAAComp(),
            analyzer.GetDPComp(),
            analyzer.GetPAAC(lamda=30),
            analyzer.GetCTD(),
            analyzer.GetQSO(),
            analyzer.GetTriad(),
        ]
def Decriptor_generator(infile, lamda, weight, maxlag, destype, out_file):
    """Compute protein descriptors for the sequences in a file and save them.

    Every line of ``infile`` that contains ``">"`` is treated as a header
    and skipped; blank lines are skipped too. Each remaining line, with
    surrounding whitespace removed, is handled as one complete sequence
    (a sequence wrapped over several lines is therefore read as several
    sequences). One table row is produced per sequence and the table is
    written to ``out_file`` as tab-separated text without the index.

    :param infile: path of the input sequence file.
    :param lamda: converted with ``int()``; used by ``GetPAAC``/``GetAPAAC``.
    :param weight: converted with ``float()``; used by ``GetPAAC``,
        ``GetAPAAC`` and ``GetQSO``.
    :param maxlag: converted with ``int()``; used by ``GetSOCN``/``GetQSO``.
        The three numeric arguments are only converted when the selected
        descriptor actually uses them.
    :param destype: name of one PyPro getter (``"GetAAComp"``,
        ``"GetDPComp"``, ``"GetTPComp"``, ``"GetMoreauBrotoAuto"``,
        ``"GetMoranAuto"``, ``"GetGearyAuto"``, ``"GetCTD"``, ``"GetPAAC"``,
        ``"GetAPAAC"``, ``"GetSOCN"``, ``"GetQSO"``, ``"GetTriad"``),
        ``"All"`` to merge all of them, or ``"BinaryDescriptor"`` to
        delegate the whole sequence list to ``BinaryDescriptor``.
    :param out_file: path of the tab-separated output file.
    :raises ValueError: if ``destype`` is none of the values above
        (previously this surfaced as an unbound ``df`` variable).
    """
    # Ordered pairs: "All" merges the groups in exactly this order.
    getters = (
        ("GetAAComp", lambda p: p.GetAAComp()),
        ("GetDPComp", lambda p: p.GetDPComp()),
        ("GetTPComp", lambda p: p.GetTPComp()),
        ("GetMoreauBrotoAuto", lambda p: p.GetMoreauBrotoAuto()),
        ("GetMoranAuto", lambda p: p.GetMoranAuto()),
        ("GetGearyAuto", lambda p: p.GetGearyAuto()),
        ("GetCTD", lambda p: p.GetCTD()),
        ("GetPAAC",
         lambda p: p.GetPAAC(lamda=int(lamda), weight=float(weight))),
        ("GetAPAAC",
         lambda p: p.GetAPAAC(lamda=int(lamda), weight=float(weight))),
        ("GetSOCN", lambda p: p.GetSOCN(maxlag=int(maxlag))),
        ("GetQSO",
         lambda p: p.GetQSO(maxlag=int(maxlag), weight=float(weight))),
        ("GetTriad", lambda p: p.GetTriad()),
    )
    getter_by_name = dict(getters)

    # The context manager closes the handle even if reading fails.
    with open(infile) as handle:
        list_pep_name = [
            line.strip() for line in handle
            if ">" not in line and line.strip()
        ]

    if (destype not in getter_by_name
            and destype not in ("All", "BinaryDescriptor")):
        raise ValueError(
            "Unknown destype %r; expected 'All', 'BinaryDescriptor' or one "
            "of %s" % (destype, [name for name, _ in getters]))

    if destype == 'BinaryDescriptor':
        # The result depends only on the full list, so compute it once
        # instead of once per sequence. Without any sequence the table
        # stays empty, as before.
        if list_pep_name:
            out_df = BinaryDescriptor(list_pep_name)
        else:
            out_df = pd.DataFrame()
    else:
        frames = []
        for seq in list_pep_name:
            protein = PyPro()
            protein.ReadProteinSequence(seq)

            if destype == "All":
                # Compute every group first, then merge in order.
                groups = [compute(protein) for _, compute in getters]
                DS = {}
                for D in groups:
                    # NOTE(review): this dumps every descriptor group to
                    # stdout; kept to preserve output, likely leftover debug.
                    print(D)
                    DS.update(D)
            else:
                DS = getter_by_name[destype](protein)

            frames.append(pd.DataFrame(DS, index=[0]))

        # A single concat avoids re-copying the growing table on every
        # iteration; pd.concat rejects an empty list, hence the guard.
        out_df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    out_df.to_csv(out_file, index=False, sep='\t')