コード例 #1
0
def Protein_gen(data, Proteingroup):
    """Compute pydpi protein descriptors and return them as a labelled array.

    For every sequence in ``data`` the descriptor groups listed in
    ``Proteingroup`` are computed in the given order and concatenated into
    one row. Columns are then filtered with
    ``Descriptors_Selection.VarinceThreshold``.

    Args:
        data: indexable collection of protein sequences; items are read as
            ``data[0]`` .. ``data[len(data) - 1]``.
        Proteingroup: re-iterable collection of descriptor group codes, each
            one of the strings ``'0'`` .. ``'11'`` (see the table below).

    Returns:
        2-D NumPy array whose first row holds the retained descriptor names
        and whose remaining rows hold the descriptor values, one row per
        sequence.

    Raises:
        ValueError: if ``Proteingroup`` contains an unknown group code.
            (Previously this surfaced as a NameError, or silently repeated
            the previous group's descriptors.)
    """
    import numpy as np
    from pydpi.pypro import PyPro
    protein = PyPro()

    # Group code -> zero-argument getter. Every getter reads whatever
    # sequence was most recently loaded into ``protein``. Lambdas are used
    # so a method is only looked up when its group is actually requested.
    getters = {
        '0': lambda: protein.GetALL(),              # All descriptors          2049
        '1': lambda: protein.GetAAComp(),           # amino acid composition   20
        '2': lambda: protein.GetDPComp(),           # dipeptide composition    400
        '3': lambda: protein.GetTPComp(),           # Tripeptide composition   8000
        '4': lambda: protein.GetMoreauBrotoAuto(),  # Moreau-Broto autocorrelation  240
        '5': lambda: protein.GetMoranAuto(),        # Moran autocorrelation       240
        '6': lambda: protein.GetGearyAuto(),        # Geary autocorrelation       240
        '7': lambda: protein.GetCTD(),              # composition,transition,distribution  21+21+105
        '8': lambda: protein.GetTriad(),            # conjoint triad features     343
        '9': lambda: protein.GetSOCN(30),           # sequence order coupling number  60
        '10': lambda: protein.GetQSO(),             # quasi-sequence order descriptors   100
        '11': lambda: protein.GetPAAC(30),          # pseudo amino acid composition   50
    }

    HP_list, D_list = [], []
    # Positional indexing (data[ii]) is kept on purpose so that containers
    # with their own indexing rules behave exactly as before.
    for ii in range(len(data)):
        protein.ReadProteinSequence(data[ii])
        keys, values = [], []
        for code in Proteingroup:
            try:
                getter = getters[code]
            except KeyError:
                raise ValueError(
                    "Unknown protein descriptor group: %r" % (code,))
            res = getter()
            keys.extend(res.keys())
            values.extend(res.values())
        if ii == 0:
            # Column names come from the first sequence only; later
            # sequences are assumed to yield the same descriptors in the
            # same order.
            HP_list = keys
        D_list.append(values)

    D_Pro = np.zeros((len(D_list), len(HP_list)), dtype=float)
    for k, row in enumerate(D_list):
        D_Pro[k, :] = row

    #Variance threshold       std > 0.01
    import Descriptors_Selection as DesSe
    ind_var = DesSe.VarinceThreshold(D_Pro)
    D_Pro = D_Pro[:, ind_var]
    HP_list = np.array(HP_list)[ind_var]

    # Stack the surviving names on top of the values as a header row.
    H_Pro = np.reshape(HP_list, (1, len(HP_list)))
    Array_Pro = np.append(H_Pro, D_Pro, axis=0)

    return Array_Pro
    def descriptor_generator(self, protein_sequence):
        """Compute six PyPro descriptor groups for one protein sequence.

        The groups are returned as separate list entries rather than being
        merged into one mapping, so their order is always: amino acid
        composition, dipeptide composition, PAAC (``lamda=30``), CTD, QSO,
        conjoint triad.

        Args:
            protein_sequence: sequence handed to ``PyPro.ReadProteinSequence``.

        Returns:
            list with the six results of the PyPro getters, in the order
            listed above.
        """
        pypro = PyPro()
        pypro.ReadProteinSequence(protein_sequence)

        # A list literal evaluates left to right, so the getters run in the
        # same order in which their results are stored.
        return [
            pypro.GetAAComp(),
            pypro.GetDPComp(),
            pypro.GetPAAC(lamda=30),
            pypro.GetCTD(),
            pypro.GetQSO(),
            pypro.GetTriad(),
        ]
コード例 #3
0
    def Decriptor_generator(self, ps):
        """Compute ten PyPro descriptor groups for ``ps`` and merge them.

        Args:
            ps: protein sequence handed to ``PyPro.ReadProteinSequence``.

        Returns:
            dict built by successively ``update``-ing with the results of
            APAAC, CTD, dipeptide composition, Geary, Moran and Moreau-Broto
            autocorrelation, QSO, SOCN, tripeptide composition and finally
            PAAC. On a key clash the group merged later wins.
        """
        pypro = PyPro()
        pypro.ReadProteinSequence(ps)

        # PAAC is computed first but merged last.
        paac = pypro.GetPAAC(lamda=5, weight=0.5)
        groups = [
            pypro.GetAPAAC(lamda=5, weight=0.5),
            pypro.GetCTD(),
            pypro.GetDPComp(),
            pypro.GetGearyAuto(),
            pypro.GetMoranAuto(),
            pypro.GetMoreauBrotoAuto(),
            pypro.GetQSO(),
            pypro.GetSOCN(),
            pypro.GetTPComp(),
            paac,
        ]

        merged = {}
        for group in groups:
            merged.update(group)

        return merged
コード例 #4
0
# Fit a 2-component PCA on X, project X with it, and scatter-plot the
# projection with one colour per class label found in y.
pca = PCA(n_components=2)
fitted_pca = pca.fit(X)
X_r = fitted_pca.transform(X)

plt.figure()
# (colour, class value in y, legend label) for each plotted class.
class_styles = [('navy', 0, '0'), ('darkorange', 1, '1')]
for color, i, target_name in class_styles:
    plt.scatter(
        X_r[y == i, 0],
        X_r[y == i, 1],
        color=color,
        alpha=.8,
        lw=2,
        label=target_name,
    )
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of hmmscan df')

# Collect physicochemical features homology -- pydpi ##
# For every FASTA record, compute Geary autocorrelation and dipeptide
# composition, and write one CSV row: id, label, sequence length, values.
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Collecting physicochemical stats per protein...')
with open('./physicochem_annot_training.csv', 'w') as f:
    for record in SeqIO.parse(all_fasta, "fasta"):
        protein = PyPro()
        protein.ReadProteinSequence(str(record.seq))
        desc = protein.GetGearyAuto()
        desc2 = protein.GetDPComp()
        # Merge both descriptor groups without mutating ``desc``.
        z = desc.copy()
        z.update(desc2)
        len_p = str(len(record.seq))
        # The label is the text after the first ':' in the description;
        # a description without ':' raises IndexError here.
        label = str(record.description).strip().split(':')[1]
        # Named ``record_id`` so the builtin ``id`` is not shadowed.
        record_id = str(record.id)
        row = [record_id, label, len_p] + [str(value) for value in z.values()]
        f.write(','.join(row) + '\n')

df_desc = pd.read_table('./physicochem_annot_training.csv', header=None, sep=',')
# NOTE: ``z`` is only bound if the FASTA file yielded at least one record.
# Column names come from the last record; every record is assumed to yield
# the same descriptors in the same order.
# list() is needed because dict.keys() is not a list on Python 3.
header = list(z.keys())
df_desc.columns = ['id', 'label', 'seq_length'] + header
print('Pydpi table: {}'.format(df_desc.columns))


# plot
コード例 #5
0
def Decriptor_generator(infile, lamda, weight, maxlag, destype, out_file):
    """Compute descriptors for every sequence in a FASTA-like file.

    Lines containing ``">"`` are treated as headers and skipped; every other
    non-blank line is treated as one complete sequence (surrounding
    whitespace, including a trailing ``\\r``, is removed). The selected
    descriptors are written to ``out_file`` as a tab-separated table with
    one row per sequence and no index column.

    Args:
        infile: path of the input text file.
        lamda: converted with ``int()`` and passed to GetPAAC / GetAPAAC.
        weight: converted with ``float()`` and passed to GetPAAC / GetAPAAC /
            GetQSO.
        maxlag: converted with ``int()`` and passed to GetSOCN / GetQSO.
        destype: which descriptors to compute. One of ``"GetAAComp"``,
            ``"GetDPComp"``, ``"GetTPComp"``, ``"GetMoreauBrotoAuto"``,
            ``"GetMoranAuto"``, ``"GetGearyAuto"``, ``"GetCTD"``,
            ``"GetPAAC"``, ``"GetAPAAC"``, ``"GetSOCN"``, ``"GetQSO"``,
            ``"GetTriad"``, ``"All"`` (all of the former merged into one
            row) or ``"BinaryDescriptor"``.
        out_file: path of the tab-separated output file.

    Raises:
        ValueError: if ``destype`` is not one of the values listed above.
    """
    # Ordered (name, getter) pairs; the order is the merge order for "All".
    # Parameters are converted lazily so that arguments unused by the
    # selected descriptor type are never touched.
    getters = (
        ("GetAAComp", lambda p: p.GetAAComp()),
        ("GetDPComp", lambda p: p.GetDPComp()),
        ("GetTPComp", lambda p: p.GetTPComp()),
        ("GetMoreauBrotoAuto", lambda p: p.GetMoreauBrotoAuto()),
        ("GetMoranAuto", lambda p: p.GetMoranAuto()),
        ("GetGearyAuto", lambda p: p.GetGearyAuto()),
        ("GetCTD", lambda p: p.GetCTD()),
        ("GetPAAC",
         lambda p: p.GetPAAC(lamda=int(lamda), weight=float(weight))),
        ("GetAPAAC",
         lambda p: p.GetAPAAC(lamda=int(lamda), weight=float(weight))),
        ("GetSOCN", lambda p: p.GetSOCN(maxlag=int(maxlag))),
        ("GetQSO",
         lambda p: p.GetQSO(maxlag=int(maxlag), weight=float(weight))),
        ("GetTriad", lambda p: p.GetTriad()),
    )
    getter_by_name = dict(getters)

    # Fail early with a clear message instead of a NameError deep in the loop.
    if destype not in getter_by_name and destype not in ("All",
                                                         "BinaryDescriptor"):
        raise ValueError("Unknown descriptor type: {!r}".format(destype))

    # The context manager guarantees the input file is closed.
    with open(infile) as handle:
        stripped = [line.strip() for line in handle if ">" not in line]
    # Blank lines are not sequences; drop them.
    list_pep_name = [seq for seq in stripped if seq]

    if destype == 'BinaryDescriptor':
        # Computed once for the whole list instead of once per sequence.
        # With no sequences an empty table is written, as before.
        if list_pep_name:
            out_df = BinaryDescriptor(list_pep_name)
        else:
            out_df = pd.DataFrame()
    else:
        # Seeded with an empty frame so the single concat below yields the
        # same table as growing the frame row by row would.
        frames = [pd.DataFrame()]
        for seq in list_pep_name:
            protein = PyPro()
            protein.ReadProteinSequence(seq)

            if destype == "All":
                # Compute every group first, then print and merge in order.
                parts = [getter(protein) for _, getter in getters]
                DS = {}
                for D in parts:
                    # NOTE(review): looks like leftover debug output; kept
                    # so stdout is unchanged.
                    print(D)
                    DS.update(D)
            else:
                DS = getter_by_name[destype](protein)

            frames.append(pd.DataFrame(DS, index=[0]))

        out_df = pd.concat(frames, axis=0)

    out_df.to_csv(out_file, index=False, sep='\t')