forked from LegrandT/TreeGraphEmbedding
/
Paper_RF_embeddings_04_04.py
336 lines (272 loc) · 11.9 KB
/
Paper_RF_embeddings_04_04.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
from sklearn import datasets
import pandas as pd
import numpy as np
# Load the raw modelling table and the province lookup table.
# NOTE(review): the data folder is a hard-coded local Windows path; point it
# at the directory that holds the two CSV files before running.
dataFolder='C:\\dev\\data\\interestProfile\\'
province=pd.read_csv(dataFolder+'provincie.csv',header=0,sep=';',encoding= 'ISO-8859-1')
dataForModelling18062018=pd.read_csv(dataFolder+'dataForModelling18062018.csv',header=0,encoding= 'ISO-8859-1')
df=dataForModelling18062018
#%%
# Keep only rows with status 'ACTIEF' and newsletter flag 'Y'; both columns
# are constant afterwards, so they are dropped.  Reassigning (instead of
# drop(..., inplace=True) on a filtered frame) avoids SettingWithCopy issues.
is_active_subscriber = (df['CD_STATUS_MCD'] == 'ACTIEF') & (df['NEWSLETTER'] == 'Y')
df = df.loc[is_active_subscriber].drop(columns=['CD_STATUS_MCD', 'NEWSLETTER'])
#wdat.dtypes
# Attach the province to every row by joining CD_PLAATS against PC; the other
# lookup columns are not needed.
province = province.drop(columns=['pc2', 'PLAATS', 'GEMEENTE'])
df = pd.merge(df, province, left_on='CD_PLAATS', right_on='PC', how='left')
df.iloc[1,:]  # interactive inspection only
df = df.drop(columns=['CD_PLAATS', 'PC'])
# Check the number of missing values in each column (interactive inspection).
df.isna().sum()
# Rows without a match in the lookup get the explicit level 'Onbekend'.
# Assigning the result back is required: a chained
# `df.PROVINCIE.fillna(..., inplace=True)` does not update `df` when pandas
# Copy-on-Write is active.
df['PROVINCIE'] = df['PROVINCIE'].fillna('Onbekend')
df['PROVINCIE'].value_counts()
df.dtypes
# Strip surrounding whitespace from every object (string) column.
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Remove rare levels from the categorical variables.  `~isin(...)` keeps
# missing values, exactly like the chained `!=` comparisons it replaces.
df['SECTOR_MMS_BIGGEST_JOB'].value_counts(dropna=False)
df = df[~df['SECTOR_MMS_BIGGEST_JOB'].isin(['Energie', 'Onbekend', 'NONE'])]
df['TYPE_DLN_BIGGEST_JOB'].value_counts(dropna=False)
df = df[df['TYPE_DLN_BIGGEST_JOB'] != 'FPU']
df['OMS_SMLVRM'].value_counts(dropna=False)
df = df[df['OMS_SMLVRM'] != 'Onbekend']
df['SECTOR_MMS_LATEST_JOB'].value_counts(dropna=False)
df = df[df['SECTOR_MMS_LATEST_JOB'] != 'NONE']
#%%
# Split the frame into rows with a Target value ("clicked") and rows without.
df['Target'].value_counts(dropna=False)
df_clicked = df[df.Target.notnull()]
df_non_clicked = df[df.Target.isnull()]
# One-hot encode Target: one indicator column per distinct value, appended to
# the clicked rows; the original Target column is then removed.
target_enc = pd.get_dummies(df_clicked['Target'])
df_clicked = df_clicked.join(target_enc).drop(columns='Target')
#import random
#random.seed(123)
# Draw a reproducible sample (without replacement) used for modelling.
# NOTE(review): sample() raises if df_clicked has fewer than n rows.
n=100000
df_ml=df_clicked.sample(n,replace=False,random_state=123)
#%%
from sklearn.model_selection import train_test_split
from sklearn.datasets import *
from sklearn.tree import DecisionTreeClassifier
# import labelencoder
from sklearn.preprocessing import LabelEncoder, StandardScaler# instantiate labelencoder object
from sklearn.ensemble import RandomTreesEmbedding, RandomForestClassifier
# Features are the first 17 columns of the sampled frame.  Take an explicit
# copy: the encoded columns are assigned back into X below, and writing into
# a slice of df_ml would raise SettingWithCopyWarning (and could leak the
# encoding into df_ml on pandas versions without Copy-on-Write).
X = df_ml.iloc[:, 0:17].copy()
# NOTE(review): column 21 is presumably one of the Target indicator columns
# created by get_dummies earlier - confirm against the actual column layout.
y = df_ml.iloc[:, 21]
# Label encoding: find the object-typed (categorical) columns ...
categorical_feature_mask = X.dtypes == object
categorical_cols = X.columns[categorical_feature_mask].tolist()
le = LabelEncoder()
# ... and replace each of them by integer codes.  The single encoder is
# re-fitted for every column, so it only remembers the last column's classes.
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))
# NOTE(review): after label encoding no object columns should remain, so this
# get_dummies call presumably leaves X unchanged - confirm whether one-hot
# encoding was intended instead of label encoding.
X = pd.get_dummies(X, prefix_sep='_', drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Standardise with statistics estimated on the training split only.
scl = StandardScaler()
X_train = scl.fit_transform(X_train)
X_test = scl.transform(X_test)
estimator_list = []
# Shallow forest whose trees are later converted to graphs and embedded.
estimators = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=0)
estimators.fit(X_train, y_train)
# Class balance of the test split, as a reference for the accuracies below.
print(y_test.value_counts()/y_test.shape[0])
#%%build a graph of each decision tree
import networkx as nx
import matplotlib.pyplot as plt
from node2vec import Node2Vec
def create_graph_list(estimators_trees):
    """Build one undirected graph per decision tree of a fitted ensemble.

    Every tree node becomes a graph node labelled with the string form of its
    scikit-learn node id, and every parent/child link becomes an edge.

    Parameters
    ----------
    estimators_trees :
        Fitted ensemble exposing ``estimators_``; each element must expose
        ``tree_.children_left`` and ``tree_.children_right``.

    Returns
    -------
    list
        One ``networkx.Graph`` per element of ``estimators_``, in the same
        order.
    """
    G_list = []
    for estimator in estimators_trees.estimators_:
        children_left = estimator.tree_.children_left
        children_right = estimator.tree_.children_right
        # A fresh graph per tree: node ids restart at 0 in every tree, so
        # sharing one graph would merge unrelated nodes of different trees.
        G = nx.Graph()
        # Depth-first walk over (node id, parent id); the root has no parent.
        stack = [(0, None)]
        while stack:
            node_id, parent_id = stack.pop()
            G.add_node(str(node_id))
            if parent_id is not None:
                # Skipped for the root so that it does not get a self-loop.
                G.add_edge(str(parent_id), str(node_id))
            # Differing child ids mark a split node; on a leaf both are equal.
            if children_left[node_id] != children_right[node_id]:
                stack.append((children_left[node_id], node_id))
                stack.append((children_right[node_id], node_id))
        G_list.append(G)
    return G_list
#%%Build the node2Vec model for all decision trees of the RF
# One graph per decision tree of the fitted forest.
G_list = create_graph_list(estimators)
#nx.draw_networkx(G_list[0])
'''for G in G_list:#print all graphs regarding the number of n_estimators
nx.draw(G,with_labels = True)
plt.show()'''
# Set up a node2vec walker for every tree graph ...
node2vec_list = [
    Node2Vec(tree_graph, dimensions=3, walk_length=3, num_walks=10)
    for tree_graph in G_list
]
# ... and learn the node embeddings of each tree from its walker.
models_list = [
    walker.fit(window=10, min_count=1)
    for walker in node2vec_list
]
# Sanity check: embedding of node '10' in the first two trees.  The trees
# differ in size and structure, so the same node id maps to different vectors.
# NOTE(review): indexing the fitted model directly with a node key depends on
# the installed gensim version - confirm it is supported.
model_tree0 = models_list[0]['10']
model_tree1 = models_list[1]['10']
#%%
# Every decision tree of the forest has its own node graph and its own
# node2vec model.  Passing a sample through a tree yields the id of the leaf
# (exit node) it ends up in; looking that id up in the tree's node2vec model
# gives the embedding of the sample under that tree.  Doing this for all trees
# and placing the per-tree embeddings side by side gives one row per sample
# with (number of trees * embedding dimensions) columns.
def _leaf_embeddings(estimator, model, samples):
    """Return the embedding of each sample's exit leaf in one tree, as rows."""
    return np.array([model[str(leaf_id)] for leaf_id in estimator.apply(samples)])


# Collect one embedding block per tree and concatenate once at the end.  This
# replaces the incremental `if reps == []` / hstack pattern, whose comparison
# of an ndarray with an empty list fails to broadcast on recent NumPy.
train_blocks = []
test_blocks = []
for estimator, model in zip(estimators.estimators_, models_list):
    train_blocks.append(_leaf_embeddings(estimator, model, X_train))
    test_blocks.append(_leaf_embeddings(estimator, model, X_test))
reps = np.hstack(train_blocks)  # embeddings of all training samples
reps_test = np.hstack(test_blocks)  # embeddings of all test samples
print(np.shape(reps))
#%%
from sklearn.decomposition import PCA
# Embedding-only feature matrices for the train and test split.
data = np.array(reps)
data_test = np.array(reps_test)
# Original (scaled) features followed by the embedding columns.
X_train2 = np.concatenate((X_train, data), axis=1)
X_test2 = np.concatenate((X_test, data_test), axis=1)
# Fit a full PCA on the combined training features and plot the cumulative
# explained variance to help pick the number of components.
pca = PCA().fit(X_train2)
print(pca.explained_variance_ratio_)
plt.plot(pca.explained_variance_ratio_.cumsum())
#choose the number of components by the percentage of the variance
#pcaw=PCA(0.4).fit(X_train2)#explained 90% of the variance =13 , 80%=11
#pcaw.n_components_
#%%
'''
pca = PCA(n_components=18)
pca.fit(X_train2)
print(pca.explained_variance_ratio_)
data_p = pca.transform(X_train2)
data_t = pca.transform(X_test2)
pcs = ['pc' + str(i+1) for i in range(pca.n_components_)]
df_train = pd.DataFrame(data_p,columns = pcs)
df_train['target'] = y_train.values
df_train['target'].value_counts(dropna=False)
df_test = pd.DataFrame(data_t,columns = pcs)
df_test['target'] = y_test.values
import seaborn as sns
sns.pairplot(df_test, hue='target')
'''
#plt.plot(data_p[:,0],data_p[:,1],'*b')
#plt.plot(data_t[:,0],data_t[:,1],'*r')
#%%
def performance(y_test,prediction_t):
    """Print confusion matrix, classification report and accuracy, in that order.

    y_test holds the true labels and prediction_t the predicted labels; both
    are passed unchanged to the scikit-learn metric functions.
    """
    for metric in (confusion_matrix, classification_report, accuracy_score):
        print(metric(y_test, prediction_t))
# print(y_test.value_counts()/y_test.shape[0])
#%%
#X_train: original features
#data: embeddings
#X_train2: concatenation = original features + embeddings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
def ml_algorithms(data,y_train,data_test,y_test):
    """Fit five classifiers on one feature set and report their test results.

    The models are, in order: 3-nearest-neighbours, a depth-10 decision tree,
    a 200-tree random forest, logistic regression on the given features, and
    logistic regression on degree-2 polynomial features.  For each model the
    confusion matrix, classification report and accuracy are printed through
    ``performance``; finally the list of the five accuracies is printed.

    Parameters
    ----------
    data, y_train :
        Training features and labels.
    data_test, y_test :
        Test features and labels.

    Returns
    -------
    tuple
        ``(prediction_tp, prediction_tpol_prob)``: the ``predict_proba``
        output on the test set of the plain logistic regression and of the
        polynomial logistic regression.
    """
    rlts = []  # test accuracies, in the order the models are run

    print('-----KNN-----------')
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(data, y_train)
    prediction_t = neigh.predict(data_test)
    performance(y_test, prediction_t)
    rlts.append(accuracy_score(y_test, prediction_t))

    print('-----DecisionTreeClassifier-----------')
    dec = DecisionTreeClassifier(max_depth=10, random_state=0)
    dec.fit(data, y_train)
    prediction_t_ds = dec.predict(data_test)
    performance(y_test, prediction_t_ds)
    rlts.append(accuracy_score(y_test, prediction_t_ds))

    print('-----RF-----------')
    rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=0)
    rf.fit(data, y_train)
    prediction_t_rf = rf.predict(data_test)
    performance(y_test, prediction_t_rf)
    # Score the forest's own predictions (previously the KNN predictions were
    # scored here, so the reported RF accuracy was the KNN accuracy).
    rlts.append(accuracy_score(y_test, prediction_t_rf))

    print('-----LG+Linear-----------')
    clf = LogisticRegression()
    clf.fit(data, y_train)
    prediction_t_lg = clf.predict(data_test)
    prediction_tp = clf.predict_proba(data_test)
    performance(y_test, prediction_t_lg)
    rlts.append(accuracy_score(y_test, prediction_t_lg))

    print('-----LG+Ploynomial-----------')
    # Expand to degree-2 polynomial features; the expansion is fitted on the
    # training data and re-applied unchanged to the test data.
    plf = PolynomialFeatures(2)
    datap = plf.fit_transform(data)
    datap_test = plf.transform(data_test)
    clfp = LogisticRegression()
    clfp.fit(datap, y_train)
    prediction_tpol = clfp.predict(datap_test)
    prediction_tpol_prob = clfp.predict_proba(datap_test)
    performance(y_test, prediction_tpol)
    rlts.append(accuracy_score(y_test, prediction_tpol))

    print(rlts)
    return prediction_tp,prediction_tpol_prob
#%%
# Baseline: all classifiers on the original features only.  Only the
# probabilities of the plain logistic regression are kept for the ROC plot.
prediction_tn,non_need=ml_algorithms(X_train,y_train,X_test,y_test)
# Class balance of the test split, as a reference point for the accuracies.
print(y_test.value_counts()/y_test.shape[0])
#data: embeddings
#ml_algorithms(data,y_train,data_test,y_test)
# Same classifiers on the original features concatenated with the embeddings.
prediction_tp,prediction_tpol_prob=ml_algorithms(X_train2,y_train,X_test2,y_test)
#%%
from sklearn.metrics import roc_curve, auc
# Binary 0/1 ground truth, computed once and shared by all three curves.
y_true = (y_test == True).astype(int)
# Column 1 of predict_proba is the probability of the second class.
fpr_pol, tpr_pol, _ = roc_curve(y_true, prediction_tpol_prob[:, 1])
fpr, tpr, _ = roc_curve(y_true, prediction_tp[:, 1])
fprn, tprn, _ = roc_curve(y_true, prediction_tn[:, 1])
print('AUC for Node2Vec Logistic + Poly features + Normal Features : ', auc(fpr_pol, tpr_pol))
print('AUC for Node2Vec Logistic + Linear Features + Normal Features : ', auc(fpr, tpr))
print('AUC for Normal Features Logistic : ', auc(fprn, tprn))
# Start a new figure so the ROC curves are not drawn on top of an earlier
# plot (e.g. the PCA explained-variance curve) that is still current.
plt.figure()
plt.plot(fpr_pol, tpr_pol, 'g', label='Node2Vec Logistic + Poly features + Normal Features')
plt.plot(fpr, tpr, 'r', label='Node2Vec Logistic + Linear Features + Normal Features')
plt.plot(fprn, tprn, 'b', label='Normal Features Logistic')
plt.legend()
#%%