    cols = [c for c in df_train.columns if c not in ["id", "target"]]
    cols = [c for c in cols if c not in config.IGNORE_COLS]

    X_train = df_train[cols].values
    X_test = df_test[cols].values
    y_train = df_train["target"].values
    ids_test = df_test["id"].values
    cat_features_indices = [i for i, c in enumerate(cols)
                            if c in config.CATEGORICAL_COLS]

    return df_train, df_test, X_train, y_train, X_test, ids_test, cat_features_indices


if __name__ == '__main__':
    log("start to load data...")
    df_train, df_test, X_train, y_train, X_test, ids_test, cat_features_indices = _load_data()

    # folds
    log("split folds")
    folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS,
                                 shuffle=True,
                                 random_state=config.RANDOM_SEED).split(X_train, y_train))

    fd = FeatureDictionary(df_train=df_train,
                           df_test=df_test,
                           numeric_cols=config.NUMERIC_COLS,
                           ignored_cols=config.IGNORE_COLS)
    log("parse data...")
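    # The parse step itself is not shown in this excerpt. A minimal sketch of
    # the hand-off, assuming FeatureDictionary holds the fitted column/value ->
    # index mapping and DataParser follows the common feat_index/feat_value
    # convention; the parse() signature below is an assumption:
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train_list = data_parser.parse(df=df_train, has_label=True)
    Xi_test, Xv_test, ids_list = data_parser.parse(df=df_test, has_label=False)
    # Xi: one global feature index per field per row; Xv: 1.0 for one-hot
    # categorical fields and the raw value for numeric fields. These feed the
    # feature_index / feature_value placeholders in the graph below.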
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from example.data_reader import DataParser
from example.data_reader import FeatureDictionary
from example import config
from example.log import log
from load_data import *

if __name__ == '__main__':
    log("start to load data...")
    df_train, df_test, X_train, y_train, X_test, ids_test, cat_features_indices = load_ctr_data()

    # folds
    log("split folds")
    folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS,
                                 shuffle=True,
                                 random_state=config.RANDOM_SEED).split(X_train, y_train))

    # 3. Random Forest classifier
    seed = 43
    rf = RandomForestClassifier(random_state=seed, n_estimators=100)

    # 6. Decision Tree classifier
    dt = DecisionTreeClassifier(random_state=seed)
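    # Neither baseline is fit above. A hedged sketch of scoring both on the
    # same stratified folds; ROC AUC as the metric is an assumption (config
    # does not name one in this excerpt):
    from sklearn.base import clone
    from sklearn.metrics import roc_auc_score

    for clf in (rf, dt):
        aucs = []
        for train_idx, valid_idx in folds:
            model = clone(clf)  # fresh copy per fold, same hyperparameters
            model.fit(X_train[train_idx], y_train[train_idx])
            pred = model.predict_proba(X_train[valid_idx])[:, 1]
            aucs.append(roc_auc_score(y_train[valid_idx], pred))
        log("%s mean AUC: %.4f" % (clf.__class__.__name__, np.mean(aucs)))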
def _init_graph(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.feature_index = tf.placeholder(tf.int32, shape=[None, None],
                                            name="feature_index")
        self.feature_value = tf.placeholder(tf.float32, shape=[None, None],
                                            name="feature_value")
        self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")
        self.weights = self._initialize_weights()
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        # 1. embedding layer: one K-dim vector per field, scaled by the
        # raw feature value
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["embedding_tensor"], self.feature_index)
        feature_value = tf.reshape(self.feature_value,
                                   shape=[-1, self.field_dim, 1])
        self.embeddings = tf.multiply(self.embeddings, feature_value)  # M * F * K

        # 2. deep network
        self.y_deep = tf.reshape(
            self.embeddings, shape=[-1, self.field_dim * self.embedding_dim])
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i, layer_wide in enumerate(self.dnn_wides):
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])
            if self.batch_norm:
                self.y_deep = self.batch_norm_layer(
                    self.y_deep, train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)
            self.y_deep = self.dnn_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(self.y_deep,
                                        self.dropout_keep_deep[1 + i])

        # 3. cross network: x_{l+1} = x_0 * x_l^T * w_l + b_l + x_l
        input_size = self.field_dim * self.embedding_dim
        self.y_cross_i = tf.reshape(self.embeddings, shape=[-1, 1, input_size])
        self.y_cross = tf.reshape(self.embeddings, shape=[-1, input_size])
        self.y_cross_0 = tf.reshape(self.embeddings, shape=[-1, 1, input_size])
        for i in range(len(self.cross_wides)):
            x0T_x_x1 = tf.reshape(
                tf.matmul(self.y_cross_0, self.y_cross_i, transpose_a=True),
                shape=[-1, input_size])
            self.y_cross_i = tf.add(
                tf.reshape(tf.matmul(x0T_x_x1,
                                     self.weights["cross_layer_%d" % i]),
                           shape=[-1, 1, input_size]),
                self.y_cross_i)
            self.y_cross_i = tf.add(self.y_cross_i,
                                    self.weights["cross_bias_%d" % i])
            self.y_cross = tf.concat(
                [self.y_cross,
                 tf.reshape(self.y_cross_i, shape=[-1, input_size])],
                axis=1)

        # 4. concatenate y_deep and y_cross
        log("concatenating y_deep and y_cross")
        if self.use_deep and self.use_cross:
            concat_input = tf.concat([self.y_cross, self.y_deep], axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        elif self.use_cross:
            concat_input = self.y_cross
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # 5. loss
        log("form loss")
        self.out = tf.nn.sigmoid(self.out)
        self.loss = tf.losses.log_loss(self.label, self.out)

        # 6. regularization
        log("regularization")
        if self.l2_reg > 0.0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            for i in range(len(self.dnn_wides)):
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["layer_%d" % i])

        # 7. optimizer
        log("choose optimizer")
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
            epsilon=1e-8).minimize(self.loss)
        # 8. init
        log("run init...")
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.sess.run(init)

        # number of params
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
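# A hedged usage sketch for the graph above. The class name DCN and its
# fit()/predict() methods are assumptions -- only _init_graph() is shown in
# this excerpt; the keys mirror the attributes the graph reads:
dcn_params = {
    "feature_dim": 256,                    # total feature-index space (from FeatureDictionary)
    "field_dim": 39,                       # number of input fields; dataset-specific
    "embedding_dim": 8,
    "dnn_wides": [32, 32],                 # deep tower layer widths
    "cross_wides": [1, 1, 1],              # three cross layers
    "dropout_keep_deep": [1.0, 1.0, 1.0],  # input + one keep-prob per deep layer
    "batch_norm": True,
    "use_deep": True,
    "use_cross": True,
    "l2_reg": 0.0001,
    "learning_rate": 0.001,
    "random_seed": config.RANDOM_SEED,
    "verbose": 1,
}
# model = DCN(**dcn_params)
# model.fit(Xi_train, Xv_train, y_train)
# y_pred = model.predict(Xi_test, Xv_test)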
def _init_graph(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)

        self.feature_index = tf.placeholder(tf.int32, shape=[None, None],
                                            name="feature_index")
        self.feature_value = tf.placeholder(tf.float32, shape=[None, None],
                                            name="feature_value")
        self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")
        self.weights = self._initialize_weights()
        self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        # 1. embedding layer
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["embedding_tensor"], self.feature_index)
        feature_value = tf.reshape(self.feature_value,
                                   shape=[-1, self.field_dim, 1])
        self.embeddings = tf.multiply(self.embeddings, feature_value)  # M * F * K

        # 2. deep network
        self.y_deep = tf.reshape(
            self.embeddings, shape=[-1, self.field_dim * self.embedding_dim])
        # self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i, layer_wide in enumerate(self.dnn_wides):
            print("in deep %s" % i)
            print(self.y_deep.shape.as_list())
            print(self.weights["layer_%d" % i].shape.as_list())
            print(self.weights["bias_%d" % i].shape.as_list())
            self.y_deep = tf.add(
                tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                self.weights["bias_%d" % i])
            # if self.batch_norm:
            #     self.y_deep = self.batch_norm_layer(
            #         self.y_deep, train_phase=self.train_phase,
            #         scope_bn="bn_%d" % i)
            self.y_deep = self.dnn_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(self.y_deep,
                                        self.dropout_keep_deep[1 + i])

        # 3. cross network
        input_size = self.field_dim * self.embedding_dim
        self.y_cross = tf.reshape(self.embeddings, shape=[-1, 1, input_size])
        self.y_cross_0 = tf.reshape(self.embeddings, shape=[-1, 1, input_size])
        for i in range(len(self.cross_wides)):
            print("in cross %d" % i)
            print(self.y_cross_0.shape.as_list())
            x0T_x_x1 = tf.reshape(
                tf.matmul(self.y_cross_0, self.y_cross, transpose_a=True),
                shape=[-1, input_size])
            print(x0T_x_x1.shape.as_list())
            print(self.weights["cross_layer_%d" % i].shape.as_list())
            self.y_cross = tf.add(
                tf.reshape(tf.matmul(x0T_x_x1,
                                     self.weights["cross_layer_%d" % i]),
                           shape=[-1, 1, input_size]),
                self.y_cross)
            self.y_cross = tf.add(self.y_cross,
                                  self.weights["cross_bias_%d" % i])
            print("+++", self.y_cross.shape.as_list())
        self.y_cross = tf.reshape(self.y_cross,
                                  shape=[-1, self.cross_wides[0]])

        # 4. concatenate y_deep and y_cross
        log("concatenating y_deep and y_cross")
        concat_input = tf.concat([self.y_cross, self.y_deep], axis=1)
        self.out = tf.add(
            tf.matmul(concat_input, self.weights["concat_projection"]),
            self.weights["concat_bias"])

        # 5. loss: numerically stable cross-entropy on the raw logits;
        # self.out1 keeps the sigmoid probabilities for prediction
        log("form loss")
        self.out1 = tf.nn.sigmoid(self.out)
        # self.loss = tf.losses.log_loss(self.label, self.out1)
        self.losses = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.label, logits=self.out)
        self.loss = tf.reduce_mean(self.losses)
        self.loss_summary = tf.summary.scalar("loss", self.loss)

        # 6. regularization
        log("regularization")
        if self.l2_reg > 0.0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            for i in range(len(self.dnn_wides)):
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["layer_%d" % i])
        # 7. optimizer
        log("choose optimizer")
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
            epsilon=1e-8).minimize(self.loss)

        # 8. init
        log("run init...")
        init = tf.global_variables_initializer()
        self.sess = self._init_session()
        self.writer = tf.summary.FileWriter("logs/a3", self.sess.graph)
        self.sess.run(init)
        self.saver = tf.train.Saver(max_to_keep=2)
        # optionally restore from a checkpoint instead of initializing:
        # model_path = os.path.abspath('.') + "\model2\my-model1-99"
        # print(model_path)
        # model_dict = '/'.join(model_path.split('/')[:-1])
        # ckpt = tf.train.get_checkpoint_state(model_dict)
        # self.saver.restore(self.sess, model_path)

        # number of params
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
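# _initialize_weights() is referenced by both graph variants but not shown in
# this excerpt. A minimal standalone sketch of the shapes it must produce,
# inferred from the matmuls above; the function name and the initializers are
# assumptions -- only the shapes are forced by the graph:
import numpy as np
import tensorflow as tf

def initialize_weights_sketch(feature_dim, field_dim, embedding_dim,
                              dnn_wides, num_cross_layers):
    input_size = field_dim * embedding_dim
    weights = {}

    # embedding table: one embedding_dim vector per global feature index
    weights["embedding_tensor"] = tf.Variable(
        tf.random_normal([feature_dim, embedding_dim], 0.0, 0.01))

    # deep tower: dense layers over the flattened embeddings
    in_dim = input_size
    for i, out_dim in enumerate(dnn_wides):
        glorot = np.sqrt(2.0 / (in_dim + out_dim))
        weights["layer_%d" % i] = tf.Variable(
            tf.random_normal([in_dim, out_dim], stddev=glorot))
        weights["bias_%d" % i] = tf.Variable(tf.zeros([1, out_dim]))
        in_dim = out_dim

    # cross tower: tf.matmul(x0T_x_x1, cross_layer) maps [batch*d, d] to
    # [batch*d, 1], so each cross weight is a d-vector stored as [d, 1];
    # the bias broadcasts over the [batch, 1, d] cross state
    for i in range(num_cross_layers):
        weights["cross_layer_%d" % i] = tf.Variable(
            tf.random_normal([input_size, 1], stddev=0.01))
        weights["cross_bias_%d" % i] = tf.Variable(
            tf.zeros([1, 1, input_size]))

    # final projection over concat([y_cross, y_deep]) down to a single logit
    # (width matches the second variant, where y_cross stays input_size wide)
    concat_dim = input_size + dnn_wides[-1]
    weights["concat_projection"] = tf.Variable(
        tf.random_normal([concat_dim, 1], stddev=0.01))
    weights["concat_bias"] = tf.Variable(tf.constant(0.01))
    return weights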