Example #1
def _load_data():
    # NOTE: the top of this function, which builds df_train / df_test, is cut
    # off in this excerpt; it resumes here with feature-column selection.
    cols = [c for c in df_train.columns if c not in ["id", "target"]]
    cols = [c for c in cols if c not in config.IGNORE_COLS]

    X_train = df_train[cols].values
    X_test = df_test[cols].values
    y_train = df_train["target"].values
    ids_test = df_test["id"].values
    cat_features_indices = [
        i for i, c in enumerate(cols) if c in config.CATEGORICAL_COLS
    ]

    return df_train, df_test, X_train, y_train, X_test, ids_test, cat_features_indices


if __name__ == '__main__':
    log("start to load data...")
    df_train, df_test, X_train, y_train, X_test, ids_test, cat_features_indices = _load_data()

    # folds
    log("split folds")
    folds = list(
        StratifiedKFold(n_splits=config.NUM_SPLITS,
                        shuffle=True,
                        random_state=config.RANDOM_SEED).split(
                            X_train, y_train))
    fd = FeatureDictionary(df_train=df_train,
                           df_test=df_test,
                           numeric_cols=config.NUMERIC_COLS,
                           ignored_cols=config.IGNORE_COLS)
    log("parse data...")
Example #2
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

from example.data_reader import DataParser
from example.data_reader import FeatureDictionary
from example import config
from example.log import log
from load_data import load_ctr_data


if __name__ == '__main__':
    log("start to load data...")
    df_train, df_test, X_train, y_train, X_test, ids_test, cat_features_indices = load_ctr_data()

    # folds
    log("split folds")
    folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
                                 random_state=config.RANDOM_SEED).split(X_train, y_train))
    # 3. Random Forest Classifier
    seed = 43
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(random_state=seed, n_estimators=100)

    # 6. Decision Tree Classifier
    from sklearn.tree import DecisionTreeClassifier
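
    # A minimal evaluation sketch (added, not in the original example):
    # score both classifiers on the StratifiedKFold splits with log loss.
    # `dt` is a hypothetical instantiation of the imported DecisionTreeClassifier.
    from sklearn.metrics import log_loss
    dt = DecisionTreeClassifier(random_state=seed)
    for name, clf in [("rf", rf), ("dt", dt)]:
        fold_losses = []
        for train_idx, valid_idx in folds:
            clf.fit(X_train[train_idx], y_train[train_idx])
            proba = clf.predict_proba(X_train[valid_idx])[:, 1]
            fold_losses.append(log_loss(y_train[valid_idx], proba))
        log("%s mean log loss: %.5f" % (name, np.mean(fold_losses)))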
Example #3
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.feature_index = tf.placeholder(tf.int32,
                                                shape=[None, None],
                                                name="feature_index")
            self.feature_value = tf.placeholder(tf.float32,
                                                shape=[None, None],
                                                name="feature_value")
            self.label = tf.placeholder(tf.float32,
                                        shape=[None, 1],
                                        name="label")
            self.weights = self._initialize_weights()

            self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                    shape=[None],
                                                    name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            # 1. embedding layer
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["embedding_tensor"], self.feature_index)  #
            feature_value = tf.reshape(self.feature_value,
                                       shape=[-1, self.field_dim, 1])
            self.embeddings = tf.multiply(self.embeddings,
                                          feature_value)  # M * F * K

            # 2. deep network
            self.y_deep = tf.reshape(
                self.embeddings,
                shape=[-1, self.field_dim * self.embedding_dim])
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])

            for i, layer_wide in enumerate(self.dnn_wides):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(
                        self.y_deep,
                        train_phase=self.train_phase,
                        scope_bn="bn_%d" % i)
                self.y_deep = self.dnn_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep,
                                            self.dropout_keep_deep[1 + i])
            # 3. cross network
            input_size = self.field_dim * self.embedding_dim
            self.y_cross_i = tf.reshape(self.embeddings,
                                        shape=[-1, 1, input_size])
            self.y_cross = tf.reshape(self.embeddings, shape=[-1, input_size])
            self.y_cross_0 = tf.reshape(self.embeddings,
                                        shape=[-1, 1, input_size])
            for i in range(len(self.cross_wides)):
                x0T_x_x1 = tf.reshape(tf.matmul(self.y_cross_0,
                                                self.y_cross_i,
                                                transpose_a=True),
                                      shape=[-1, input_size])
                self.y_cross_i = tf.add(
                    tf.reshape(tf.matmul(x0T_x_x1,
                                         self.weights["cross_layer_%d" % i]),
                               shape=[-1, 1, input_size]), self.y_cross_i)
                self.y_cross_i = tf.add(self.y_cross_i,
                                        self.weights["cross_bias_%d" % i])
                self.y_cross = tf.concat([
                    self.y_cross,
                    tf.reshape(self.y_cross_i, shape=[-1, input_size])
                ],
                                         axis=1)

            # 4. concatenate y_deep and y_cross
            log("concatenating y_deep and y_cross")
            if self.use_deep and self.use_cross:
                concat_input = tf.concat([self.y_cross, self.y_deep], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            elif self.use_cross:
                concat_input = self.y_cross
            self.out = tf.add(
                tf.matmul(concat_input, self.weights["concat_projection"]),
                self.weights["concat_bias"])

            # 5. loss
            log("form loss")
            self.out = tf.nn.sigmoid(self.out)
            self.loss = tf.losses.log_loss(self.label, self.out)
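            # Added note: applying sigmoid first and then tf.losses.log_loss is
            # numerically less stable than feeding raw logits to
            # tf.nn.sigmoid_cross_entropy_with_logits, as the variant below does.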

            # 6. regularization
            log("regularization")
            if self.l2_reg > 0.0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                for i in range(len(self.dnn_wides)):
                    self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                        self.weights["layer_%d" % i])

            # 7. optimizer
            log("choose optimizer")
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8).minimize(self.loss)

            # 8. init
            log("run init...")
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)

            self.feature_index = tf.placeholder(tf.int32,
                                                shape=[None, None],
                                                name="feature_index")
            self.feature_value = tf.placeholder(tf.float32,
                                                shape=[None, None],
                                                name="feature_value")
            self.label = tf.placeholder(tf.float32,
                                        shape=[None, 1],
                                        name="label")
            self.weights = self._initialize_weights()

            self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                    shape=[None],
                                                    name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            # 1. embedding layer
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["embedding_tensor"], self.feature_index)  #
            feature_value = tf.reshape(self.feature_value,
                                       shape=[-1, self.field_dim, 1])
            self.embeddings = tf.multiply(self.embeddings,
                                          feature_value)  # M * F * K

            # 2. deep network
            self.y_deep = tf.reshape(
                self.embeddings,
                shape=[-1, self.field_dim * self.embedding_dim])
            #self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])

            for i, layer_wide in enumerate(self.dnn_wides):
                print("in deep %s" % i)
                print(self.y_deep.shape.as_list())
                print(self.weights["layer_%d" % i].shape.as_list())
                print(self.weights["bias_%d" % i].shape.as_list())
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])
                # if self.batch_norm:
                #     self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" % i)
                self.y_deep = self.dnn_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep,
                                            self.dropout_keep_deep[1 + i])
            # 3. cross network
            input_size = self.field_dim * self.embedding_dim
            self.y_cross = tf.reshape(self.embeddings,
                                      shape=[-1, 1, input_size])
            self.y_cross_0 = tf.reshape(self.embeddings,
                                        shape=[-1, 1, input_size])
            for i in range(len(self.cross_wides)):
                print("in cross %d" % i)
                print(self.y_cross_0.shape.as_list())
                x0T_x_x1 = tf.reshape(tf.matmul(self.y_cross_0,
                                                self.y_cross,
                                                transpose_a=True),
                                      shape=[-1, input_size])
                print(x0T_x_x1.shape.as_list())
                print(self.weights["cross_layer_%d" % i].shape.as_list())
                self.y_cross = tf.add(
                    tf.reshape(tf.matmul(x0T_x_x1,
                                         self.weights["cross_layer_%d" % i]),
                               shape=[-1, 1, input_size]), self.y_cross)
                self.y_cross = tf.add(self.y_cross,
                                      self.weights["cross_bias_%d" % i])
                print("+++", self.y_cross.shape.as_list())
            self.y_cross = tf.reshape(self.y_cross,
                                      shape=[-1, self.cross_wides[0]])

            # 4. concatenate y_deep and y_cross
            log("concatenating y_deep and y_cross")
            concat_input = tf.concat([self.y_cross, self.y_deep], axis=1)
            self.out = tf.add(
                tf.matmul(concat_input, self.weights["concat_projection"]),
                self.weights["concat_bias"])

            # 5. loss
            log("form loss")
            self.out1 = tf.nn.sigmoid(self.out)
            #self.loss = tf.losses.log_loss(self.label, self.out1)
            self.losses = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.label, logits=self.out)
            self.loss = tf.reduce_mean(self.losses)
            self.loss_summary = tf.summary.scalar("loss", self.loss)

            # 6. regularization
            log("regularization")
            if self.l2_reg > 0.0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                for i in range(len(self.dnn_wides)):
                    self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                        self.weights["layer_%d" % i])

            # 7. optimizer
            log("choose optimizer")
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate,
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8).minimize(self.loss)

            # 8. init
            log("run init...")
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.writer = tf.summary.FileWriter("logs/a3", self.sess.graph)
            self.sess.run(init)

            self.saver = tf.train.Saver(max_to_keep=2)  # replaces the Saver created above
            # model_path = os.path.abspath('.') + "\model2\my-model1-99"
            # print(model_path)
            # model_dict = '/'.join(model_path.split('/')[:-1])
            # ckpt = tf.train.get_checkpoint_state(model_dict)
            # self.saver.restore(self.sess, model_path)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)