def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()
    if model_conf['mode'] == 'train':
        traindata = next(traindata_list)
        tf.logging.info('Start training {}'.format(traindata))
        t0 = time.time()
        train1 = LR(traindata, mode='train').lr_model()
        t1 = time.time()
        tf.logging.info('Finish training {}, take {} mins'.format(
            traindata, float((t1 - t0) / 60)))
    else:
        testdata = next(testdata_list)
        tf.logging.info('Start evaluation {}'.format(testdata))
        t0 = time.time()
        Accuracy, AUC = LR(testdata, mode='pred').lr_model()
        t1 = time.time()
        tf.logging.info('Finish evaluation {}, take {} mins'.format(
            testdata, float((t1 - t0) / 60)))
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
def gen_analyzed_data():
    """Generate the data to be analyzed from the original pred data."""
    # schemas
    SCHEMA = Config().read_schema()  # dict id -> col_name, e.g. SCHEMA[1]='clk'
    del SCHEMA[1]
    header_str = [v for k, v in SCHEMA.iteritems()]
    header_int = [k for k, v in SCHEMA.iteritems()]
    col2id = {v: k for k, v in SCHEMA.iteritems()}
    feature_conf_dic = CONF.read_feature_conf()
    cross_feature_list = CONF.read_cross_feature_conf()
    # load data
    df = pd.read_table(FLAGS.pred_data + "/pred1", header=header_int)
    # reformat the table; keep only the analyzed columns
    keep_columns_str = get_analyzed_columns(feature_conf_dic)
    keep_columns_int = [col2id[v] for v in keep_columns_str]
    keep_columns_int.sort()
    # DataFrame columns start from 0, while our schema map starts from 2
    df_keep_columns_int = [col - 2 for col in keep_columns_int]
    analyzed_table = df.iloc[:, df_keep_columns_int]
    # save to csv
    analyzed_table.to_csv(FLAGS.analyzed_data,
                          header=[SCHEMA[k] for k in keep_columns_int],
                          index=False)
    print("Analyzed data generation finished.")
def __init__(self, data_file):
    self._conf = Config()
    self._data_file = data_file
    self._feature_conf_dic = self._conf.read_feature_conf()[0]
    self._feature_used = self._conf.read_feature_conf()[1]
    self._all_features = self._conf.read_schema_conf()
    self.model_conf = self._conf.read_model_conf()['model_conf']
    self._csv_defaults = self._column_to_csv_defaults()
def __init__(self, data_file, mode):
    self._conf = Config()
    self._data_file = data_file
    self._Tf_Data = TF_Data(self._data_file)
    self.dataset_train = self._Tf_Data.gbdt_input()
    self.lr_conf = self._conf.read_model_conf()['lr_conf']
    self._mode = mode
    self._gbdt_spr = GBDT_spr(self._data_file).gbdt_model(self._mode)
def __init__(self, data_file):
    self._data_file = data_file
    self._DataSet = DataSet(self._data_file)
    self._conf = Config()
    self.dataset = self._DataSet.input_fn()
    self.batch_dataset = self._DataSet.iter_minibatches()
    self._feature_colums = self._feature_colums()
    self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
    self.model_conf = self._conf.read_model_conf()['model_conf']
def __init__(self, data_file):
    self._data_file = data_file
    self._Tf_Data = TF_Data(self._data_file)
    self._conf = Config()
    self.dataset_train = self._Tf_Data.gbdt_input()
    self.dataset_trans = self._Tf_Data.gbdt_input()
    self.dataset_pred = self._Tf_Data.gbdt_input()
    self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
    self.model_conf = self._conf.read_model_conf()['model_conf']
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    if model_conf['mode'] == 'train':
        train1 = LR(model_conf['data_dir_train'], mode='train').lr_model()
    else:
        Accuracy, AUC = LR(model_conf['data_dir_pred'], mode='pred').lr_model()
        print("LR_Accuracy: %f" % Accuracy)
        print("LR_AUC: %f" % AUC)
def __init__(self):
    self._conf = Config()
    self._train_conf = self._conf.train
    self._cnn_conf = self._conf.model
    x_train, y_train, x_test, y_test, x_train_categ, x_test_categ, x_train_conti, x_test_conti, all_data \
        = preprocessing()
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    self.x_train_categ = x_train_categ  # categorical data in the training set
    self.x_test_categ = x_test_categ    # categorical data in the test set
    self.x_train_conti = x_train_conti  # continuous data in the training set
    self.x_test_conti = x_test_conti    # continuous data in the test set
    self.all_data = all_data
    # cross-product the categorical features
    self.poly = PolynomialFeatures(degree=2, interaction_only=True)
    self.x_train_categ_poly = self.poly.fit_transform(x_train_categ)
    self.x_test_categ_poly = self.poly.transform(x_test_categ)
    self.categ_inputs = None
    self.conti_input = None
    self.deep_component_outlayer = None
    self.logistic_input = None
    self.model = None
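# A small, self-contained example (synthetic data, not the project's) of what
# PolynomialFeatures(degree=2, interaction_only=True) produces for the wide part:
# the original columns plus a bias term and all pairwise products, i.e. the
# "cross product" features mentioned in the comment above.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

x = np.array([[1, 0, 1],
              [0, 1, 1]])
poly = PolynomialFeatures(degree=2, interaction_only=True)
print(poly.fit_transform(x))
# each row -> [1, x1, x2, x3, x1*x2, x1*x3, x2*x3]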
def gen_pred_csv():
    """Save the pred data as csv."""
    # schemas
    SCHEMA = Config().read_schema()  # dict id -> col_name, e.g. SCHEMA[1]='clk'
    del SCHEMA[1]
    # load data
    df = pd.read_table(FLAGS.pred_data + "/pred1")
    # save to csv
    df.to_csv("../data/pred/pred1.csv",
              header=[v for k, v in SCHEMA.iteritems()],
              index=False)
    print("Csv generation finished.")
def pred_input_fn(csv_data):
    """Prediction input fn for a single example, used by the serving client."""
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')
    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        else:
            if csv_default[f] == tf.string:
                feature_dict[f] = _bytes_feature(csv_data[idx])
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
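# A minimal sketch of the _bytes_feature / _float_feature helpers assumed above.
# They are not defined in this snippet; the versions below follow the common
# tf.train.Feature wrapper pattern and may differ from the project's own code.
def _bytes_feature(value):
    """Wrap a string (or list of strings) into a tf.train.Feature."""
    if not isinstance(value, (list, tuple)):
        value = [value]
    value = [v.encode() if isinstance(v, str) else v for v in value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def _float_feature(value):
    """Wrap a float (or list of floats) into a tf.train.Feature."""
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))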
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    traindata_list = FileListGenerator(model_conf['data_dir_train']).generate()
    testdata_list = FileListGenerator(model_conf['data_dir_pred']).generate()
    model = build_estimator()
    traindata = next(traindata_list)
    testdata = next(testdata_list)
    t0 = time.time()
    tf.logging.info('Start training {}'.format(traindata))
    model.train(input_fn=lambda: input_fn(traindata, 'train'),
                hooks=None,
                steps=None,
                max_steps=None,
                saving_listeners=None)
    t1 = time.time()
    tf.logging.info('Finish training {}, take {} mins'.format(
        traindata, float((t1 - t0) / 60)))
    tf.logging.info('Start evaluating {}'.format(testdata))
    t2 = time.time()
    results = model.evaluate(
        input_fn=lambda: input_fn(testdata, 'eval'),
        steps=None,  # Number of steps for which to evaluate the model.
        hooks=None,
        checkpoint_path=None,  # The latest checkpoint in model_dir is used.
        name=None)
    t3 = time.time()
    tf.logging.info('Finish evaluation {}, take {} mins'.format(
        testdata, float((t3 - t2) / 60)))
    # Display evaluation metrics
    for key in sorted(results):
        print('{}: {}'.format(key, results[key]))
def pred_input_fn(csv_data):
    """Prediction input fn for a single example, used by the serving client."""
    conf = Config()
    # feature = conf.read_schema_conf().values()
    # feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()[1]
    csv_default = TF_Data('/home/zhangqifan/data/part_0.csv')._column_to_csv_defaults()
    csv_default.pop('label')
    print(csv_default)
    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        print(f)
        print(type(csv_default[f]))
        if f in feature_conf:
            if csv_default[f] == ['']:
                print('yes')
                feature_dict[f] = _bytes_feature(csv_data[idx])
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
def __init__(self, data_file):
    # Check the file exists; turn it into a list so that data_file can be either a file or a directory.
    assert tf.gfile.Exists(data_file), (
        'data file: {} not found. Please check input data path'.format(data_file))
    if tf.gfile.IsDirectory(data_file):
        data_file_list = [f for f in tf.gfile.ListDirectory(data_file) if not f.startswith('.')]
        data_file = [data_file + '/' + file_name for file_name in data_file_list]
    self._data_file = data_file
    self._conf = Config()
    self._train_conf = self._conf.train
    self._dist_conf = self._conf.distribution
    self._shuffle_buffer_size = self._train_conf["num_examples"]
    self._num_parallel_calls = self._train_conf["num_parallel_calls"]
    self._train_epochs = self._train_conf["train_epochs"]
def main():
    CONFIG = Config()
    model_conf = CONFIG.read_model_conf()['model_conf']
    model = build_estimator()
    predictions = model.predict(
        input_fn=lambda: input_fn('/home/leadtek/zhangqifan/reflux_user_pro/data/pred_data/all_data.csv', 'pred'),
        predict_keys=None,
        hooks=None,
        checkpoint_path=None)  # None defaults to the latest checkpoint in model_dir
    res = []
    for pred_dict in predictions:  # dict{probabilities, classes, class_ids}
        opt = []
        class_id = pred_dict['class_ids'][0]
        opt.append(class_id)
        probability = pred_dict['probabilities']
        opt.append(probability[1])
        res.append(opt)
        # print('class_id:', class_id, 'probability:', probability)
    res_df = pd.DataFrame(res, columns=['class_id', 'probability'])
    x = res_df[res_df['class_id'].isin([1])]
    sample = pd.read_csv("/home/leadtek/zhangqifan/reflux_user_pro/data/opt_all_data.csv", sep=' ')
    res_sample = pd.concat([sample, res_df], axis=1)
    res_sample.to_csv(r"/home/leadtek/zhangqifan/reflux_user_pro/res.csv",
                      header=True, index=False, sep=' ')
def wenqi_pred_input_fn(csv_data):
    """Prediction input fn for a single example, used by the serving client."""
    conf = Config()
    feature = conf.get_feature_name()
    feature_unused = conf.get_feature_name('unused')
    feature_conf = conf.read_feature_conf()
    csv_default = column_to_dtype(feature, feature_conf)
    csv_default.pop('label')
    feature_dict = {}
    for idx, f in enumerate(csv_default.keys()):
        if f in feature_unused:
            continue
        else:
            # print(csv_default[f])
            if csv_default[f] == tf.string:
                # repeat the value FLAGS.num_tests times so every test request uses the same input
                csv_data_list = [csv_data[idx] for i in range(FLAGS.num_tests)]
                feature_dict[f] = _bytes_feature(csv_data_list)
            elif csv_default[f] == tf.int32 or csv_default[f] == tf.int64:
                feature_dict[f] = _int_feature(int(csv_data[idx]))
            else:
                feature_dict[f] = _float_feature(float(csv_data[idx]))
    return feature_dict
def gen_sample_csv():
    """Generate sample csv that contains both hashed and one-hot-encoded features."""
    # schemas
    SCHEMA = Config().read_schema()  # dict id -> col_name, e.g. SCHEMA[1]='clk'
    del SCHEMA[1]
    # load data
    df = pd.read_csv("../data/pred/pred1.csv")
    # save to csv
    sample_col = [
        "request_id", "account_id", "adplan_id", "os", "client_type", "hour"
    ]
    sample_table = df.loc[:, sample_col]
    sample_table.to_csv("../data/sample/sample.csv",
                        header=sample_col,
                        index=False)
    print("Csv generation finished.")
class LR(object):
    '''LR class: LR model training and prediction.'''

    def __init__(self, data_file, mode):
        self._conf = Config()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']
        self._data_file = data_file
        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file)

    def lr_model(self):
        '''LR model training and prediction.
        :return: accuracy and AUC score in 'pred' mode
        '''
        if self._mode == 'train':
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = LogisticRegression(penalty=self.lr_conf['penalty'],
                                        solver=self.lr_conf['solver'],
                                        C=float(self.lr_conf['c']))
            grd_lm.fit(gbdt_features, y_label)
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))
        else:
            gbdt_features, y_label = self._gbdt_spr.gbdt_model(self._mode)
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))
            y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
            pred_res = grd_lm.predict(gbdt_features)
            accuracy_score = metrics.accuracy_score(y_label, pred_res)
            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(y_label, y_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)
            AUC_Score = metrics.roc_auc_score(y_label, y_pred_grd_lm)
            return accuracy_score, AUC_Score
def main(unused_argv):
    CONFIG = Config()
    print("Using TensorFlow Version %s" % tf.__version__)
    assert "1.4" <= tf.__version__, "Need TensorFlow r1.4 or Later."
    print('\nModel Type: {}'.format(FLAGS.model_type))
    model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type)
    print('\nModel Directory: {}'.format(model_dir))
    print("\nUsing Train Config:")
    for k, v in CONFIG.train.items():
        print('{}: {}'.format(k, v))
    print("\nUsing Model Config:")
    for k, v in CONFIG.model.items():
        print('{}: {}'.format(k, v))
    if not FLAGS.keep_train:
        # Clean up the model directory if not continuing training
        shutil.rmtree(model_dir, ignore_errors=True)
        print('Remove model directory: {}'.format(model_dir))
    model = build_custom_estimator(model_dir, FLAGS.model_type)
    tf.logging.info('Build estimator: {}'.format(model))
    train_and_eval_api(model)
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import os
import sys
import time

import tensorflow as tf

from lib.read_conf import Config
from lib.dataset import input_fn
from lib.build_estimator import build_estimator
from lib.utils.util import elapse_time

CONFIG = Config().train
parser = argparse.ArgumentParser(description='Evaluate Wide and Deep Model.')
parser.add_argument('--model_dir', type=str, default=CONFIG["model_dir"],
                    help='Model checkpoint dir for evaluating.')
parser.add_argument('--model_type', type=str, default=CONFIG["model_type"],
                    help="Valid model types: {'wide', 'deep', 'wide_deep'}.")
parser.add_argument('--test_data', type=str, default=CONFIG["test_data"],
from lib.read_conf import Config
from lib.utils.model_util import activation_fn
from lib.joint import WideAndDeepClassifier

# wide_deep columns
categorical_column_with_identity = tf.feature_column.categorical_column_with_identity
categorical_column_with_hash_bucket = tf.feature_column.categorical_column_with_hash_bucket
categorical_column_with_vocabulary_list = tf.feature_column.categorical_column_with_vocabulary_list
crossed_column = tf.feature_column.crossed_column
bucketized_column = tf.feature_column.bucketized_column
# deep columns
embedding_column = tf.feature_column.embedding_column
indicator_column = tf.feature_column.indicator_column
numeric_column = tf.feature_column.numeric_column

CONF = Config()
if CONF.train['pos_sample_loss_weight'] is None and CONF.train['neg_sample_loss_weight'] is None:
    weight_column = None
else:
    weight_column = 'weight_column'


def _build_model_columns():
    """
    Build wide and deep feature columns from the custom feature conf using the tf.feature_column API.
    wide_columns: category features + cross_features + [discretized continuous features]
    deep_columns: continuous features + category features (one-hot or embedding for sparse features) + [cross_features (embedding)]
    Return:
        _CategoricalColumn and _DenseColumn instances from the tf.feature_column API
    """
from absl import logging
import tensorflow as tf
import numpy as np
import os
import sys

PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.read_conf import Config
from lib.dataset import input_fn
from lib.build_estimator import build_custom_estimator, build_estimator

TEST_CSV = os.path.join(os.path.dirname(PACKAGE_DIR), 'data/test/test2')
USED_FEATURE_KEY = Config().get_feature_name('used')


def _read_test_input(all_lines=False):
    if all_lines:
        return open(TEST_CSV).readlines()
    else:
        return open(TEST_CSV).readline()


TEST_INPUT_VALUES = _read_test_input()
TEST_INPUT_KEYS = Config().get_feature_name()
TEST_INPUT = dict(
    zip(TEST_INPUT_KEYS, TEST_INPUT_VALUES.strip().split("\t")[1:]))
for key in TEST_INPUT:
    TEST_INPUT[key] = TEST_INPUT[key].split(',')
https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators
"""
from __future__ import print_function

import os
import sys

import tensorflow as tf

PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.build_estimator import _build_model_columns, build_custom_estimator
from lib.read_conf import Config

model_base_dir = Config().train['model_dir']
CONF = Config().serving['SavedModel']

tf.app.flags.DEFINE_string('model_type', CONF['model_type'],
                           """Model type to export""")
tf.app.flags.DEFINE_string(
    'checkpoint_path', CONF['checkpoint_path'],
    """Directory to read training checkpoints. If None, use latest.""")
tf.app.flags.DEFINE_string('export_dir', CONF['model_dir'],
                           """Directory to export inference model.""")
tf.app.flags.DEFINE_integer('model_version', CONF['model_version'],
                            'version number of the model.')
FLAGS = tf.app.flags.FLAGS


def main(_):
class GBDT_spr(object):
    '''GBDT_spr class: trains the GBDT model and generates discretized features.'''

    def __init__(self, data_file):
        self._data_file = data_file
        self._DataSet = DataSet(self._data_file)
        self._conf = Config()
        self.dataset = self._DataSet.input_fn()
        self.batch_dataset = self._DataSet.iter_minibatches()
        self._feature_colums = self._feature_colums()
        self.gbdt_conf = self._conf.read_model_conf()['gbdt_conf']
        self.model_conf = self._conf.read_model_conf()['model_conf']

    def _feature_colums(self):
        '''Build the feature columns.
        :return: gbdt_colums, type: list
        '''
        gbdt_colums = []
        feature_conf_dic = self._conf.read_feature_conf()[0]
        for feature, conf in feature_conf_dic.items():
            f_type, f_tran = conf["type"], conf["transform"]
            if f_type == 'category':
                if f_tran == 'multivalue':
                    opt = (feature, multivalue())
                    gbdt_colums.append(opt)
                if f_tran == 'one_hot':
                    opt = (feature, one_hot())
                    gbdt_colums.append(opt)
            else:
                opt = ([feature], min_max())
                gbdt_colums.append(opt)
        return gbdt_colums

    def gbdt_model(self, mode):
        '''Train the GBDT model and generate discretized features.
        :param mode: 'train' or 'pred'
        :return: lr_feat: discretized features generated by the GBDT
                 y: labels of the corresponding data
        '''
        mapper = DataFrameMapper(self._feature_colums, sparse=True)
        if mode == 'train':
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = GradientBoostingClassifier(
                n_estimators=int(self.gbdt_conf['n_estimators']),
                # random_state=int(self.gbdt_conf['random_state']),
                learning_rate=float(self.gbdt_conf['learning_rate']),
                # subsample=float(self.gbdt_conf['subsample']),
                min_samples_leaf=int(self.gbdt_conf['min_samples_leaf']),
                max_depth=int(self.gbdt_conf['max_depth']),
                max_leaf_nodes=int(self.gbdt_conf['max_leaf_nodes']),
                min_samples_split=int(self.gbdt_conf['min_samples_split']))
            if self.model_conf['batch_size'] == '0':
                grd.fit(X, y)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
                new_feature = grd.apply(X)
                new_feature = new_feature.reshape(
                    -1, int(self.gbdt_conf['n_estimators']))
                enc = OneHotEncoder()
                enc.fit(new_feature)
                lr_feat = np.array(enc.transform(new_feature).toarray())
            else:
                for i, dataset in enumerate(self.batch_dataset):
                    # print(dataset)
                    batch_X = mapper.fit_transform(dataset)
                    batch_y = list(dataset['label'])
                    grd.fit(batch_X, batch_y)
                    new_feature = grd.apply(batch_X)
                    new_feature = new_feature.reshape(
                        -1, int(self.gbdt_conf['n_estimators']))
                    enc = OneHotEncoder()
                    enc.fit(new_feature)
                    new_feature2 = np.array(
                        enc.transform(new_feature).toarray())
                    print(new_feature2)
                    if i == 0:
                        lr_feat = new_feature2
                    else:
                        lr_feat = np.concatenate([lr_feat, new_feature2], axis=0)
                joblib.dump(grd, os.path.join(MODEL_DIR, "gbdt_model.m"))
        else:
            X = mapper.fit_transform(self.dataset)
            y = list(self.dataset['label'])
            grd = joblib.load(os.path.join(MODEL_DIR, "gbdt_model.m"))
            new_feature = grd.apply(X)
            new_feature = new_feature.reshape(
                -1, int(self.gbdt_conf['n_estimators']))
            enc = OneHotEncoder()
            enc.fit(new_feature)
            lr_feat = np.array(enc.transform(new_feature).toarray())
        return lr_feat, y
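# A minimal, self-contained illustration of the leaf-index trick used in
# gbdt_model() above: grd.apply(X) returns, for each sample, the index of the
# leaf it falls into in every tree, and one-hot encoding those indices yields
# the sparse features fed to the LR. The synthetic data below is only for
# illustration and is not part of the project.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X = np.random.rand(100, 5)                 # 100 samples, 5 numeric features
y = np.random.randint(0, 2, size=100)      # binary labels
gbdt = GradientBoostingClassifier(n_estimators=10, max_depth=3).fit(X, y)
leaf_idx = gbdt.apply(X).reshape(-1, 10)   # (100, 10): one leaf index per tree
enc = OneHotEncoder().fit(leaf_idx)
lr_feat = enc.transform(leaf_idx)          # sparse one-hot features for the LR
print(lr_feat.shape)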
# from tensorflow.python.ops import partitioned_variables
# from tensorflow.python.ops import state_ops
# from tensorflow.python.ops import variable_scope
# from tensorflow.python.summary import summary
# from tensorflow.python.training import sync_replicas_optimizer
# from tensorflow.python.training import training_util

# The default learning rates are a historical artifact of the initial implementation.
# _DNN_LEARNING_RATE = 0.001  # 0.05
# _LINEAR_LEARNING_RATE = 0.005
# _CNN_LEARNING_RATE = 0.001

# Weight decay learning rate implementation:
# decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
CONF = Config().model
_linear_init_learning_rate = CONF['linear_initial_learning_rate'] or 0.005
_dnn_init_learning_rate = CONF['dnn_initial_learning_rate'] or 0.001
_cnn_init_learning_rate = CONF['cnn_initial_learning_rate'] or 0.001
_linear_decay_rate = CONF['linear_decay_rate'] or 1
_dnn_decay_rate = CONF['dnn_decay_rate'] or 1
_cnn_decay_rate = CONF['cnn_decay_rate'] or 1
_batch_size = Config().train['batch_size']
_num_examples = Config().train['num_examples']
decay_steps = _num_examples / _batch_size


def _wide_deep_combined_model_fn(
        features, labels, mode, head, model_type,
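# A minimal sketch (not the project's actual optimizer wiring) of how the decay
# formula in the comment above maps onto tf.train.exponential_decay; only the
# config-derived names from this file are reused, everything else is illustrative.
global_step = tf.train.get_or_create_global_step()
linear_learning_rate = tf.train.exponential_decay(
    learning_rate=_linear_init_learning_rate,  # starting rate from the config
    global_step=global_step,
    decay_steps=decay_steps,                   # roughly one epoch: num_examples / batch_size
    decay_rate=_linear_decay_rate,             # a rate of 1 means no decay
    staircase=False)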
from tensorflow.python.ops import init_ops
from tensorflow.python.layers import core as core_layers
from tensorflow.python.layers import normalization
from tensorflow.python.ops.losses import losses
from tensorflow.python.keras.engine import training
# from tensorflow.keras.regularizers import l1, l2, l1_l2
import os
import sys

PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.read_conf import Config
from lib.utils.model_util import add_layer_summary, _get_optimizer_instance, _get_activation_fn

CONF = Config().model
ACTIVATION_FN = _get_activation_fn(CONF['dnn_activation_function'])
DROPOUT = CONF['dnn_dropout']
BATCH_NORM = CONF['dnn_batch_normalization']
DNN_L1 = CONF['dnn_l1']
DNN_L2 = CONF['dnn_l2']

regularizer_list = []
if DNN_L1:
    regularizer_list.append(tf.contrib.layers.l1_regularizer(DNN_L1))
if DNN_L2:
    regularizer_list.append(tf.contrib.layers.l2_regularizer(DNN_L2))
if len(regularizer_list) == 0:
    REG = None
else:
    REG = tf.contrib.layers.sum_regularizer(regularizer_list)
def build_model_columns():
    def embedding_dim(dim):
        """empirical embedding dim"""
        return int(np.power(2, np.ceil(np.log(dim ** 0.5))))

    wide_columns = []
    wide_dim = 0
    deep_columns = []
    deep_dim = 0
    normalizer_scaler = 'min_max'
    _feature_conf_dic = Config().read_feature_conf()[0]
    for feature, conf in _feature_conf_dic.items():
        f_type, f_tran, f_param, is_deep = conf["type"], conf["transform"], conf["parameter"], conf["is_deep"]
        if feature == 'tag' or feature == 'main_actor':
            col = tf.feature_column.categorical_column_with_vocabulary_file(
                feature, vocabulary_file=f_param)
            wide_columns.append(col)
            wide_dim += int(conf["dim"])
            if is_deep:
                embed_dim = 20
                deep_columns.append(
                    tf.feature_column.embedding_column(
                        col,
                        dimension=embed_dim,
                        combiner='mean',
                        initializer=None,
                        ckpt_to_load_from=None,
                        tensor_name_in_ckpt=None,
                        max_norm=None,
                        trainable=True))
                deep_dim += embed_dim
        else:
            if f_type == 'category':
                if f_tran == 'hash_bucket':
                    hash_bucket_size = int(f_param)
                    col = tf.feature_column.categorical_column_with_hash_bucket(
                        feature,
                        hash_bucket_size=hash_bucket_size,
                        dtype=tf.string)
                    wide_columns.append(col)
                    wide_dim += hash_bucket_size
                    if is_deep:
                        embed_dim = embedding_dim(hash_bucket_size)
                        deep_columns.append(
                            tf.feature_column.embedding_column(
                                col,
                                dimension=embed_dim,
                                combiner='mean',
                                initializer=None,
                                ckpt_to_load_from=None,
                                tensor_name_in_ckpt=None,
                                max_norm=None,
                                trainable=True))
                        deep_dim += embed_dim
                elif f_tran == 'vocab':
                    col = tf.feature_column.categorical_column_with_vocabulary_list(
                        feature,
                        vocabulary_list=list(map(str, f_param)),
                        dtype=None,
                        default_value=-1,
                        num_oov_buckets=0)
                    wide_columns.append(col)
                    wide_dim += len(f_param)
                    if is_deep:
                        deep_columns.append(tf.feature_column.indicator_column(col))
                        deep_dim += len(f_param)
                elif f_tran == 'identity':
                    num_buckets = f_param
                    col = tf.feature_column.categorical_column_with_identity(
                        feature, num_buckets=num_buckets, default_value=0)
                    wide_columns.append(col)
                    wide_dim += num_buckets
                    if is_deep:
                        deep_columns.append(tf.feature_column.indicator_column(col))
                        deep_dim += num_buckets
            else:
                normalization_params = []
                normalization_params.append(int(f_param[0]))
                normalization_params.append(int(f_param[2]))
                normalizer_fn = normalizer_fn_builder(
                    normalizer_scaler, tuple(normalization_params))
                col = tf.feature_column.numeric_column(
                    feature,
                    shape=(1,),
                    default_value=0,
                    dtype=tf.float32,
                    normalizer_fn=normalizer_fn)
                wide_columns.append(col)
                wide_dim += 1
                if is_deep:
                    deep_columns.append(col)
                    deep_dim += 1

    # for cross_features, hash_bucket_size, is_deep in cross_feature_list:
    #     cf_list = []
    #     for f in cross_features:
    #         f_type = feature_conf_dic[f]["type"]
    #         f_tran = feature_conf_dic[f]["transform"]
    #         f_param = feature_conf_dic[f]["parameter"]
    #         if f_tran == 'identity':
    #             cf_list.append(tf.feature_column.categorical_column_with_identity(
    #                 f, num_buckets=f_param, default_value=0))
    #         else:
    #             cf_list.append(f)
    #     col = tf.feature_column.crossed_column(cf_list, int(hash_bucket_size))
    #     wide_columns.append(col)
    #     wide_dim += int(hash_bucket_size)
    #     if is_deep:
    #         deep_columns.append(tf.feature_column.embedding_column(
    #             col, dimension=embedding_dim(int(hash_bucket_size))))
    #         deep_dim += embedding_dim(int(hash_bucket_size))

    tf.logging.info('Build total {} wide columns'.format(len(wide_columns)))
    for col in wide_columns:
        tf.logging.debug('Wide columns: {}'.format(col))
    tf.logging.info('Wide input dimension is: {}'.format(wide_dim))
    tf.logging.info('Build total {} deep columns'.format(len(deep_columns)))
    for col in deep_columns:
        tf.logging.debug('Deep columns: {}'.format(col))
    tf.logging.info('Deep input dimension is: {}'.format(deep_dim))
    return wide_columns, deep_columns
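# A quick standalone check of the empirical embedding_dim rule used above. Note
# that it takes the natural log of sqrt(dim) and rounds the exponent up, so the
# result is always an integer power of 2; the sample sizes below are illustrative.
import numpy as np

def embedding_dim(dim):
    return int(np.power(2, np.ceil(np.log(dim ** 0.5))))

for dim in (100, 1000, 10000, 100000):
    print(dim, '->', embedding_dim(dim))
# 100 -> 8, 1000 -> 16, 10000 -> 32, 100000 -> 64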
class LR(object):
    '''LR class: LR model training and prediction.'''

    def __init__(self, data_file, mode):
        self._conf = Config()
        self._data_file = data_file
        self._Tf_Data = TF_Data(self._data_file)
        self.dataset_train = self._Tf_Data.gbdt_input()
        self.lr_conf = self._conf.read_model_conf()['lr_conf']
        self._mode = mode
        self._gbdt_spr = GBDT_spr(self._data_file).gbdt_model(self._mode)

    def lr_model(self):
        '''LR model training and prediction.
        :return: accuracy and AUC score in 'pred' mode
        '''
        if self._mode == 'train':
            grd_lm = SGDClassifier(penalty=self.lr_conf['penalty'],
                                   loss='log',
                                   warm_start=True)
            i = 0
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    batch_X = dataset[0]
                    batch_y = dataset[1]
                    print('start training LR epochs_%d' % i)
                    grd_lm = grd_lm.partial_fit(batch_X, batch_y, classes=[0, 1])
                    i += 1
                    del (dataset)
                    del (batch_y)
                    del (batch_X)
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            joblib.dump(grd_lm, os.path.join(MODEL_DIR, "lr_model.m"))
        else:
            y_all_label = []
            y_all_pred_grd_lm = []
            pred_all_res = []
            grd_lm = joblib.load(os.path.join(MODEL_DIR, "lr_model.m"))
            while True:
                try:
                    dataset = next(self._gbdt_spr)
                    gbdt_features = dataset[0]
                    y_label = dataset[1]
                    y_pred_grd_lm = grd_lm.predict_proba(gbdt_features)[:, 1]
                    pred_res = grd_lm.predict(gbdt_features)
                    y_all_label.extend(y_label)
                    y_all_pred_grd_lm.extend(y_pred_grd_lm)
                    pred_all_res.extend(pred_res)
                    del (dataset)
                    del (gbdt_features)
                    gc.collect()
                except StopIteration as e:
                    print('Generator return value:', e.value)
                    break
            accuracy_score = metrics.accuracy_score(y_all_label, pred_all_res)
            fpr_grd_lm, tpr_grd_lm, _ = metrics.roc_curve(y_all_label, y_all_pred_grd_lm)
            roc_auc = metrics.auc(fpr_grd_lm, tpr_grd_lm)
            AUC_Score = metrics.roc_auc_score(y_all_label, y_all_pred_grd_lm)
            return accuracy_score, AUC_Score
            fractions={
                '0': keep_prob,
                '1': 1
            },
            seed=0).values()
    print('down sampling finished.')
    print(data.first())
    if os.path.exists(outpath):
        shutil.rmtree(outpath)
    data.map(lambda x: "\t".join(x)).saveAsTextFile(outpath)
    sc.stop()
    ss.stop()


if __name__ == '__main__':
    CONF = Config().read_data_process_conf()
    SCHEMA = Config().read_schema()
    feature_index_list = CONF['category_feature_index_list']
    keep_prob = CONF['downsampling_keep_ratio']
    conf = SparkConf().setAppName('wide_deep'). \
        set('spark.executor.memory', '10g').set('spark.driver.memory', '10g').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    ss = SparkSession.builder.getOrCreate()
    inpath = '/Users/lapis-hong/Documents/NetEase/wide_deep/data/train'
    outpath = '/Users/lapis-hong/Documents/NetEase/wide_deep/data/spark'
    # if len(sys.argv) < 3:
    #     exit('Missing arguments: \nUsage: $ python data_process_local_test.py $inpath $outpath')
    if len(sys.argv) == 3:
        inpath = sys.argv[1]
        outpath = sys.argv[2]
    local_data_preprocess2(inpath, outpath)
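# A hedged sketch of the negative-downsampling call whose tail appears above:
# key each record by its label, then keep all positives and a keep_prob fraction
# of the negatives with RDD.sampleByKey. The names and the label position are
# illustrative assumptions, not the project's exact code.
sampled = (data
           .map(lambda fields: (fields[0], fields))  # assume fields[0] is the label ('0'/'1')
           .sampleByKey(withReplacement=False,
                        fractions={'0': keep_prob, '1': 1.0},
                        seed=0)
           .values())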
from tensorflow.python.estimator.canned import head as head_lib
import os
import sys

PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PACKAGE_DIR)

from lib.read_conf import Config
from lib.linear import linear_logit_fn_builder
from lib.dnn import multidnn_logit_fn_builder
from lib.utils.model_util import add_layer_summary, check_no_sync_replicas_optimizer, activation_fn, get_optimizer_instance

# Weight decay learning rate implementation:
# decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
CONF = Config().model
_linear_init_learning_rate = CONF['linear_initial_learning_rate'] or 0.005
_dnn_init_learning_rate = CONF['dnn_initial_learning_rate'] or 0.001
_linear_decay_rate = CONF['linear_decay_rate'] or 1
_dnn_decay_rate = CONF['dnn_decay_rate'] or 1
_batch_size = Config().train['batch_size']
_num_examples = Config().train['num_examples']
decay_steps = _num_examples / _batch_size

_feature_sequence = Config().get_feature_name('sequence')  # sequence features


def _wide_deep_combined_model_fn(features, labels, mode,
def main(unused_argv):
    CONFIG = Config()
    print("Using TensorFlow Version %s" % tf.__version__)
    # assert "1.4" <= tf.__version__, "Need TensorFlow r1.4 or Later."
    print('\nModel Type: {}'.format(FLAGS.model_type))
    model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type)
    print('\nModel Directory: {}'.format(model_dir))
    print("\nUsing Train Config:")
    for k, v in CONFIG.train.items():
        print('{}: {}'.format(k, v))
    print("\nUsing Model Config:")
    for k, v in CONFIG.model.items():
        print('{}: {}'.format(k, v))
    if not FLAGS.keep_train:
        # Clean up the model directory if not continuing training
        shutil.rmtree(model_dir, ignore_errors=True)
        print('Remove model directory: {}'.format(model_dir))
    # model = build_estimator(model_dir, FLAGS.model_type)
    model = build_custom_estimator(model_dir, FLAGS.model_type)
    tf.logging.info('Build estimator: {}'.format(model))
    if CONFIG.train['dynamic_train']:
        train_fn = dynamic_train
        print("Using dynamic train mode.")
    else:
        train_fn = train_and_eval
    if CONFIG.distribution["is_distribution"]:
        print("Using PID: {}".format(os.getpid()))
        cluster = CONFIG.distribution["cluster"]
        job_name = CONFIG.distribution["job_name"]
        task_index = CONFIG.distribution["task_index"]
        print("Using Distributed TensorFlow. Local host: {} Job_name: {} Task_index: {}"
              .format(cluster[job_name][task_index], job_name, task_index))
        cluster = tf.train.ClusterSpec(CONFIG.distribution["cluster"])
        server = tf.train.Server(cluster,
                                 job_name=job_name,
                                 task_index=task_index)
        # Distributed mode does not include eval.
        train_fn = train
        if job_name == 'ps':
            # wait for incoming connections forever
            server.join()
            # sess = tf.Session(server.target)
            # queue = create_done_queue(task_index, num_workers)
            # for i in range(num_workers):
            #     sess.run(queue.dequeue())
            #     print("ps {} received worker {} done".format(task_index, i))
            # print("ps {} quitting".format(task_index))
        else:
            # TODO: supervisor & MonitoredTrainingSession & experiment (deprecated)
            train_fn(model)
            # train_and_eval(model)
            # Each worker only needs to contact the PS task(s) and the local worker task.
            # config = tf.ConfigProto(device_filters=[
            #     '/job:ps', '/job:worker/task:%d' % arguments.task_index])
            # with tf.device(tf.train.replica_device_setter(
            #         worker_device="/job:worker/task:%d" % task_index,
            #         cluster=cluster)):
            #     e = _create_experiment_fn()
            #     e.train_and_evaluate()  # call estimator's train() and evaluate() method
            # hooks = [tf.train.StopAtStepHook(last_step=10000)]
            # with tf.train.MonitoredTrainingSession(
            #         master=server.target,
            #         is_chief=(task_index == 0),
            #         checkpoint_dir=args.model_dir,
            #         hooks=hooks) as mon_sess:
            #     while not mon_sess.should_stop():
            #         # mon_sess.run()
            #         classifier.fit(input_fn=train_input_fn, steps=1)
    else:
        # local run
        train_fn(model)