def write_table(self, user_topk_item, user_topk_score):
    # Write (user, top-k item, top-k score) rows to the output odps table.
    writer = tf.TableRecordWriter(self.output_path)
    write_to_table = writer.write(
        range(self.NUM_OF_TABLE_COLUMNS),
        [tf.as_string(self.user_batch), user_topk_item, user_topk_score])
    return write_to_table
def get_outputs(self, data):
    FLAGS = self.flag
    outfield_json = {
        "features": [{
            "feature_name": "item_id",
            "feature_type": "id_feature",
            "value_type": "String",
            "expression": "item:item_id"
        }, {
            "feature_name": "featureparser_trace_critic_score",
            "feature_type": "id_feature",
            "value_type": "Float",
            "expression": "item:featureparser_trace_critic_score"
        }]
    }

    # feature preprocessing
    with tf.name_scope("featureparser_feature_input"):
        fg_configs = get_json_conf_from_file(FLAGS.feature_conf['input'])
        featureparser_features = featureparser_fg.parse_genreated_fg(
            fg_configs, data['features'])
        featureparser_features = format_feature_offline(
            featureparser_features, FLAGS.feature_conf['input'])

    # labels
    label_configs = get_json_conf_from_file(FLAGS.feature_conf['label'])
    label_dict = featureparser_fg.parse_genreated_fg(label_configs,
                                                     data['label'])
    label_dict = format_feature_offline(label_dict,
                                        FLAGS.feature_conf['label'])

    # output fields
    outfields = featureparser_fg.parse_genreated_fg(outfield_json,
                                                    data['features'])
    outfields = format_feature_offline(outfields, outfield_json)

    # repeat each search_id / rn once per item in the top-10 list so every
    # output row carries its request context
    list_size = 10
    search_id = tf.expand_dims(data['search_id'], 1)
    search_ids = tf.reshape(tf.tile(search_id, [1, list_size]), [-1, 1])
    rn = tf.expand_dims(data['rn'], 1)
    rns = tf.reshape(tf.tile(rn, [1, list_size]), [-1, 1])
    item_id = tf.reshape(outfields["item_id"], [-1, 1])

    net0_dict = self.get_net0_dict(featureparser_features)
    self.build_network(net0_dict)

    print("predict result table: {}".format(
        FLAGS.data['critic_result_table']))
    writer = tf.TableRecordWriter(FLAGS.data['critic_result_table'],
                                  slice_id=FLAGS.task_index)
    write_op = writer.write([0, 1, 2, 3, 4, 5, 6, 7], [
        search_ids, rns, item_id, self.logits, self.predict_score,
        label_dict['pay'], outfields['featureparser_trace_critic_score'],
        label_dict['click']
    ])
    close_writer_op = writer.close()
    return write_op, close_writer_op
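# Hedged usage sketch, not from the original repo: how the (write_op,
# close_writer_op) pair returned by get_outputs() is typically driven on PAI.
# `model` and `data` are assumptions standing in for an instance of this class
# and one batch of parsed inputs; the op returned by write_table() above is
# run the same way, minus the explicit close op.
def _example_drive_table_writer(model, data):
    write_op, close_writer_op = model.get_outputs(data)
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(),
                  tf.local_variables_initializer()])
        try:
            # one write per batch until the input pipeline is exhausted
            while True:
                sess.run(write_op)
        except tf.errors.OutOfRangeError:
            pass
        finally:
            sess.run(close_writer_op)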
def _build_graph(self, serving_mode):
    # random seed
    tf.set_random_seed(self.random_seed)

    # model config
    self.dropout_keep_fm = tf.placeholder(tf.float32,
                                          shape=[None],
                                          name="dropout_keep_fm")
    self.dropout_keep_deep = tf.placeholder(tf.float32,
                                            shape=[None],
                                            name="dropout_keep_deep")
    self.train_phase = tf.placeholder(tf.bool, name="train_phase")

    # init variables
    self.weights = self._initialize_weights()

    # embeddings, masked and scaled by the raw feature values
    self.embeddings = tf.nn.embedding_lookup(
        self.weights["feature_embeddings"], self.feat_index)
    self.mask = tf.transpose(tf.cast(self.mask, tf.float32), [0, 2, 1])
    self.embeddings = tf.multiply(self.embeddings, self.mask)  # None * F * K
    feat_value = tf.reshape(self.feat_value,
                            shape=[-1, self.feature_size, 1])
    self.embeddings = tf.multiply(self.embeddings, feat_value)

    # bias
    self.bias = tf.nn.embedding_lookup(self.weights["feature_bias"],
                                       self.feat_index)  # None * F * 1
    self.bias = tf.multiply(self.bias, self.mask)
    self.bias = tf.multiply(self.bias, feat_value)  # None * F * 1

    # multi-tag features
    if self.use_multi_tags:
        for multi_tags_feature_val in self.multi_tags:
            multi_tags_embeddings, multi_tags_bias = self._multi_tags_embedding(
                multi_tags_feature_val, self.multi_tags_max_size,
                self.multi_tags_embedding_size)
            self.embeddings = tf.concat(
                [self.embeddings, multi_tags_embeddings], axis=1)
            self.bias = tf.concat([self.bias, multi_tags_bias], axis=1)

    # first order term
    self.y_first_order = tf.reduce_sum(self.bias, 2)  # None * F
    self.y_first_order = tf.nn.dropout(self.y_first_order,
                                       self.dropout_keep_fm[0])  # None * F

    # second order term
    # sum-square part
    self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
    self.summed_features_emb_square = tf.square(
        self.summed_features_emb)  # None * K

    # square-sum part
    self.squared_features_emb = tf.square(self.embeddings)
    self.squared_sum_features_emb = tf.reduce_sum(
        self.squared_features_emb, 1)  # None * K

    # second order
    self.y_second_order = 0.5 * tf.subtract(
        self.summed_features_emb_square,
        self.squared_sum_features_emb)  # None * K
    self.y_second_order = tf.nn.dropout(self.y_second_order,
                                        self.dropout_keep_fm[1])  # None * K

    # deep component
    self.y_deep = tf.reshape(
        self.embeddings,
        shape=[-1, self.all_feature_size * self.embedding_size
               ])  # None * (F*K)
    self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
    for i in range(0, len(self.deep_layers)):
        self.y_deep = tf.matmul(
            self.y_deep, self.weights["layer_%d" % i])  # None * layer[i]
        self.y_deep = tf.add(self.y_deep,
                             self.weights["bias_%d" % i])  # None * layer[i]
        if self.batch_norm:
            self.y_deep = self._batch_norm_layer(
                self.y_deep,
                train_phase=self.train_phase,
                scope_bn="bn_%d" % i)  # None * layer[i]
        self.y_deep = self.deep_layers_activation(self.y_deep)
        self.y_deep = tf.nn.dropout(
            self.y_deep,
            self.dropout_keep_deep[1 + i])  # dropout at each deep layer

    # DeepFM: concatenate FM and deep parts, then project to a single logit
    if self.use_fm and self.use_deep:
        concat_input = tf.concat(
            [self.y_first_order, self.y_second_order, self.y_deep], axis=1)
    elif self.use_fm:
        concat_input = tf.concat([self.y_first_order, self.y_second_order],
                                 axis=1)
    elif self.use_deep:
        concat_input = self.y_deep
    self.out = tf.matmul(concat_input, self.weights["concat_projection"])
    self.out = tf.add(self.out, self.weights["concat_bias"])
    self.out = tf.nn.sigmoid(self.out)

    if serving_mode:
        return

    self.gs = tf.train.get_or_create_global_step()
    self.auc = tf.metrics.auc(labels=self.label, predictions=self.out)

    if self.mode == "train":
        # loss
        if self.loss_type == "logloss":
            self.loss = tf.losses.log_loss(self.label, self.out)
        elif self.loss_type == "mse":
            self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))

        # l2 regularization on weights
        if self.l2_reg > 0:
            self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                self.weights["concat_projection"])
            if self.use_deep:
                for i in range(len(self.deep_layers)):
                    self.loss += tf.contrib.layers.l2_regularizer(
                        self.l2_reg)(self.weights["layer_%d" % i])

        # optimizer, default is adam
        if self.optimizer_type == "adam":
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate,
                beta1=self.adam_beta1,
                beta2=self.adam_beta2,
                epsilon=self.adam_epsilon)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.learning_rate,
                initial_accumulator_value=self.
                adagrad_initial_accumulator_value)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate, momentum=self.momentum)

        # default is async; "sync" wraps the optimizer so gradients are
        # aggregated across all workers before each update
        if self.sync_type == "sync":
            self.optimizer = tf.train.SyncReplicasOptimizer(
                self.optimizer,
                replicas_to_aggregate=self.cluster.num_tasks("worker"),
                total_num_replicas=self.cluster.num_tasks("worker"))
            self.sync_opt_hook = self.optimizer.make_session_run_hook(
                (self.task_index == 0), num_tokens=0)
        self.optimizer = self.optimizer.minimize(self.loss,
                                                 global_step=self.gs)
    elif self.mode == "predict":
        self.v_list = []
        self.v_list.append(tf.reshape(self.out, [-1]))
        for (feature_name, v) in self.feature_dict.items():
            self.v_list.append(v)
        # create a table writer; only for odps tables on the PAI platform
        if hasattr(tf, "TableRecordWriter") and self.output_name.startswith(
                "odps"):
            writer = tf.TableRecordWriter(self.output_name,
                                          slice_id=self.task_index)
            self.write_to_table = writer.write(
                range(self.all_feature_size + 1), self.v_list)
            # self.close_table = writer.close()
        return

    self.avg_loss = tf.div(self.loss, self.batch_size)

    # init tf.train.Saver
    self.saver = tf.train.Saver(sharded=True)

    # summary information
    self.summary_merged, self.summary_writer = self._add_summary(
        self.checkpoint_dir, tf.get_default_graph())

    # number of params
    total_parameters = 0
    for variable in self.weights.values():
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    if self.verbose > 0:
        print("model params number: %d" % total_parameters)
        # size in bytes, assuming 4-byte float32 parameters
        print("model total size: %d" % (total_parameters * 4))
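# Hedged training-step sketch (assumptions: a model built by _build_graph with
# mode="train", two deep layers, and a `feed` dict already carrying the
# feat_index / feat_value / mask / label tensors). It shows how the dropout
# placeholders are fed: dropout_keep_fm takes one keep-prob per FM order, and
# dropout_keep_deep takes one for the input plus one per deep layer.
def _example_train_step(model, sess, feed):
    feed.update({
        model.dropout_keep_fm: [1.0, 1.0],         # first / second order
        model.dropout_keep_deep: [0.8, 0.8, 0.8],  # input + 2 deep layers
        model.train_phase: True,                   # enable batch-norm updates
    })
    # after minimize(), model.optimizer holds the train op
    _, loss = sess.run([model.optimizer, model.loss], feed_dict=feed)
    return loss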
def predict(input_fn=None, model_fn=None, params=None):
    # step 1. get hyper parameters
    eval_table = params['eval_table']
    output_table = params['output_table']
    display = params['every_n_iter']
    local = params['local']
    model_dir = params['model_dir']
    target = params['target']

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    config = tf.estimator.RunConfig(model_dir=model_dir,
                                    session_config=session_config)

    # step 2. get model from estimator
    model = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir=model_dir,
                                   config=config,
                                   params=params)

    # step 3. make predictions
    predictions = model.predict(
        input_fn=lambda: input_fn(eval_table, params=params, mode='eval'))

    # step 4. build a new graph to write results into odps
    new_graph = tf.Graph()
    with new_graph.as_default():
        target_id_ph = tf.placeholder(dtype=tf.string)
        target_emb_ph = tf.placeholder(dtype=tf.string)
        if not local:
            writer = tf.TableRecordWriter(output_table)
            write_to_table = writer.write(range(2),
                                          [target_id_ph, target_emb_ph])
            close_table = writer.close()

    # check checkpoint
    print(model_dir)
    _print_graph_weight(model_dir)

    sess = tf.Session(graph=new_graph)
    try:
        for i, preds in enumerate(predictions):
            if (i + 1) % display == 0:
                tf.logging.info('step {}'.format(i + 1))
            # serialize the embedding vector as a comma-separated string
            emb_string = ','.join(
                ['{:.5f}'.format(v) for v in preds['{}_emb'.format(target)]])
            if not local:
                sess.run(write_to_table,
                         feed_dict={
                             target_id_ph: preds['{}_id'.format(target)],
                             target_emb_ph: emb_string,
                         })
            else:
                print(preds['{}_id'.format(target)], emb_string)
    except tf.errors.OutOfRangeError:
        print('final step: {}'.format(i + 1))
    finally:
        if not local:
            print(sess.run(close_table))
        print('write over')
def predict(input_fn=None, model_fn=None, params=None):
    # step 1. get hyper parameters
    eval_table = params['eval_table']
    output_table = params['output_table']
    display = params['every_n_iter']
    local = params['local']
    model_dir = params['model_dir']

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    config = tf.estimator.RunConfig(model_dir=model_dir,
                                    session_config=session_config)

    # step 2. get model from estimator
    model = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir=model_dir,
                                   config=config,
                                   params=params)

    # step 3. make predictions
    predictions = model.predict(
        input_fn=lambda: input_fn(eval_table, params=params, mode='eval'))

    # step 4. build a new graph to write results into odps
    new_graph = tf.Graph()
    with new_graph.as_default():
        user_id_ph = tf.placeholder(dtype=tf.string)
        content_id_ph = tf.placeholder(dtype=tf.string)
        label_ph = tf.placeholder(dtype=tf.int32)
        ctr_prob_ph = tf.placeholder(dtype=tf.float32)
        if not local:
            writer = tf.TableRecordWriter(output_table)
            write_to_table = writer.write(
                range(4), [user_id_ph, content_id_ph, label_ph, ctr_prob_ph])
            close_table = writer.close()

    # check checkpoint
    _print_graph_weight(model_dir)

    sess = tf.Session(graph=new_graph)
    try:
        for i, preds in enumerate(predictions):
            if (i + 1) % display == 0:
                tf.logging.info('step {}'.format(i + 1))
            if not local:
                sess.run(write_to_table,
                         feed_dict={
                             user_id_ph: preds['user_id'],
                             content_id_ph: preds['content_id'],
                             label_ph: preds['click_cnt'],
                             ctr_prob_ph: preds['ctr_prob']
                         })
            else:
                print(preds['user_id'], preds['content_id'],
                      preds['click_cnt'], preds['ctr_prob'])
    except tf.errors.OutOfRangeError:
        print('final step: {}'.format(i + 1))
    finally:
        if not local:
            print(sess.run(close_table))
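# Hedged invocation sketch: `my_input_fn` and `my_model_fn` are hypothetical
# stand-ins for the repo's Estimator input/model functions, and the table and
# checkpoint paths are assumptions. With local=True the predictions are
# printed instead of written to the odps output table.
def _example_predict_call():
    params = {
        'eval_table': 'odps://my_project/tables/eval_table',    # assumption
        'output_table': 'odps://my_project/tables/ctr_output',  # assumption
        'every_n_iter': 100,
        'local': True,
        'model_dir': './checkpoints',                           # assumption
        'target': 'user',  # only used by the embedding-export variant above
    }
    predict(input_fn=my_input_fn, model_fn=my_model_fn, params=params)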