def write_table(self, user_topk_item, user_topk_score):
    """Build the op that writes one batch of top-k results to the output table.

    A ``tf.TableRecordWriter`` is opened on ``self.output_path`` and asked to
    write three values: ``self.user_batch`` converted to string, followed by
    ``user_topk_item`` and ``user_topk_score``. The column indices are
    ``range(self.NUM_OF_TABLE_COLUMNS)``.

    Args:
        user_topk_item: value written as the second entry of the record.
        user_topk_score: value written as the third entry of the record.

    Returns:
        Whatever ``writer.write`` returns (the write op to be run by the
        caller).
    """
    # The writer is constructed first so graph construction order is
    # unchanged relative to the values created below.
    table_writer = tf.TableRecordWriter(self.output_path)
    column_indices = range(self.NUM_OF_TABLE_COLUMNS)
    column_values = [
        tf.as_string(self.user_batch),
        user_topk_item,
        user_topk_score,
    ]
    return table_writer.write(column_indices, column_values)
Beispiel #2
0
    def get_outputs(self, data):
        """Build the network and the ops that dump its scores to a table.

        Features, labels and two pass-through output fields are parsed from
        ``data``, the network is built from the parsed features, and a
        ``tf.TableRecordWriter`` is set up on
        ``self.flag.data['critic_result_table']``.

        Args:
            data: mapping read with the keys ``'features'``, ``'label'``,
                ``'search_id'`` and ``'rn'``.

        Returns:
            A ``(write_op, close_writer_op)`` tuple.
        """
        flags = self.flag

        # Description of the two fields that are copied through to the output.
        item_id_field = {
            "feature_name": "item_id",
            "feature_type": "id_feature",
            "value_type": "String",
            "expression": "item:item_id"
        }
        critic_score_field = {
            "feature_name": "featureparser_trace_critic_score",
            "feature_type": "id_feature",
            "value_type": "Float",
            "expression": "item:featureparser_trace_critic_score"
        }
        outfield_json = {"features": [item_id_field, critic_score_field]}

        # Every per-request value is repeated this many times before being
        # flattened into a single column.
        list_size = 10

        def _repeat_per_item(per_request):
            """Expand on axis 1, tile ``list_size`` times, flatten to [-1, 1]."""
            expanded = tf.expand_dims(per_request, 1)
            tiled = tf.tile(expanded, [1, list_size])
            return tf.reshape(tiled, [-1, 1])

        with tf.name_scope("featureparser_feature_input"):
            # model input features
            input_conf_path = flags.feature_conf['input']
            parsed_inputs = featureparser_fg.parse_genreated_fg(
                get_json_conf_from_file(input_conf_path), data['features'])
            model_features = format_feature_offline(parsed_inputs,
                                                    input_conf_path)

            # labels
            label_conf_path = flags.feature_conf['label']
            parsed_labels = featureparser_fg.parse_genreated_fg(
                get_json_conf_from_file(label_conf_path), data['label'])
            labels = format_feature_offline(parsed_labels, label_conf_path)

            # pass-through output fields
            parsed_outfields = featureparser_fg.parse_genreated_fg(
                outfield_json, data['features'])
            out_columns = format_feature_offline(parsed_outfields,
                                                 outfield_json)

            search_ids = _repeat_per_item(data['search_id'])
            rns = _repeat_per_item(data['rn'])
            item_id = tf.reshape(out_columns["item_id"], [-1, 1])

        self.build_network(self.get_net0_dict(model_features))

        result_table = flags.data['critic_result_table']
        print("predict result table: {}".format(result_table))

        table_writer = tf.TableRecordWriter(result_table,
                                            slice_id=flags.task_index)
        column_values = [
            search_ids,
            rns,
            item_id,
            self.logits,
            self.predict_score,
            labels['pay'],
            out_columns['featureparser_trace_critic_score'],
            labels['click'],
        ]
        # One column index per value, i.e. [0, 1, ..., 7].
        column_indices = list(range(len(column_values)))
        write_op = table_writer.write(column_indices, column_values)
        close_writer_op = table_writer.close()
        return write_op, close_writer_op
    def _build_graph(self, serving_mode):
        """Build the DeepFM graph and, depending on mode, its train/predict ops.

        The forward pass (embeddings, first/second order FM terms, deep tower
        and the final sigmoid projection stored in ``self.out``) is always
        built. What follows depends on ``serving_mode`` and ``self.mode``:

        * ``serving_mode`` truthy: return right after ``self.out`` is built.
        * ``self.mode == "train"``: build loss, optional L2 regularization,
          the optimizer (optionally wrapped for synchronous training) and the
          minimize op, then the average loss, saver, summaries and the
          parameter count.
        * ``self.mode == "predict"``: collect the values to output in
          ``self.v_list`` and, when available, build the table write op, then
          return.

        Args:
            serving_mode: when truthy, only the forward pass is built.

        Raises:
            ValueError: if neither ``self.use_fm`` nor ``self.use_deep`` is
                set, since there would be nothing to feed the output layer.
        """
        # random seed
        tf.set_random_seed(self.random_seed)

        # model config
        self.dropout_keep_fm = tf.placeholder(tf.float32,
                                              shape=[None],
                                              name="dropout_keep_fm")
        self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                shape=[None],
                                                name="dropout_keep_deep")
        self.train_phase = tf.placeholder(tf.bool, name="train_phase")

        # init variables
        self.weights = self._initialize_weights()

        # model
        self.embeddings = tf.nn.embedding_lookup(
            self.weights["feature_embeddings"], self.feat_index)
        self.mask = tf.transpose(tf.cast(self.mask, tf.float32), [0, 2, 1])
        self.embeddings = tf.multiply(self.embeddings,
                                      self.mask)  # None * F * K

        # weight to embedding
        feat_value = tf.reshape(self.feat_value,
                                shape=[-1, self.feature_size, 1])
        self.embeddings = tf.multiply(self.embeddings, feat_value)

        # bias
        self.bias = tf.nn.embedding_lookup(self.weights["feature_bias"],
                                           self.feat_index)  # None * F * 1
        self.bias = tf.multiply(self.bias, self.mask)
        self.bias = tf.multiply(self.bias, feat_value)  # None * F * 1

        # use multi tags: each multi-tag feature contributes extra rows that
        # are appended along axis 1 of both the embeddings and the bias
        if self.use_multi_tags:
            for multi_tags_feature_val in self.multi_tags:
                multi_tags_embeddings, multi_tags_bias = self._multi_tags_embedding(
                    multi_tags_feature_val, self.multi_tags_max_size,
                    self.multi_tags_embedding_size)
                self.embeddings = tf.concat(
                    [self.embeddings, multi_tags_embeddings], axis=1)
                self.bias = tf.concat([self.bias, multi_tags_bias], axis=1)

        # first order term
        self.y_first_order = tf.reduce_sum(self.bias, 2)  # None * F
        self.y_first_order = tf.nn.dropout(self.y_first_order,
                                           self.dropout_keep_fm[0])  # None * F

        # second order term
        # sum_square part
        self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                 1)  # None * K
        self.summed_features_emb_square = tf.square(
            self.summed_features_emb)  # None * K

        # square_sum part
        self.squared_features_emb = tf.square(self.embeddings)
        self.squared_sum_features_emb = tf.reduce_sum(
            self.squared_features_emb, 1)  # None * K

        # second order: 0.5 * ((sum of embeddings)^2 - sum of embeddings^2)
        self.y_second_order = 0.5 * tf.subtract(
            self.summed_features_emb_square,
            self.squared_sum_features_emb)  # None * K
        self.y_second_order = tf.nn.dropout(
            self.y_second_order, self.dropout_keep_fm[1])  # None * K

        # Deep component
        self.y_deep = tf.reshape(
            self.embeddings,
            shape=[-1, self.all_feature_size * self.embedding_size
                   ])  # None * (F*K)
        # dropout_keep_deep[0] is applied to the tower input,
        # dropout_keep_deep[1 + i] to the output of layer i.
        self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
        for i in range(0, len(self.deep_layers)):
            self.y_deep = tf.matmul(self.y_deep,
                                    self.weights["layer_%d" %
                                                 i])  # None * layer[i] * 1
            self.y_deep = tf.add(self.y_deep,
                                 self.weights["bias_%d" %
                                              i])  # None * layer[i] * 1
            if self.batch_norm:
                self.y_deep = self._batch_norm_layer(
                    self.y_deep,
                    train_phase=self.train_phase,
                    scope_bn="bn_%d" % i)  # None * layer[i] * 1
            self.y_deep = self.deep_layers_activation(self.y_deep)
            self.y_deep = tf.nn.dropout(
                self.y_deep,
                self.dropout_keep_deep[1 + i])  # dropout at each Deep layer

        # DeepFM
        if self.use_fm and self.use_deep:
            concat_input = tf.concat(
                [self.y_first_order, self.y_second_order, self.y_deep], axis=1)
        elif self.use_fm:
            concat_input = tf.concat([self.y_first_order, self.y_second_order],
                                     axis=1)
        elif self.use_deep:
            concat_input = self.y_deep
        else:
            # Fail with a clear message instead of an unbound-local error on
            # the matmul below.
            raise ValueError(
                "at least one of use_fm and use_deep must be enabled")

        self.out = tf.matmul(concat_input, self.weights["concat_projection"])
        self.out = tf.add(self.out, self.weights["concat_bias"])
        self.out = tf.nn.sigmoid(self.out)
        if serving_mode:
            return

        self.gs = tf.train.get_or_create_global_step()
        self.auc = tf.metrics.auc(labels=self.label, predictions=self.out)

        if self.mode == "train":
            # loss
            if self.loss_type == "logloss":
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer, default is adam
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=self.adam_beta1,
                    beta2=self.adam_beta2,
                    epsilon=self.adam_epsilon)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(
                    learning_rate=self.learning_rate,
                    initial_accumulator_value=self.
                    adagrad_initial_accumulator_value)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=self.learning_rate)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self.learning_rate, momentum=self.momentum)

            # default is async; "sync" wraps the optimizer so that gradients
            # from all workers are aggregated before being applied
            if self.sync_type == "sync":
                self.optimizer = tf.train.SyncReplicasOptimizer(
                    self.optimizer,
                    replicas_to_aggregate=self.cluster.num_tasks("worker"),
                    total_num_replicas=self.cluster.num_tasks("worker"))
                # The hook must be created from the wrapped optimizer; task 0
                # is treated as the chief.
                self.sync_opt_hook = self.optimizer.make_session_run_hook(
                    (self.task_index == 0), num_tokens=0)

            # NOTE: from here on self.optimizer holds the train op, not the
            # optimizer object.
            self.optimizer = self.optimizer.minimize(self.loss,
                                                     global_step=self.gs)
        elif self.mode == "predict":
            # Output row: the flattened prediction followed by every value of
            # self.feature_dict, in the dict's iteration order.
            self.v_list = []
            self.v_list.append(tf.reshape(self.out, [-1]))
            for v in self.feature_dict.values():
                self.v_list.append(v)
            # create table writer, just for odps table on pai platform
            # NOTE(review): the column range assumes v_list has exactly
            # all_feature_size + 1 entries - confirm against feature_dict.
            if hasattr(tf, "TableRecordWriter"
                       ) and self.output_name.startswith("odps"):
                writer = tf.TableRecordWriter(self.output_name,
                                              slice_id=self.task_index)
                self.write_to_table = writer.write(
                    range(self.all_feature_size + 1), self.v_list)
            return

        # NOTE(review): for any mode other than "train"/"predict" this line is
        # reached without self.loss having been set in this method - confirm
        # that such modes are never used or that self.loss is set elsewhere.
        self.avg_loss = tf.div(self.loss, self.batch_size)
        # init tf.Saver
        self.saver = tf.train.Saver(sharded=True)
        # summary information
        self.summary_merged, self.summary_writer = self._add_summary(
            self.checkpoint_dir, tf.get_default_graph())

        # number of params: product of the static dims of every weight
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("model params number: %d" % total_parameters)
            # 4 bytes per parameter
            print("model total size: %d" % (total_parameters * 4))
Beispiel #4
0
def predict(input_fn=None, model_fn=None, params=None):
    """Run estimator prediction and output one id/embedding pair per example.

    For every prediction the ``'<target>_emb'`` vector is rendered as a
    comma-separated string with five decimals. When ``params['local']`` is
    falsy each ``('<target>_id', embedding string)`` pair is written to
    ``params['output_table']`` through a ``tf.TableRecordWriter`` living in
    its own graph; otherwise the pair is printed.

    Args:
        input_fn: callable invoked as
            ``input_fn(eval_table, params=params, mode='eval')``.
        model_fn: model function handed to ``tf.estimator.Estimator``.
        params: mapping read with the keys ``'eval_table'``,
            ``'output_table'``, ``'every_n_iter'``, ``'local'``,
            ``'model_dir'`` and ``'target'``; also passed to the estimator.
    """
    # step 1. get hyper parameters
    eval_table = params['eval_table']
    output_table = params['output_table']
    display = params['every_n_iter']
    local = params['local']
    model_dir = params['model_dir']
    target = params['target']

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    config = tf.estimator.RunConfig(model_dir=model_dir,
                                    session_config=session_config)

    # step 2. get model from estimator
    model = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir=model_dir,
                                   config=config,
                                   params=params)

    # step 3. make prediction
    predictions = model.predict(
        input_fn=lambda: input_fn(eval_table, params=params, mode='eval'))

    # step 4. build a new graph to write results into odps
    new_graph = tf.Graph()
    with new_graph.as_default():
        target_id_ph = tf.placeholder(dtype=tf.string)
        target_emb_ph = tf.placeholder(dtype=tf.string)

        if not local:
            writer = tf.TableRecordWriter(output_table)
            write_to_table = writer.write(range(2),
                                          [target_id_ph, target_emb_ph])
            close_table = writer.close()

    # check checkpoint
    print(model_dir)
    _print_graph_weight(model_dir)

    # Prediction keys do not change between iterations.
    id_key = '{}_id'.format(target)
    emb_key = '{}_emb'.format(target)

    sess = tf.Session(graph=new_graph)
    # Number of predictions consumed so far. Defined before the loop so the
    # OutOfRangeError handler can report it even when no prediction was
    # yielded.
    step = 0
    try:
        for step, preds in enumerate(predictions, start=1):
            if step % display == 0:
                tf.logging.info('step {}'.format(step))
            emb_string = ','.join(
                ['{:.5f}'.format(v) for v in preds[emb_key]])

            if not local:
                sess.run(write_to_table,
                         feed_dict={
                             target_id_ph: preds[id_key],
                             target_emb_ph: emb_string,
                         })
            else:
                print(preds[id_key], emb_string)
    except tf.errors.OutOfRangeError:
        print('final step: {}'.format(step))
    finally:
        try:
            if not local:
                print(sess.run(close_table))
                print('write over')
        finally:
            # Always release the session, even if closing the table fails.
            sess.close()
Beispiel #5
0
def predict(input_fn=None, model_fn=None, params=None):
    """Run estimator prediction and output one scored row per example.

    For every prediction the values under ``'user_id'``, ``'content_id'``,
    ``'click_cnt'`` and ``'ctr_prob'`` are taken. When ``params['local']`` is
    falsy they are written, in that order, to ``params['output_table']``
    through a ``tf.TableRecordWriter`` living in its own graph; otherwise
    they are printed.

    Args:
        input_fn: callable invoked as
            ``input_fn(eval_table, params=params, mode='eval')``.
        model_fn: model function handed to ``tf.estimator.Estimator``.
        params: mapping read with the keys ``'eval_table'``,
            ``'output_table'``, ``'every_n_iter'``, ``'local'`` and
            ``'model_dir'``; also passed to the estimator.
    """
    # step 1. get hyper parameters
    eval_table = params['eval_table']
    output_table = params['output_table']
    display = params['every_n_iter']
    local = params['local']
    model_dir = params['model_dir']

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    config = tf.estimator.RunConfig(model_dir=model_dir,
                                    session_config=session_config)

    # step 2. get model from estimator
    model = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir=model_dir,
                                   config=config,
                                   params=params)

    # step 3. make prediction
    predictions = model.predict(
        input_fn=lambda: input_fn(eval_table, params=params, mode='eval'))

    # step 4. build a new graph to write results into odps
    new_graph = tf.Graph()
    with new_graph.as_default():
        user_id_ph = tf.placeholder(dtype=tf.string)
        content_id_ph = tf.placeholder(dtype=tf.string)
        label_ph = tf.placeholder(dtype=tf.int32)
        ctr_prob_ph = tf.placeholder(dtype=tf.float32)

        if not local:
            writer = tf.TableRecordWriter(output_table)
            write_to_table = writer.write(
                range(4), [user_id_ph, content_id_ph, label_ph, ctr_prob_ph])
            close_table = writer.close()

    # check checkpoint
    _print_graph_weight(model_dir)

    sess = tf.Session(graph=new_graph)
    # Number of predictions consumed so far. Defined before the loop so the
    # OutOfRangeError handler can report it even when no prediction was
    # yielded.
    step = 0
    try:
        for step, preds in enumerate(predictions, start=1):
            if step % display == 0:
                tf.logging.info('step {}'.format(step))

            if not local:
                sess.run(write_to_table,
                         feed_dict={
                             user_id_ph: preds['user_id'],
                             content_id_ph: preds['content_id'],
                             label_ph: preds['click_cnt'],
                             ctr_prob_ph: preds['ctr_prob']
                         })
            else:
                print(preds['user_id'], preds['content_id'],
                      preds['click_cnt'], preds['ctr_prob'])
    except tf.errors.OutOfRangeError:
        print('final step: {}'.format(step))
    finally:
        try:
            if not local:
                print(sess.run(close_table))
        finally:
            # Always release the session, even if closing the table fails.
            sess.close()