Esempio n. 1
0
def export_model(wnd_conf, ckpt_name=None):
    # Build inference model.
    # dense feat with sparse
    with tf.Graph().as_default() as graph:
        tables = lookup.LookupTables(wnd_conf)

        feature_values = tf.placeholder(tf.float32, [None],
                                        name="dense_values_placeholder")
        feature_indices = tf.placeholder(tf.int16, [None],
                                         name="dense_indices_placeholder")
        dense_shape = tf.to_int64(feature_indices[0:2])
        dense_real_indice = tf.reshape(tf.to_int64(feature_indices[2:]),
                                       (tf.shape(feature_values)[0], 2))
        batch_size = tf.placeholder(tf.int32,
                                    shape=(),
                                    name="batch_size_placeholder")
        feature = tf.SparseTensor(indices=dense_real_indice,
                                  values=feature_values,
                                  dense_shape=dense_shape)
        feature = tf.sparse.to_dense(feature)
        feature = tf.reshape(feature, (batch_size, wnd_conf[MODEL][FEAT_DIM]))
        input_feature = {'BatchSize': batch_size}
        save_model_input = {
            "dense_values_placeholder": feature_values,
            'batch_size_placeholder': batch_size,
            "dense_indices_placeholder": feature_indices
        }

        iside_index = 0
        uside_index = 0
        for emb in wnd_conf.embedding_list:
            id_feat_name = emb[3]
            id_type = emb[4]
            id_feat_wts_name = id_feat_name + 'Wts'
            if id_type == 'i':
                id_values_placeholder_name = "emb_values_placeholder_" + str(
                    iside_index)
                id_wts_placeholder_name = "emb_wts_placeholder_" + str(
                    iside_index)
                id_indices_placeholder_name = "emb_indices_placeholder_" + str(
                    iside_index)
                id_values_placeholder = tf.placeholder(
                    tf.string, [None], name=id_values_placeholder_name)
                id_indices_placeholder = tf.placeholder(
                    tf.int16, [None], name=id_indices_placeholder_name)
                id_wts_placeholder = tf.placeholder(
                    tf.float32, [None], name=id_wts_placeholder_name)
                id_shape = tf.to_int64(id_indices_placeholder[0:2])
                id_real_indices = tf.reshape(
                    tf.to_int64(id_indices_placeholder[2:]),
                    (tf.shape(id_values_placeholder)[0], 2))
                input_feature[id_feat_name] = tf.SparseTensor(
                    indices=id_real_indices,
                    values=tables.inf_transform(id_feat_name,
                                                id_values_placeholder),
                    dense_shape=id_shape)
                input_feature[id_feat_wts_name] = tf.SparseTensor(
                    indices=id_real_indices,
                    values=id_wts_placeholder,
                    dense_shape=id_shape)
                save_model_input[
                    id_values_placeholder_name] = id_values_placeholder
                save_model_input[id_wts_placeholder_name] = id_wts_placeholder
                save_model_input[
                    id_indices_placeholder_name] = id_indices_placeholder
                iside_index += 1
            elif id_type == 'u':
                id_values_placeholder_name = "emb_common_values_placeholder_" + str(
                    uside_index)
                id_wts_placeholder_name = "emb_common_wts_placeholder_" + str(
                    uside_index)
                id_values_placeholder = tf.placeholder(
                    tf.string, [None], name=id_values_placeholder_name)

                model_type = wnd_conf[MODEL][MODEL_TYPE]
                if (model_type == "din_v2"):
                    print("model_type:din_v2")
                    id_values_splited = tf.string_split(
                        id_values_placeholder, '&').values
                    # id_wts_placeholder = tf.placeholder(tf.float32, [None], name=id_wts_placeholder_name)
                    # id_wts_placeholder = tf.identity(tf.Print(id_wts_placeholder ,[batch_size, id_feat_name, id_values_placeholder , id_values_splited,id_wts_placeholder],summarize=600000))
                    input_feature[id_feat_name] = tables.inf_transform(
                        id_feat_name, id_values_splited)
                else:
                    input_feature[id_feat_name] = tables.inf_transform(
                        id_feat_name, id_values_placeholder)

                id_wts_placeholder = tf.placeholder(
                    tf.float32, [None], name=id_wts_placeholder_name)
                input_feature[id_feat_wts_name] = id_wts_placeholder
                save_model_input[
                    id_values_placeholder_name] = id_values_placeholder
                save_model_input[id_wts_placeholder_name] = id_wts_placeholder
                uside_index += 1

        norm_vec_const, std_list = vec_constant(wnd_conf)
        inference_const_vec = tf.constant(value=norm_vec_const,
                                          dtype=tf.float32)
        std = tf.constant(value=std_list, dtype=tf.float32)
        epsilon = tf.constant(0.0000001,
                              dtype=tf.float32,
                              shape=[len(std_list)])

        clip_feature = tf.clip_by_value(feature,
                                        clip_value_min=0.0,
                                        clip_value_max=sys.float_info.max)
        normalize_feature = tf.subtract(
            tf.div(tf.multiply(clip_feature, std),
                   tf.square(tf.add_n([std, epsilon])) * 3.0),
            inference_const_vec)

        clip_normalize_feature = tf.clip_by_value(normalize_feature,
                                                  clip_value_min=-0.99,
                                                  clip_value_max=0.99)

        input_feature["features"] = clip_normalize_feature
        # Run inference.
        # Run inference.
        inf = inference.Inference(wnd_conf)
        # tf.Print(clip_normalize_feature,[clip_normalize_feature],summarize=600000)
        with tf.variable_scope("DnnModel"):
            #logits = inf.online_inference(input_feature)
            click_logit, order_logit = inf.online_inference(input_feature)
        #scores = tf.reshape(tf.sigmoid(logits), [-1], name="Scores")

        click_scores = tf.reshape(tf.sigmoid(click_logit), [-1],
                                  name="click_Scores")
        order_scores = tf.reshape(tf.sigmoid(order_logit), [-1],
                                  name="order_Scores")
        weights = wnd_conf[EXPORT_MODEL][EXPORT_WEIGHT]
        print("*" * 100)
        print(weights)
        print("*" * 100)
        scores = tf.divide(tf.add(weights[0] * click_scores,
                                  weights[1] * order_scores),
                           sum(weights),
                           name='Scores')

        ## gpu settings
        os.environ['CUDA_VISIBLE_DEVICES'] = wnd_conf[MODEL][GPU_VISIBLE]
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        saver = tf.train.Saver()
        output_path = wnd_conf[PATH][MODEL_FROZEN_PATH]
        del_path(output_path)
        exporter = tf.saved_model.Builder(output_path)
        with tf.Session(graph=graph) as sess:
            # sess.run(legacy_init_op)
            ckpt_path = wnd_conf[PATH][MODEL_PATH] + ckpt_name
            print("Restore from ckpt_path: ", ckpt_path)
            saver.restore(sess, ckpt_path)
            signature_def_map = {
                signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                signature_def_utils.predict_signature_def(
                    save_model_input, {"Scores": scores})
            }
            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')
            exporter.add_meta_graph_and_variables(
                sess,
                tags=[tag_constants.SERVING],
                signature_def_map=signature_def_map,
                main_op=legacy_init_op,
                clear_devices=True)
            exporter.save()
        print('Successfully exported model to %s' % output_path)
Esempio n. 2
0
def validation(wnd_conf):
    step = get_validation_newest_step(wnd_conf[PATH][VALIDATION_RESULT])
    print("get_validation_newest_step: %s" % str(step))

    # if validation data is in hdfs, then get it to local path.
    validation_path = wnd_conf[PATH][VALIDATION_DATA_PATH]
    # if validation_path.startswith("hdfs") or validation_path.startswith("/user"):
    #    validation_path = hdfs_files_to_local(validation_path)

    val_tensorboard_path = wnd_conf[PATH][SUMMARY_PATH] + "/val/"
    # if not os.path.exists(val_tensorboard_path):
    #    os.makedirs(val_tensorboard_path)
    summary_write = tf.summary.FileWriter(val_tensorboard_path)
    while step < wnd_conf[MODEL][MAX_ITER_STEP]:
        newest_ckpt, newest_step = get_ckpt_from_fs(step,
                                                    wnd_conf[PATH][MODEL_PATH])
        if step == newest_step or not file_exists(
                wnd_conf[PATH][MODEL_PATH],
                'step-%d.model.DONE' % newest_step):
            time.sleep(5)
            continue
        step = newest_step

        tables = lookup.LookupTables(wnd_conf)
        ## read validation data
        validation_labels, validation_header, validation_mask, validation_features = \
            tfrecord.get_val_test_batch(file_path=validation_path,
                                        EPOCH_NUM=1, batch_size=wnd_conf[MODEL][VALIDATION_BATCH_SIZE],
                                        wnd_conf=wnd_conf, lookup_tables=tables)
        inf = inference.Inference(wnd_conf)
        ## do validation
        with tf.variable_scope("DnnModel"):
            validation_logits = inf.inference(validation_features,
                                              is_train=False)
            # validation_loss = inf.loss(validation_logits, validation_labels, validation_mask, is_train=False)

            if (wnd_conf.is_unbias_model):
                tower_train_loss = inf.loss_multi_task_unbias(
                    validation_logits,
                    validation_labels,
                    validation_mask,
                    is_train=True,
                    loss_unbias_method=wnd_conf[MODEL][loss_unbias_method],
                    loss_ctr_rel_method=wnd_conf[MODEL][LOSS_CTR_REL_METHOD])
            else:
                tower_train_loss = inf.loss_multi_task(
                    validation_logits,
                    validation_labels,
                    validation_mask,
                    is_train=True,
                    propensity_weight_mul=tower_batch_features[
                        "propensity_weight_mul"])

            if (wnd_conf.is_unbias_model):
                y_rel, y_bias = validation_logits
                click_logit, order_logit = y_rel
                (p_ctr, p_cvr) = cal_ctr_cvr_unibas(
                    y_rel,
                    y_bias,
                    loss_unbias_method=wnd_conf[MODEL][loss_unbias_method])
            else:
                p_ctr, p_cvr = cal_ctr_cvr(validation_logits)

        validation_click_logits_sigmoid = p_ctr
        validation_click_predict = tf.cast(
            tf.greater(validation_click_logits_sigmoid, tf.constant(0.5)),
            tf.float32)

        validation_order_logits_sigmoid = p_cvr
        validation_order_predict = tf.cast(
            tf.greater(validation_order_logits_sigmoid, tf.constant(0.5)),
            tf.float32)

        # validation metrics
        validation_metrics_var_scope = "validation_metrics"

        validation_click_precision, validation_click_precision_op = tf.metrics.precision(
            labels=tf.reduce_sum(validation_mask[:, 1:5], axis=-1),
            predictions=validation_click_predict[:, 0],
            name=validation_metrics_var_scope)
        validation_click_recall, validation_click_recall_op = tf.metrics.recall(
            labels=tf.reduce_sum(validation_mask[:, 1:5], axis=-1),
            predictions=validation_click_predict[:, 0],
            name=validation_metrics_var_scope)
        validation_click_auc, validation_click_auc_op = tf.metrics.auc(
            labels=tf.reduce_sum(validation_mask[:, 1:5], axis=-1),
            predictions=validation_click_logits_sigmoid[:, 0],
            name=validation_metrics_var_scope)

        validation_order_precision, validation_order_precision_op = tf.metrics.precision(
            labels=tf.add(validation_mask[:, 3], validation_mask[:, 4]),
            predictions=validation_order_predict[:, 0],
            name=validation_metrics_var_scope)
        validation_order_recall, validation_order_recall_op = tf.metrics.recall(
            labels=tf.add(validation_mask[:, 3], validation_mask[:, 4]),
            predictions=validation_order_predict[:, 0],
            name=validation_metrics_var_scope)
        validation_order_auc, validation_order_auc_op = tf.metrics.auc(
            labels=tf.add(validation_mask[:, 3], validation_mask[:, 4]),
            predictions=validation_order_logits_sigmoid[:, 0],
            name=validation_metrics_var_scope)

        validation_mean_loss, validation_mean_loss_op = tf.metrics.mean(
            values=tower_train_loss, name=validation_metrics_var_scope)
        tf.summary.scalar('validation_click_precision',
                          validation_click_precision)
        tf.summary.scalar('validation_click_recall', validation_click_recall)
        tf.summary.scalar('validation_click_auc', validation_click_auc)

        tf.summary.scalar('validation_order_precision',
                          validation_order_precision)
        tf.summary.scalar('validation_order_recall', validation_order_recall)
        tf.summary.scalar('validation_order_auc', validation_order_auc)

        tf.summary.scalar('validation_mean_loss', validation_mean_loss)
        sum_ops = tf.summary.merge_all()

        # validation metric init op
        validation_metrics_vars = tf.get_collection(
            tf.GraphKeys.LOCAL_VARIABLES, scope=validation_metrics_var_scope)
        validation_metrics_init_op = tf.variables_initializer(
            var_list=validation_metrics_vars, name='validation_metrics_init')
        ## saver
        saver = tf.train.Saver()

        config = tf.ConfigProto()
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
        config.gpu_options.allow_growth = True
        # config.allow_soft_placement=True

        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(tf.tables_initializer())

            restore_session_from_checkpoint(
                sess, saver,
                os.path.join(wnd_conf[PATH][MODEL_PATH], newest_ckpt))
            sess.run(validation_metrics_init_op)

            validation_logits_sigmoid_values_list = []

            validation_click_logits_sigmoid_values_list = []
            validation_detail_logits_sigmoid_values_list = []
            validation_order_logits_sigmoid_values_list = []

            header_list = []

            start_time = time.time()
            try:
                while True:
                    validation_click_logits_sigmoid_values, \
                    validation_detail_logits_sigmoid_values, \
                    validation_order_logits_sigmoid_values, \
                    headers_values, _, _, _, _, _, _ = sess.run(
                        [validation_click_logits_sigmoid,
                         validation_order_logits_sigmoid,
                         validation_header,
                         validation_click_precision_op,
                         validation_click_recall_op,
                         validation_click_auc_op,

                         validation_order_precision_op,
                         validation_order_recall_op,
                         validation_order_auc_op,
                         validation_mean_loss_op])
                    validation_click_logits_sigmoid_values_list.extend(
                        validation_click_logits_sigmoid_values[:, 0])
                    validation_detail_logits_sigmoid_values_list.extend(
                        validation_detail_logits_sigmoid_values[:, 0])
                    validation_order_logits_sigmoid_values_list.extend(
                        validation_order_logits_sigmoid_values[:, 0])
                    header_list.extend(headers_values)
            except tf.errors.OutOfRangeError:
                print(
                    "-----------------------------OutOfRange---------------------------------"
                )

            validation_click_precision_value, \
            validation_click_recall_value, \
            validation_click_auc_value, \
            validation_order_precision_value, \
            validation_order_recall_value, \
            validation_order_auc_value, \
            validation_mean_loss_value = sess.run([
                validation_click_precision,
                validation_click_recall,
                validation_click_auc,
                validation_order_precision,
                validation_order_recall,
                validation_order_auc,
                validation_mean_loss])

            summary_out = sess.run(sum_ops)
            summary_write.add_summary(summary_out, step)
            sys.stdout.write(
                "[%s] spent time: %f | validation_loss: %f | validation_click_precision: %f | validation_click_recall: %f | validation_click_auc: %f  |validation_order_precision: %f | validation_order_recall: %f | validation_order_auc: %f | iter: %d\n" % \
                (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), time.time() - start_time,
                 validation_mean_loss_value,
                 validation_click_precision_value,
                 validation_click_recall_value,
                 validation_click_auc_value,
                 validation_order_precision_value,
                 validation_order_recall_value,
                 validation_order_auc_value,
                 step))

            validation_metrics_str = ">> iter_steps:" + str(
                step) + "\n" + "validation_loss:" + str(
                    validation_mean_loss_value
                ) + "\n" + "validation_click_precision:" + str(
                    validation_click_precision_value
                ) + "\n" + "validation_click_recall:" + str(
                    validation_click_recall_value
                ) + "\n" + "validation_click_auc:" + str(
                    validation_click_auc_value
                ) + "\n" + "validation_order_precision:" + str(
                    validation_order_precision_value
                ) + "\n" + "validation_order_recall:" + str(
                    validation_order_recall_value
                ) + "\n" + "validation_order_auc:" + str(
                    validation_order_auc_value) + "\n"

            log_to_file(validation_metrics_str,
                        wnd_conf[PATH][VALIDATION_RESULT])

            total_logit = []
            for i, k in zip(validation_click_logits_sigmoid_values_list,
                            validation_order_logits_sigmoid_values_list):
                total_logit.append(i + k)
            metric_sets, at_list = metrics.get_offline_metrics(
                wnd_conf[SCHEMA][HEADER_SCHEMA], header_list, total_logit)

            for action, metric in metric_sets.items():
                offline_metrics_str = ''
                metric_threshlod_pair = zip(at_list, metric)
                for tuple0, tuple1 in metric_threshlod_pair:
                    offline_metrics_str += "action_{a}_at_{n}: {m}\n".format(
                        a=action, n=tuple0, m=tuple1)
                log_to_file(offline_metrics_str,
                            wnd_conf[PATH][VALIDATION_RESULT])
        tf.reset_default_graph()
    summary_write.close()
    print("ValidationEnd!")
Esempio n. 3
0
def predict(wnd_conf, ckpt_name=None, test_tag="", test_score_method=""):
    print("predict...")
    all_test_data_path = wnd_conf[PATH][TEST_DATA_PATH].split(',')
    if (test_tag == "ord"):
        all_test_data_path = wnd_conf[PATH][TEST_DATA_PATH_ORD].split(',')
    print("test_score_method:", test_score_method)
    out_file_test = wnd_conf[PATH][OUTPUT_PATH] + wnd_conf.tag + '.' + 'ckpt-' + \
                    ckpt_name.split('-')[-1] + '.test_result' + '_' + test_tag + "_" + args['test_score_method']

    header_score_file = out_file_test + '.detail'

    print("out_file_test:", out_file_test)
    print("header_score_file:", header_score_file)

    del_path(out_file_test)
    del_path(header_score_file)

    for test_data_path in all_test_data_path:
        tables = lookup.LookupTables(wnd_conf)
        ## read validation data
        test_labels, test_header, test_mask, test_features = tfrecord.get_val_test_batch(
            file_path=test_data_path,
            EPOCH_NUM=1,
            batch_size=wnd_conf[MODEL][TEST_BATCH_SIZE],
            wnd_conf=wnd_conf,
            lookup_tables=tables)

        ## model
        inf = inference.Inference(wnd_conf)

        ## test
        with tf.variable_scope("DnnModel"):
            test_eval_logits = inf.inference(test_features, is_train=False)
            if (wnd_conf.is_unbias_model):
                test_loss = inf.loss_multi_task_unbias(
                    test_eval_logits,
                    test_labels,
                    test_mask,
                    is_train=False,
                    loss_unbias_method=wnd_conf[MODEL][loss_unbias_method],
                    loss_ctr_rel_method=wnd_conf[MODEL][LOSS_CTR_REL_METHOD])
            else:
                test_loss = inf.loss_multi_task(
                    test_eval_logits,
                    test_labels,
                    test_mask,
                    is_train=False,
                    propensity_weight_mul=test_features[
                        "propensity_weight_mul"])

            if (wnd_conf.is_unbias_model):
                y_rel, y_bias = test_eval_logits
                # click_logit, order_logit = y_rel
                # test_score_method: 'rel' or 'ctr'
                if (test_score_method == "rel"):
                    p_ctr, p_cvr = cal_ctr_cvr(y_rel)
                else:
                    (p_ctr, p_cvr) = cal_ctr_cvr_unibas(
                        y_rel,
                        y_bias,
                        loss_unbias_method=wnd_conf[MODEL][loss_unbias_method])
            else:
                p_ctr, p_cvr = cal_ctr_cvr(test_eval_logits)

        # test
        test_click_logits_sigmoid = p_ctr
        test_click_predict = tf.cast(
            tf.greater(test_click_logits_sigmoid, tf.constant(0.5)),
            tf.float32)

        test_order_logits_sigmoid = p_cvr
        test_order_predict = tf.cast(
            tf.greater(test_order_logits_sigmoid, tf.constant(0.5)),
            tf.float32)

        # test metrics
        test_metrics_var_scope = "test_metrics"

        test_click_precision, test_click_precision_op = tf.metrics.precision(
            labels=tf.reduce_sum(test_mask[:, 1:5], axis=-1),
            predictions=test_click_predict[:, 0],
            name=test_metrics_var_scope)
        test_click_recall, test_click_recall_op = tf.metrics.recall(
            labels=tf.reduce_sum(test_mask[:, 1:5], axis=-1),
            predictions=test_click_predict[:, 0],
            name=test_metrics_var_scope)
        test_click_auc, test_click_auc_op = tf.metrics.auc(
            labels=tf.reduce_sum(test_mask[:, 1:5], axis=-1),
            predictions=test_click_logits_sigmoid[:, 0],
            name=test_metrics_var_scope)

        test_order_precision, test_order_precision_op = tf.metrics.precision(
            labels=tf.add(test_mask[:, 3], test_mask[:, 4]),
            predictions=test_order_predict[:, 0],
            name=test_metrics_var_scope)
        test_order_recall, test_order_recall_op = tf.metrics.recall(
            labels=tf.add(test_mask[:, 3], test_mask[:, 4]),
            predictions=test_order_predict[:, 0],
            name=test_metrics_var_scope)
        test_order_auc, test_order_auc_op = tf.metrics.auc(
            labels=tf.add(test_mask[:, 3], test_mask[:, 4]),
            predictions=test_order_logits_sigmoid[:, 0],
            name=test_metrics_var_scope)
        if "mmoe" in wnd_conf[MODEL][MODEL_TYPE]:
            click_weight = tf.get_default_graph().get_tensor_by_name(
                'DnnModel/mmoe_layers/gates-0/gates-layer-0/Softmax:0')
            order_weight = tf.get_default_graph().get_tensor_by_name(
                'DnnModel/mmoe_layers/gates-1/gates-layer-0/Softmax:0')

        test_mean_loss, test_mean_loss_op = tf.metrics.mean( \
            values=test_loss, name=test_metrics_var_scope)

        # test metric init op
        test_metrics_vars = tf.get_collection( \
            tf.GraphKeys.LOCAL_VARIABLES, scope=test_metrics_var_scope)
        test_metrics_init_op = tf.variables_initializer( \
            var_list=test_metrics_vars, name='test_metrics_init')

        tf.summary.scalar('test_click_precision', test_click_precision)
        tf.summary.scalar('test_click_recall', test_click_recall)
        tf.summary.scalar('test_click_auc', test_click_auc)

        tf.summary.scalar('test_order_precision', test_order_precision)
        tf.summary.scalar('test_order_recall', test_order_recall)
        tf.summary.scalar('test_order_auc', test_order_auc)

        ## saver
        saver = tf.train.Saver()

        ## gpu settings
        os.environ['CUDA_VISIBLE_DEVICES'] = wnd_conf[MODEL][GPU_VISIBLE]
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # config.allow_soft_placement = True

        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(tf.tables_initializer())

            model_ckpt = wnd_conf[PATH][MODEL_PATH] + ckpt_name

            restore_session_from_checkpoint(sess, saver, model_ckpt)
            sess.run(test_metrics_init_op)

            test_click_logits_sigmoid_list = []
            test_order_logits_sigmoid_list = []

            header_list = []
            feed_batch_num = 0
            if "mmoe" in wnd_conf[MODEL][MODEL_TYPE]:
                click_weight_value_list = []
                order_weight_value_list = []
            try:
                while True:
                    # check data feed
                    feed_batch_num = feed_batch_num + 1
                    print('*' * 70)
                    print("predicting data of feed_batch_num:", feed_batch_num)
                    if "mmoe" in wnd_conf[MODEL][MODEL_TYPE]:
                        test_click_logits_sigmoid_values, \
                        test_order_logits_sigmoid_values, \
                        test_header_values, _, _, _, _, _, _, _, click_weight_value, order_weight_value = sess.run(
                            [test_click_logits_sigmoid,
                             test_order_logits_sigmoid,
                             test_header,
                             test_click_precision_op,
                             test_click_recall_op,
                             test_click_auc_op,
                             test_order_precision_op,
                             test_order_recall_op,
                             test_order_auc_op,
                             test_mean_loss_op,
                             click_weight,
                             order_weight])
                    else:
                        test_click_logits_sigmoid_values, \
                        test_order_logits_sigmoid_values, \
                        test_header_values, _, _, _, _, _, _, _ = sess.run(
                            [test_click_logits_sigmoid,
                             test_order_logits_sigmoid,
                             test_header,
                             test_click_precision_op,
                             test_click_recall_op,
                             test_click_auc_op,
                             test_order_precision_op,
                             test_order_recall_op,
                             test_order_auc_op,
                             test_mean_loss_op])

                    # test_eval_logits_sigmoid_values is a ndarray of shape (batch_size, 1)
                    test_click_logits_sigmoid_list.extend(
                        test_click_logits_sigmoid_values[:, 0])
                    test_order_logits_sigmoid_list.extend(
                        test_order_logits_sigmoid_values[:, 0])
                    header_list.extend(test_header_values)
                    if "mmoe" in wnd_conf[MODEL][MODEL_TYPE]:
                        click_weight_value_list.extend(click_weight_value)
                        order_weight_value_list.extend(order_weight_value)

                    test_click_precision_value, test_click_recall_value, test_click_auc_value, \
                    test_order_precision_value, test_order_recall_value, test_order_auc_value, test_loss_value = \
                        sess.run([test_click_precision, test_click_recall, test_click_auc,
                                  test_order_precision, test_order_recall, test_order_auc,
                                  test_mean_loss])
                    print("test_click_precision   :",
                          test_click_precision_value)
                    print("test_click_recall      :", test_click_recall_value)
                    print("test_click_auc         :", test_click_auc_value)
                    print("test_order_precision   :",
                          test_order_precision_value)
                    print("test_order_recall      :", test_order_recall_value)
                    print("test_order_auc         :", test_order_auc_value)
                    print("test_loss        :", test_loss_value)

                # break
            except tf.errors.OutOfRangeError:
                print(
                    "-----------------------------OutOfRangeError---------------------------------"
                )

            test_metrics_str = "test_data_path:" + str(
                test_data_path) + "\n" + "test_click_precision:" + str(
                    test_click_precision_value
                ) + "\n" + "test_click_recall:" + str(
                    test_click_recall_value) + "\n" + "test_click_auc:" + str(
                        test_click_auc_value
                    ) + "\n" + "test_order_precision:" + str(
                        test_order_precision_value
                    ) + "\n" + "test_order_recall:" + str(
                        test_order_recall_value
                    ) + "\n" + "test_order_auc:" + str(
                        test_order_auc_value) + "\n" + "test_loss:" + str(
                            test_loss_value) + "\n"

            log_to_file(test_metrics_str, out_file_test)

            offline_metrics_str = "add clk_score/ord_socre: 1/1..."
            log_to_file(offline_metrics_str, out_file_test)
            print(offline_metrics_str)
            total_logit = []
            for i, k in zip(test_click_logits_sigmoid_list,
                            test_order_logits_sigmoid_list):
                total_logit.append(i + k)
            metric_sets, at_list = metrics.get_offline_metrics(
                wnd_conf[SCHEMA][HEADER_SCHEMA], header_list, total_logit)
            for action, metric in metric_sets.items():
                metric_pre, metric_mrr = metric
                offline_metrics_str = ''

                metric_threshlod_pair = zip(at_list, metric_pre)
                for tuple0, tuple1 in metric_threshlod_pair:
                    offline_metrics_str += "action_{a}_pre_at_{n}: {m}\n".format(
                        a=action, n=tuple0, m=tuple1)

                offline_metrics_str += "\n"

                metric_threshlod_pair = zip(at_list, metric_mrr)
                for tuple0, tuple1 in metric_threshlod_pair:
                    offline_metrics_str += "action_{a}_mrr_at_{n}: {m}\n".format(
                        a=action, n=tuple0, m=tuple1)

                offline_metrics_str += "\n"

                log_to_file(offline_metrics_str, out_file_test)
                print(offline_metrics_str)

            metric_sets = metrics.get_offline_metrics_auc(
                wnd_conf[SCHEMA][HEADER_SCHEMA], header_list, total_logit)
            for action, metric in metric_sets.items():
                offline_metrics_str = ''
                offline_metrics_str += "action_{a}_auc: {m}\n".format(
                    a=action, m=metric[0])
                log_to_file(offline_metrics_str, out_file_test)
                print(offline_metrics_str)

            print("================== process file==================")

            version = wnd_conf.tag + '_' + test_tag
            checkpoint = 'ckpt-' + ckpt_name.split('-')[-1]

            metrics3.save_to_local(wnd_conf[SCHEMA][HEADER_SCHEMA],
                                   header_list, test_click_logits_sigmoid_list,
                                   test_order_logits_sigmoid_list,
                                   out_file_test, version, checkpoint)
            if "mmoe" in wnd_conf[MODEL][MODEL_TYPE]:
                metrics3.save_weights_to_local(click_weight_value_list,
                                               order_weight_value_list)
            print("====================DONE========")
            df = pd.read_csv("./res/{0}_test_{1}.csv".format(
                version, checkpoint))
            # out_name = "./res/{0}_result_{1}.txt".format(version, checkpoint)
            out_name = out_file_test
            metrics2.get_offline_metrics(df, out_name)
            print("finish", out_name)
            sys.exit(0)

        tf.reset_default_graph()
Esempio n. 4
0
def train(wnd_conf, ckpt_name=None):
    print("If this is the training process:", wnd_conf[INFO][TYPE] == 'train')
    tables = lookup.LookupTables(wnd_conf)

    ## read train data
    per_tower_data = tfrecord.get_multi_towers_batch(wnd_conf,
                                                     lookup_tables=tables)

    ## step count and learning rate
    step = 0
    if "current" not in ckpt_name:
        step = int(ckpt_name.split("-")[1])
    global_step = tf.Variable(step, name="global_step", trainable=False)

    # decay learning rate
    learning_rate = tf.train.piecewise_constant(global_step,
                                                wnd_conf[MODEL][STEP_BOUNDARY],
                                                wnd_conf[MODEL][LEARNING_RATE])

    ## inference and get an optimizer that performs gradient descent.
    inf = inference.Inference(wnd_conf)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        opt = inf.get_optimizer(wnd_conf[MODEL][OPTIMIZER], learning_rate)

    # Calculate the predict for each model tower.
    tower_eval_predict = []
    tower_eval_predict_score = []
    tower_labels = []

    tower_click_predict_score = []
    tower_order_predict_score = []

    tower_click_label_predict = []
    tower_order_label_predict = []

    # Calculate the gradients for each model tower.
    tower_grads = []
    tower_losses = []
    for gpu in wnd_conf[MODEL][GPU_VISIBLE].split(','):
        with tf.device('/gpu:%s' % gpu):
            with tf.variable_scope("DnnModel", reuse=(int(gpu) > 0)):
                # get one batch for the GPU
                tower_batch_labels, tower_batch_mask, tower_batch_features = per_tower_data[
                    gpu]

                tower_train_logits = inf.inference(tower_batch_features,
                                                   is_train=True)
                if (wnd_conf.is_unbias_model):
                    tower_train_loss = inf.loss_multi_task_unbias(
                        tower_train_logits,
                        tower_batch_labels,
                        tower_batch_mask,
                        is_train=True,
                        loss_unbias_method=wnd_conf[MODEL][loss_unbias_method],
                        loss_ctr_rel_method=wnd_conf[MODEL]
                        [LOSS_CTR_REL_METHOD])
                else:
                    tower_train_loss = inf.loss_multi_task(
                        tower_train_logits,
                        tower_batch_labels,
                        tower_batch_mask,
                        is_train=True,
                        propensity_weight_mul=tower_batch_features[
                            "propensity_weight_mul"])
                print("tower_train_loss:", tower_train_loss)

                if (wnd_conf.is_unbias_model):
                    y_rel, y_bias = tower_train_logits
                    click_logit, order_logit = y_rel
                    (p_ctr, p_cvr) = cal_ctr_cvr_unibas(
                        y_rel,
                        y_bias,
                        loss_unbias_method=wnd_conf[MODEL][loss_unbias_method])
                else:
                    p_ctr, p_cvr = cal_ctr_cvr(tower_train_logits)

                if (wnd_conf[MODEL][WND_WD] > 0.00001):
                    tower_train_loss = tower_train_loss + inf.l2_norm(
                        tower_batch_features)

                # Reuse variables for the next tower.
                tf.get_variable_scope().reuse_variables()

                # Calculate the gradients for the batch of data on this tower.
                grads = opt.compute_gradients(tower_train_loss)

                # Keep track of the gradients across all towers.
                tower_grads.append(grads)

                # Keep track of the loss across all towers
                tower_losses.append(tf.expand_dims(tower_train_loss, 0))

                tower_train_eval_click_sigmoid = p_ctr
                tower_click_predict = tf.cast(
                    tf.greater(tower_train_eval_click_sigmoid,
                               tf.constant(0.5)), tf.float32)
                tower_click_predict_score.append(
                    tower_train_eval_click_sigmoid)
                tower_click_label_predict.append(tower_click_predict)

                tower_train_eval_order_sigmoid = p_cvr
                tower_order_predict = tf.cast(
                    tf.greater(tower_train_eval_order_sigmoid,
                               tf.constant(0.5)), tf.float32)
                tower_order_predict_score.append(
                    tower_train_eval_order_sigmoid)
                tower_order_label_predict.append(tower_order_predict)

                tower_labels.append(tower_batch_mask)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)
    batch_train_loss = average_losses(tower_losses)

    # Apply the gradients to adjust the shared variables.
    train_op = opt.apply_gradients(grads, global_step=global_step)

    train_click_predict = tf.concat(axis=0, values=tower_click_label_predict)
    train_click_predict_score = tf.concat(axis=0,
                                          values=tower_click_predict_score)

    train_order_predict = tf.concat(axis=0, values=tower_order_label_predict)
    train_order_predict_score = tf.concat(axis=0,
                                          values=tower_order_predict_score)

    batch_labels = tf.concat(axis=0, values=tower_labels)

    # train metrics
    train_metrics_var_scope = "train_metrics"
    train_mean_loss, train_mean_loss_op = tf.metrics.mean(
        values=batch_train_loss, name=train_metrics_var_scope)

    train_click_precision, train_click_precision_op = tf.metrics.precision(
        labels=tf.reduce_sum(batch_labels[:, 1:5], axis=-1),
        predictions=train_click_predict[:, 0],
        name=train_metrics_var_scope)
    train_click_recall, train_click_recall_op = tf.metrics.recall(
        labels=tf.reduce_sum(batch_labels[:, 1:5], axis=-1),
        predictions=train_click_predict[:, 0],
        name=train_metrics_var_scope)
    train_click_auc, train_click_auc_op = tf.metrics.auc(
        labels=tf.reduce_sum(batch_labels[:, 1:5], axis=-1),
        predictions=train_click_predict_score[:, 0],
        name=train_metrics_var_scope)

    train_order_precision, train_order_precision_op = tf.metrics.precision(
        labels=tf.add(batch_labels[:, 3], batch_labels[:, 4]),
        predictions=train_order_predict[:, 0],
        name=train_metrics_var_scope)
    train_order_recall, train_order_recall_op = tf.metrics.recall(
        labels=tf.add(batch_labels[:, 3], batch_labels[:, 4]),
        predictions=train_order_predict[:, 0],
        name=train_metrics_var_scope)
    train_order_auc, train_order_auc_op = tf.metrics.auc(
        labels=tf.add(batch_labels[:, 3], batch_labels[:, 4]),
        predictions=train_order_predict_score[:, 0],
        name=train_metrics_var_scope)

    tf.summary.scalar('train_click_precision', train_click_precision)
    tf.summary.scalar('train_click_recall', train_click_recall)
    tf.summary.scalar('train_click_auc', train_click_auc)

    tf.summary.scalar('train_order_precision', train_order_precision)
    tf.summary.scalar('train_order_recall', train_order_recall)
    tf.summary.scalar('train_order_auc', train_order_auc)

    # train metrics init op
    train_metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                           scope=train_metrics_var_scope)
    train_metrics_init_op = tf.variables_initializer(
        var_list=train_metrics_vars, name='train_metrics_init')

    ## merge all summaries
    summary_op = tf.summary.merge_all()

    var_list = [var for var in tf.global_variables() if "moving" in var.name]
    var_list += tf.trainable_variables()
    # max_to_keep=0 means to save all checkpoint files
    saver = tf.train.Saver(var_list=var_list, max_to_keep=0)

    ## gpu settings
    os.environ['CUDA_VISIBLE_DEVICES'] = wnd_conf[MODEL][GPU_VISIBLE]
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    ## gpu number
    gpu_num = len(wnd_conf[MODEL][GPU_VISIBLE].split(','))

    # set the total_data_num to calculate current_epoch_num
    total_data_num = wnd_conf[MODEL][TOTAL_EXAMPLE_NUM]
    current_epoch_num = -1

    if wnd_conf[PARAMETER][LOSS_WEIGHT_METHOD] == 'uncertainty':
        click_weight = tf.get_default_graph().get_tensor_by_name(
            'DnnModel/uncertainty_click_weight:0')
        order_weight = tf.get_default_graph().get_tensor_by_name(
            'DnnModel/uncertainty_order_weight:0')

    with tf.Session(config=config) as sess:
        train_tensorboard_path = wnd_conf[PATH][SUMMARY_PATH] + "/train/"
        # if not os.path.exists(train_tensorboard_path):
        #    os.makedirs(train_tensorboard_path)
        writer = tf.summary.FileWriter(train_tensorboard_path, sess.graph)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())
        # reset all train metrics to be zero

        # print model variables
        print("print model variables:")
        tvars = tf.trainable_variables()
        tvars_vals = sess.run(tvars)

        for var, val in zip(tvars, tvars_vals):
            print(var.name,
                  val)  # Prints the name of the variable alongside its value.

        sess.run(train_metrics_init_op)
        if wnd_conf[MODEL][MODEL_TYPE] == "embed_mlp" or wnd_conf[MODEL][
                MODEL_TYPE] == "embed_mlp_recall":
            inf.embedding_update(sess)

        if ckpt_name != 'model.ckpt-0':
            cur_model_ckpt = wnd_conf[PATH][MODEL_PATH] + ckpt_name
            saver.restore(sess, cur_model_ckpt)
            print("load ckpt for train : %s" % cur_model_ckpt)
        else:
            del_path(wnd_conf[PATH][MODEL_PATH])

        sys.stdout.write("[%s] start training\n" %
                         datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        try:
            while step < wnd_conf[MODEL][MAX_ITER_STEP]:
                start_time = time.time()
                # reset all train metrics to be zero
                temp_epoch_num = int(
                    (wnd_conf[MODEL][BATCH_SIZE] * gpu_num * step) /
                    total_data_num) + 1
                if temp_epoch_num != current_epoch_num:
                    current_epoch_num = temp_epoch_num
                    print('*' * 210)
                    print('>> Current epoch num:', current_epoch_num)
                    print('*' * 210)

                batch_loss_value, _, step, _, _, _, _, _, _, _ = sess.run([
                    batch_train_loss, train_op, global_step,
                    train_mean_loss_op, train_click_precision_op,
                    train_click_recall_op, train_click_auc_op,
                    train_order_precision_op, train_order_recall_op,
                    train_order_auc_op
                ])
                if wnd_conf[PARAMETER][LOSS_WEIGHT_METHOD] == 'uncertainty':
                    train_click_precision_value, train_click_recall_value, train_click_auc_value, \
                    train_order_precision_value, train_order_recall_value, train_order_auc_value, train_mean_loss_value, \
                    click_weight_value, order_weight_value = \
                        sess.run([train_click_precision, train_click_recall, train_click_auc,
                                  train_order_precision, train_order_recall, train_order_auc, train_mean_loss,
                                  click_weight, order_weight])
                    sys.stdout.write(
                        "[%s] spent time: %f | batch_train_loss: %f | mean_train_loss: %f | train_click_precision: %f "
                        "| train_click_recall: %f | train_click_auc: %f | "
                        "train_order_precision: %f | "
                        "train_order_recall: %f | train_order_auc: %f | click_weight: %f |"
                        "order_weight: %f  |--- iter: %d | \n" % \
                        (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), time.time() - start_time,
                         batch_loss_value, train_mean_loss_value,
                         train_click_precision_value, train_click_recall_value, train_click_auc_value,
                         train_order_precision_value, train_order_recall_value, train_order_auc_value,
                         click_weight_value, order_weight_value, step))
                else:
                    train_click_precision_value, train_click_recall_value, train_click_auc_value, \
                    train_order_precision_value, train_order_recall_value, train_order_auc_value, train_mean_loss_value, \
                        = \
                        sess.run([train_click_precision, train_click_recall, train_click_auc,
                                  train_order_precision, train_order_recall, train_order_auc, train_mean_loss
                                  ])

                    sys.stdout.write(
                        "[%s] spent time: %f | batch_train_loss: %f | mean_train_loss: %f | train_click_precision: %f | train_click_recall: %f | train_click_auc: %f |train_order_precision: %f | train_order_recall: %f | train_order_auc: %f | iter: %d\n" % \
                        (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), time.time() - start_time,
                         batch_loss_value, train_mean_loss_value,
                         train_click_precision_value, train_click_recall_value, train_click_auc_value,
                         train_order_precision_value, train_order_recall_value, train_order_auc_value,
                         step))

                sys.stdout.write("=" * 100)

                if step == 1 or step % wnd_conf[MODEL][VALIDATE_STEP] == 0:
                    summary = sess.run(summary_op)
                    writer.add_summary(summary, step)

                    metrics_str = "%s" % ('*' * 70) + "\n" + \
                                  ">> iter_steps:" + str(step) + "\n" + \
                                  "batch_train_loss:" + str(batch_loss_value) + "\n" + \
                                  "mean_train_loss:" + str(train_mean_loss_value) + "\n" + \
                                  "train_click_precision:" + str(train_click_precision_value) + "\n" + \
                                  "train_click_recall:" + str(train_click_recall_value) + "\n" + \
                                  "train_click_auc:" + str(train_click_auc_value) + "\n" + \
                                  "train_order_precision:" + str(train_order_precision_value) + "\n" + \
                                  "train_order_recall:" + str(train_order_recall_value) + "\n" + \
                                  "train_order_auc:" + str(train_order_auc_value) + "\n"

                    log_to_file(metrics_str, wnd_conf[PATH][TRAIN_RESULT])

                    print(
                        "\n[%s] model saving..." %
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                    saver.save(sess,
                               wnd_conf[PATH][MODEL_PATH] + 'model.ckpt',
                               global_step=step)
                    create_file(wnd_conf[PATH][MODEL_PATH],
                                'step-%d.model.DONE' % step)

        except tf.errors.OutOfRangeError:
            print(
                "-----------------------------OutOfRangeError---------------------------------"
            )
            print("TrainEnd!")
        writer.close()
        saver.save(sess,
                   wnd_conf[PATH][MODEL_PATH] + 'model.ckpt',
                   global_step=step)
        create_file(wnd_conf[PATH][MODEL_PATH], 'step-%d.model.DONE' % step)