Beispiel #1
0
def get_adanet_model():
    """Build an AdaNet binary-classification estimator for imp prediction.

    Reads module-level configuration (OUTPUT_DIR, CONFIG, RANDOM_SEED,
    ADANET_LEARNING_RATE, NUM_EPOCHS, ADANET_ITERATIONS, BATCH_SIZE,
    EVAL_STEPS) and the `bidding_data` input helpers.

    Returns:
        An `adanet.Estimator` ready for training/evaluation.
    """
    # Estimator configuration.
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=100,
        save_summary_steps=100,
        tf_random_seed=RANDOM_SEED)

    # Search space: simple fully-connected DNN subnetworks.
    generator = simple_dnn.Generator(
        learn_mixture_weights=True,
        dropout=CONFIG["DROPOUT"],
        feature_columns=(
            bidding_data.get_feature_columns_for_imp_prediction()),
        optimizer=tf.train.RMSPropOptimizer(
            learning_rate=ADANET_LEARNING_RATE),
        seed=RANDOM_SEED)

    def _eval_input_fn():
        # Validation data used by AdaNet to compare candidate ensembles.
        return bidding_data.validation_input_fn_for_predict_imp(
            batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS)

    evaluator = adanet.Evaluator(input_fn=_eval_input_fn, steps=EVAL_STEPS)

    return adanet.Estimator(
        model_dir=OUTPUT_DIR,
        # adanet_loss_decay=0.99,
        head=tf.contrib.estimator.binary_classification_head(),
        subnetwork_generator=generator,
        max_iteration_steps=NUM_EPOCHS // ADANET_ITERATIONS,
        evaluator=evaluator,
        config=run_config)
Beispiel #2
0
def get_adanet_model():
    """Build an AdaNet binary-classification estimator for wr prediction.

    Reads module-level configuration (OUTPUT_DIR, NUM_EPOCHS, BATCH_SIZE,
    EVAL_STEPS) and the `bidding_data` input helpers.

    Returns:
        An `adanet.Estimator` ready for training/evaluation.
    """
    LEARNING_RATE = 0.003  #@param {type:"number"}
    TRAIN_STEPS = NUM_EPOCHS  #@param {type:"integer"}
    # BATCH_SIZE = 64  #@param {type:"integer"}
    ADANET_ITERATIONS = 8  #@param {type:"integer"}

    RANDOM_SEED = 42
    # Estimator configuration.
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=100,
                                        save_summary_steps=100,
                                        tf_random_seed=RANDOM_SEED)
    # NOTE(review): the original bound this object to both `classifier` and
    # `estimator`; the second alias was never used and has been removed.
    classifier = adanet.Estimator(
        model_dir=OUTPUT_DIR,
        adanet_loss_decay=0.99,
        head=tf.contrib.estimator.binary_classification_head(),
        subnetwork_generator=simple_dnn.Generator(
            learn_mixture_weights=True,
            dropout=0.5,
            feature_columns=bidding_data.get_feature_columns_for_wr_prediction(
            ),
            optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            seed=RANDOM_SEED),
        max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
        evaluator=adanet.Evaluator(
            input_fn=lambda: bidding_data.validation_input_fn_for_predict_wr(
                batch_size=BATCH_SIZE, num_epochs=TRAIN_STEPS),
            steps=EVAL_STEPS),
        config=run_config)

    return classifier
Beispiel #3
0
 def test_constructor_errors(self,
                             feature_columns,
                             layer_size=3,
                             initial_num_layers=0):
     """Generator construction with these arguments must raise ValueError."""
     generator_kwargs = dict(
         feature_columns=feature_columns,
         optimizer=tf.train.GradientDescentOptimizer(.1),
         layer_size=layer_size,
         initial_num_layers=initial_num_layers,
     )
     with self.assertRaises(ValueError):
         simple_dnn.Generator(**generator_kwargs)
Beispiel #4
0
    def build_subnetwork_generator(self):
        """Create a simple_dnn.Generator over one 28x28x1 numeric feature."""
        image_column = tf.feature_column.numeric_column(self.FEATURE_KEY,
                                                        shape=[28, 28, 1])
        return simple_dnn.Generator(
            feature_columns=[image_column],
            optimizer=tf.train.AdamOptimizer(self.learning_rate),
            seed=SEED,
        )
Beispiel #5
0
def dnn_ada():
    """Train and evaluate an AdaNet simple-DNN ensemble on MNIST.

    Uses module-level globals: LOG_DIR, RANDOM_SEED, head, feature_columns,
    input_fn and time_str. Prints accuracy, average loss and wall-clock time.
    """
    print("==============================================")
    start = datetime.datetime.now()
    print("Start Train Adanet with [DNN Model] on Mnist at %s" %
          time_str(start))
    print("- - - - - - - - - - - - - - - - - - - - - - - -")

    # Hyper-parameters for this run.
    learning_rate = 0.003
    train_steps = 5000
    batch_size = 64
    adanet_iterations = 2

    model_dir = os.path.join(LOG_DIR, "dnn_%s" % time_str(start))

    run_config = tf.estimator.RunConfig(save_checkpoints_steps=50000,
                                        save_summary_steps=50000,
                                        tf_random_seed=RANDOM_SEED,
                                        model_dir=model_dir)

    # Candidate subnetworks are simple fully-connected DNNs.
    generator = simple_dnn.Generator(
        feature_columns=feature_columns,
        optimizer=tf.train.RMSPropOptimizer(learning_rate=learning_rate),
        seed=RANDOM_SEED)
    # Candidate ensembles are compared on the training split.
    evaluator = adanet.Evaluator(
        input_fn=input_fn("train", training=False, batch_size=batch_size),
        steps=None)
    estimator = adanet.Estimator(
        head=head,
        subnetwork_generator=generator,
        max_iteration_steps=train_steps // adanet_iterations,
        evaluator=evaluator,
        config=run_config)

    train_spec = tf.estimator.TrainSpec(
        input_fn=input_fn("train", training=True, batch_size=batch_size),
        max_steps=train_steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=input_fn("test", training=False, batch_size=batch_size),
        steps=None)
    results, _ = tf.estimator.train_and_evaluate(estimator,
                                                 train_spec=train_spec,
                                                 eval_spec=eval_spec)

    print("Accuracy:", results["accuracy"])
    print("Loss:", results["average_loss"])

    end = datetime.datetime.now()
    print("Training end at %s" % time_str(end))
    print("Time Spend %s" % str(end - start))
    print("==============================================")
Beispiel #6
0
    def get_estimator(self):
        """Assemble an adanet.Estimator from this object's configuration."""
        # Candidate subnetworks: simple fully-connected DNNs.
        generator = simple_dnn.Generator(
            feature_columns=self.feature_columns,
            optimizer=tf.train.RMSPropOptimizer(
                learning_rate=self.LEARNING_RATE),
            seed=self.RANDOM_SEED)
        # Candidate ensembles are compared on the full training split.
        evaluator = adanet.Evaluator(
            input_fn=self.input_fn("train",
                                   training=False,
                                   batch_size=self.BATCH_SIZE,
                                   RANDOM_SEED=self.RANDOM_SEED),
            steps=None)
        return adanet.Estimator(
            head=self.head,
            subnetwork_generator=generator,
            max_iteration_steps=self.TRAIN_STEPS // self.ADANET_ITERATIONS,
            evaluator=evaluator,
            config=self.config)
Beispiel #7
0
def get_adanet_model():
    """Build an AdaNet estimator with a multi-label head.

    Reads module-level configuration (RANDOM_SEED, OUTPUT_DIR, CONFIG,
    ADANET_LEARNING_RATE, NUM_EPOCHS, ADANET_ITERATIONS, BATCH_SIZE,
    EVAL_STEPS) and the `data` input helpers.

    Returns:
        An `adanet.Estimator` ready for training/evaluation.
    """
    # Estimator configuration.
    # distribution_strategy = tf.contrib.distribute.MirroredStrategy()
    session_config = tf.ConfigProto(log_device_placement=True)
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.8

    run_config = tf.estimator.RunConfig(
        # train_distribute=distribution_strategy,
        # eval_distribute=distribution_strategy,
        session_config=session_config,
        save_checkpoints_steps=100,
        save_summary_steps=100,
        tf_random_seed=RANDOM_SEED)

    head = tf.contrib.estimator.multi_label_head(
        name="name",
        n_classes=len(CONFIG['LABELS']),
        # classes_for_class_based_metrics= [5,6]
    )

    # Search space: simple fully-connected DNN subnetworks.
    generator = simple_dnn.Generator(
        learn_mixture_weights=True,
        dropout=CONFIG["DROPOUT"],
        feature_columns=data.get_feature_columns(),
        optimizer=tf.train.AdamOptimizer(learning_rate=ADANET_LEARNING_RATE),
        seed=RANDOM_SEED)

    def _eval_input_fn():
        # Validation data used by AdaNet to compare candidate ensembles.
        return data.validation_input_fn(batch_size=BATCH_SIZE,
                                        num_epochs=NUM_EPOCHS)

    return adanet.Estimator(
        model_dir=OUTPUT_DIR,
        # metric_fn=custom_metrics,
        # adanet_loss_decay=0.99,
        head=head,
        subnetwork_generator=generator,
        max_iteration_steps=NUM_EPOCHS // ADANET_ITERATIONS,
        evaluator=adanet.Evaluator(input_fn=_eval_input_fn,
                                   steps=EVAL_STEPS),
        config=run_config)
Beispiel #8
0
    def test_generate_candidates(self,
                                 want_names,
                                 want_subnetwork_losses,
                                 want_mixture_weight_losses,
                                 want_complexities,
                                 learn_mixture_weights=False,
                                 initial_num_layers=0,
                                 previous_ensemble=None):
        """End-to-end check of simple_dnn.Generator candidate generation.

        For each generated candidate builder: builds its subnetwork graph,
        runs one subnetwork train step and one mixture-weight train step on
        a tiny two-example dataset, then compares the collected names,
        losses and complexities against the `want_*` expectations.
        """
        feature_columns = [tf.feature_column.numeric_column("x")]
        generator = simple_dnn.Generator(
            feature_columns=feature_columns,
            optimizer=tf.train.GradientDescentOptimizer(.1),
            layer_size=3,
            initial_num_layers=initial_num_layers,
            learn_mixture_weights=learn_mixture_weights,
            seed=42)
        with tf.Graph().as_default() as g:
            iteration_step = tf.train.create_global_step()
            # Fixed two-example batch: one negative, one positive label.
            features = {"x": [[1.], [2.]]}
            labels = tf.constant([[0.], [1.]])
            names = []
            subnetwork_losses = []
            mixture_weight_losses = []
            complexities = []
            for builder in generator.generate_candidates(
                    previous_ensemble,
                    # The following arguments are not used by
                    # simple_dnn.BuilderGenerator's generate_candidates.
                    iteration_number=0,
                    previous_ensemble_reports=[],
                    all_reports=[]):
                names.append(builder.name)

                # 1. Build subnetwork graph.
                subnetwork = builder.build_subnetwork(
                    features,
                    logits_dimension=1,
                    training=True,
                    iteration_step=iteration_step,
                    summary=tf.summary,
                    previous_ensemble=previous_ensemble)

                # 2. Build subnetwork train ops.
                subnetwork_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=subnetwork.logits, labels=labels))
                subnetwork_train_op = builder.build_subnetwork_train_op(
                    subnetwork,
                    subnetwork_loss,
                    var_list=None,
                    labels=labels,
                    iteration_step=iteration_step,
                    summary=tf.summary,
                    previous_ensemble=None)

                # 3. Build mixture weight train ops.

                # Stop gradients, since mixture-weight updates should not
                # propagate beyond the top layer.
                subnetwork_logits = tf.stop_gradient(subnetwork.logits)

                # Mixture weight will initialize to a one-valued scalar.
                mixture_weight_logits = tf.layers.dense(
                    subnetwork_logits,
                    units=1,
                    use_bias=False,
                    kernel_initializer=tf.ones_initializer())
                mixture_weight_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=mixture_weight_logits, labels=labels))
                mixture_weight_train_op = builder.build_mixture_weights_train_op(
                    mixture_weight_loss,
                    var_list=None,
                    labels=labels,
                    logits=mixture_weight_logits,
                    iteration_step=iteration_step,
                    summary=tf.summary)

                # Run exactly one train step per candidate and record the
                # pre-update losses and the candidate's complexity measure.
                with self.test_session(graph=g) as sess:
                    sess.run(tf.global_variables_initializer())
                    sess.run(subnetwork_train_op)
                    sess.run(mixture_weight_train_op)
                    subnetwork_losses.append(sess.run(subnetwork_loss))
                    mixture_weight_losses.append(sess.run(mixture_weight_loss))
                    complexities.append(sess.run(subnetwork.complexity))

        self.assertEqual(want_names, names)
        self.assertAllClose(want_subnetwork_losses,
                            subnetwork_losses,
                            atol=1e-3)
        self.assertAllClose(want_mixture_weight_losses,
                            mixture_weight_losses,
                            atol=1e-3)
        self.assertAllClose(want_complexities, complexities, atol=1e-3)
Beispiel #9
0
def map_fun(args, ctx):
    """Distributed AdaNet training entry point for one cluster task.

    Trains a binary-classification simple-DNN AdaNet ensemble on sparse
    libsvm-style text lines ("label idx:val idx:val ..."), then — on the
    chief only — writes predictions for the test and train sets and exports
    a SavedModel.

    Args:
        args: parsed command-line arguments; this function reads input_dim,
            batch_size, log_dir, export_dir, prediction_dir, data_dir and
            test_dir.
        ctx: cluster task context exposing worker_num/job_name/task_index,
            cluster_spec and absolute_path() — presumably a
            TensorFlowOnSpark-style context; confirm against the launcher.
    """
    from datetime import datetime
    import tensorflow as tf
    import os
    import time
    import json

    import adanet
    from adanet.examples import simple_dnn

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    message = "worker_num: {0}, job_name: {1}, task_index: {2}".format(
        worker_num, job_name, task_index)
    print(message)
    input_dim = int(args.input_dim)
    batch_size = args.batch_size
    # Fix Random Seed
    RANDOM_SEED = 42

    FEATURES_KEY = "features"

    NUM_CLASSES = 2

    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE

    # head = tf.contrib.estimator.multi_class_head(NUM_CLASSES, loss_reduction=loss_reduction)
    head = tf.contrib.estimator.binary_classification_head(
        loss_reduction=loss_reduction)

    # numeric_column does not support SparseTensor inputs.
    feature_columns = [
        tf.feature_column.numeric_column(key=FEATURES_KEY, shape=[input_dim])
    ]

    log_dir = ctx.absolute_path(args.log_dir)
    export_dir = ctx.absolute_path(args.export_dir)
    pred_dir = ctx.absolute_path(args.prediction_dir)
    print("tensorflow log path: {0}".format(log_dir))
    print("tensorflow export path: {0}".format(export_dir))
    print("tensorflow prediction path: {0}".format(pred_dir))

    def generator(ln):
        """Parse one libsvm-style text line into (features, label) tensors.

        The label is binarized: >= 1.0 maps to 1.0, otherwise 0.0.
        """
        splits = tf.string_split([ln], delimiter=" ")
        label = splits.values[0]
        label = tf.string_to_number(label, tf.float64)
        label = tf.cond(
            label >= 1.0,
            lambda: tf.constant(1, shape=[1], dtype=tf.float32),
            lambda: tf.constant(0, shape=[1], dtype=tf.float32),
        )

        # SparseTensor output
        # "idx:val" pairs; indices in the file are 1-based, hence the -1.
        col_val = tf.string_split(splits.values[1::], delimiter=":")
        col = tf.string_to_number(col_val.values[0::2], tf.int64) - 1

        vals = col_val.values[1::2]
        vals = tf.string_to_number(vals, tf.float32)

        # Keep only features whose column index falls below input_dim.
        vals = tf.boolean_mask(vals, col < input_dim)
        col = tf.boolean_mask(col, col < input_dim)

        row = tf.cast(tf.fill(tf.shape(col), 0), tf.int64, name="row_cast")
        row_col = tf.transpose(tf.stack([row, col]), name="row_col_transpose")

        sparse = tf.SparseTensor(row_col, vals, (1, input_dim))

        # Convert to dense; 191106: this conversion is required.
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}

        return features, label

    def new_input_fn(partition, training):
        """Return an input_fn over the "train" or test partition.

        When training, the train partition is shuffled and repeated
        indefinitely.
        """
        def _input_fn():
            # path is ok
            parse_fn = generator

            if partition == "train":
                data_dir = ctx.absolute_path(args.data_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)

                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)
                if training:
                    ds = ds.shuffle(batch_size * 5).repeat()
            else:
                data_dir = ctx.absolute_path(args.test_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)

                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)

            iterator = ds.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels

            # ds = ds.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
            # return ds.batch(batch_size)

        return _input_fn

    print("========= Start Training")
    LEARNING_RATE = 0.01
    TRAIN_STEPS = 3000
    ADANET_ITERATIONS = 3  # AKA Boosting Iteration
    # Controls model complexity.
    ADANET_LAMBDA = 0.1
    LEARN_MIXTURE_WEIGHTS = False

    #strategy = adanet.distributed.RoundRobinStrategy()

    # 191125: TF_CONFIG must be set here.
    tfc = json.dumps({
        "cluster": ctx.cluster_spec,
        "task": {
            "type": job_name,
            "index": task_index
        }
    })
    os.environ["TF_CONFIG"] = tfc

    # 191127: try without device_filter; once a strategy is used it is set
    # to /job:ps automatically, set it manually only when no strategy is used.
    config = tf.estimator.RunConfig(
        save_checkpoints_steps=5000,
        tf_random_seed=RANDOM_SEED,
        model_dir=log_dir,
    )

    # config = tf.estimator.RunConfig(
    #     save_checkpoints_steps=5000,
    #     tf_random_seed=RANDOM_SEED,
    #     model_dir=logdir,
    #     session_config=tf.ConfigProto(
    #         log_device_placement=False, device_filters=["/job:ps"]
    #     ),
    # )

    # BaseLine Linear
    # estimator = tf.estimator.LinearClassifier(
    #     feature_columns=feature_columns,
    #     n_classes=NUM_CLASSES,
    #     optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
    #     loss_reduction=loss_reduction,
    #     config=config
    # )

    # DNN TEST - ADANET
    estimator = adanet.Estimator(
        head=head,
        force_grow=True,
        subnetwork_generator=simple_dnn.Generator(
            layer_size=128,
            initial_num_layers=2,
            dropout=0.2,
            feature_columns=feature_columns,
            optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            learn_mixture_weights=LEARN_MIXTURE_WEIGHTS,
            seed=RANDOM_SEED,
        ),
        adanet_lambda=ADANET_LAMBDA,
        max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
        #evaluator=adanet.Evaluator(input_fn=new_input_fn("test", False)),
        evaluator=adanet.Evaluator(input_fn=new_input_fn("test", False),
                                   steps=1000),
        config=config,
        #experimental_placement_strategy=strategy,
        # Record reports; not actually useful in practice.
        #     report_materializer=adanet.ReportMaterializer(
        #         input_fn=new_input_fn("train", False),
        #     ),
    )

    # Intentionally return nothing; just run the computation.
    tf.estimator.train_and_evaluate(
        estimator,
        train_spec=tf.estimator.TrainSpec(input_fn=new_input_fn("train", True),
                                          max_steps=TRAIN_STEPS),
        # In a distributed setting this Eval spec actually has no effect.
        eval_spec=tf.estimator.EvalSpec(
            input_fn=new_input_fn("test", False),
            steps=None,
            start_delay_secs=1,
            throttle_secs=30,
        ),
    )

    # The last round only trains; parameters are saved to model.ckpt and no
    # preparation is made for a next round.

    # See https://github.com/tensorflow/adanet/blob/master/adanet/core/estimator_test.py
    # line 2362 def test_export_saved_model_always_uses_replication_placement(self):
    def serving_input_receiver_fn():
        """Serving receiver taking a dense float features placeholder."""
        serialized_sample = tf.compat.v1.placeholder(dtype=tf.float32,
                                                     shape=[None, input_dim],
                                                     name='features')
        tensor_features = {'features': serialized_sample}
        return tf.estimator.export.ServingInputReceiver(
            features=tensor_features, receiver_tensors=serialized_sample)

    # Cannot be executed under RoundRobinStrategy.
    if ctx.job_name == "chief":
        # Run predictions, for the test and train sets respectively.
        print('export test result')
        predictions = estimator.predict(new_input_fn("test", False))
        print('Writing Predictions to {}'.format(pred_dir))
        tf.gfile.MakeDirs(pred_dir)
        with tf.gfile.GFile("{}/test".format(pred_dir), 'w') as f:
            for pred in predictions:
                f.write(str(pred))
                f.write('\n')
        print('export train result')
        predictions = estimator.predict(new_input_fn("train", False))
        print('Writing Predictions to {}'.format(pred_dir))
        tf.gfile.MakeDirs(pred_dir)
        with tf.gfile.GFile("{}/train".format(pred_dir), 'w') as f:
            for pred in predictions:
                f.write(str(pred))
                f.write('\n')
        # Export the model.
        estimator.export_saved_model(
            export_dir,
            serving_input_receiver_fn,
            experimental_mode=tf.estimator.ModeKeys.PREDICT)
Beispiel #10
0
def map_fun(args, ctx):
    """Distributed AdaNet training entry point (weighted-loss variant).

    Like the companion map_fun, but uses a positive-class-weighted cross
    entropy loss (pos_weight=4) and a sparse-input serving signature.
    Trains a binary-classification simple-DNN AdaNet ensemble on sparse
    libsvm-style text lines and — on the chief only — exports a SavedModel.

    Args:
        args: parsed command-line arguments; this function reads input_dim,
            batch_size, log_dir, export_dir, prediction_dir, data_dir and
            test_dir.
        ctx: cluster task context exposing worker_num/job_name/task_index,
            cluster_spec and absolute_path() — presumably a
            TensorFlowOnSpark-style context; confirm against the launcher.
    """
    from datetime import datetime
    import tensorflow as tf
    import os
    import time
    import json

    import adanet
    from adanet.examples import simple_dnn

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    message = "worker_num: {0}, job_name: {1}, task_index: {2}".format(
        worker_num, job_name, task_index)
    print(message)
    input_dim = int(args.input_dim)
    batch_size = args.batch_size
    # Fix Random Seed
    RANDOM_SEED = 42

    FEATURES_KEY = "features"

    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE

    def weighted_cross_entropy_with_logits(labels, logits):
        """Cross entropy that up-weights the positive class by 4x."""
        return tf.nn.weighted_cross_entropy_with_logits(targets=labels,
                                                        logits=logits,
                                                        pos_weight=4)

    head = tf.contrib.estimator.binary_classification_head(
        loss_reduction=loss_reduction,
        loss_fn=weighted_cross_entropy_with_logits)

    # numeric_column does not support SparseTensor inputs.
    feature_columns = [
        tf.feature_column.numeric_column(key=FEATURES_KEY, shape=[input_dim])
    ]

    log_dir = ctx.absolute_path(args.log_dir)
    export_dir = ctx.absolute_path(args.export_dir)
    pred_dir = ctx.absolute_path(args.prediction_dir)
    print("tensorflow log path: {0}".format(log_dir))
    print("tensorflow export path: {0}".format(export_dir))
    print("tensorflow prediction path: {0}".format(pred_dir))

    def generator(ln):
        """Parse one libsvm-style text line into (features, label) tensors.

        The label is binarized: >= 1.0 maps to 1.0, otherwise 0.0.
        """
        splits = tf.string_split([ln], delimiter=" ")
        label = splits.values[0]
        label = tf.string_to_number(label, tf.float64)
        label = tf.cond(
            label >= 1.0,
            lambda: tf.constant(1, shape=[1], dtype=tf.float32),
            lambda: tf.constant(0, shape=[1], dtype=tf.float32),
        )

        # SparseTensor output
        # "idx:val" pairs; indices in the file are 1-based, hence the -1.
        col_val = tf.string_split(splits.values[1::], delimiter=":")
        col = tf.string_to_number(col_val.values[0::2], tf.int64) - 1

        vals = col_val.values[1::2]
        vals = tf.string_to_number(vals, tf.float32)

        # Keep only features whose column index falls below input_dim.
        vals = tf.boolean_mask(vals, col < input_dim)
        col = tf.boolean_mask(col, col < input_dim)

        row = tf.cast(tf.fill(tf.shape(col), 0), tf.int64, name="row_cast")
        row_col = tf.transpose(tf.stack([row, col]), name="row_col_transpose")

        sparse = tf.SparseTensor(row_col, vals, (1, input_dim))

        # Convert to dense; 191106: this conversion is required.
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}

        return features, label

    def new_input_fn(partition, training):
        """Return an input_fn over the "train" or test partition.

        When training, the train partition is shuffled and repeated
        indefinitely.
        """
        def _input_fn():
            # path is ok
            parse_fn = generator

            if partition == "train":
                data_dir = ctx.absolute_path(args.data_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)

                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)
                if training:
                    ds = ds.shuffle(batch_size * 5).repeat()
            else:
                data_dir = ctx.absolute_path(args.test_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)

                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)

            iterator = ds.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels

            # ds = ds.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
            # return ds.batch(batch_size)

        return _input_fn

    print("========= Start Training")
    LEARNING_RATE = 0.01
    TRAIN_STEPS = 1000
    ADANET_ITERATIONS = 4  # AKA Boosting Iteration
    # Controls model complexity.
    ADANET_LAMBDA = 0.1
    LEARN_MIXTURE_WEIGHTS = False

    #strategy = adanet.distributed.RoundRobinStrategy()

    # 191125: TF_CONFIG must be set here.
    tfc = json.dumps({
        "cluster": ctx.cluster_spec,
        "task": {
            "type": job_name,
            "index": task_index
        }
    })
    os.environ["TF_CONFIG"] = tfc

    # 191127: try without device_filter; once a strategy is used it is set
    # to /job:ps automatically, set it manually only when no strategy is used.
    config = tf.estimator.RunConfig(
        save_checkpoints_steps=5000,
        tf_random_seed=RANDOM_SEED,
        model_dir=log_dir,
    )

    # estimator = tf.estimator.LinearEstimator(
    #     head=head,
    #     feature_columns=feature_columns,
    #     config=config
    #
    # )

    # config = tf.estimator.RunConfig(
    #     save_checkpoints_steps=5000,
    #     tf_random_seed=RANDOM_SEED,
    #     model_dir=logdir,
    #     session_config=tf.ConfigProto(
    #         log_device_placement=False, device_filters=["/job:ps"]
    #     ),
    # )

    # DNN TEST - ADANET
    estimator = adanet.Estimator(
        head=head,
        force_grow=False,
        subnetwork_generator=simple_dnn.Generator(
            layer_size=128,
            initial_num_layers=1,
            dropout=0.2,
            feature_columns=feature_columns,
            optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            learn_mixture_weights=LEARN_MIXTURE_WEIGHTS,
            seed=RANDOM_SEED,
        ),
        adanet_lambda=ADANET_LAMBDA,
        max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
        evaluator=adanet.Evaluator(input_fn=new_input_fn("test", False),
                                   steps=1000),
        config=config,
    )

    # ensemble_estimator = adanet.AutoEnsembleEstimator(
    #     head=head,
    #     candidate_pool= lambda config: {
    #         "linear1":
    #             tf.estimator.LinearEstimator(
    #                 head=head,
    #                 feature_columns=feature_columns,
    #                 optimizer=tf.train.RMSPropOptimizer(learning_rate=0.1),
    #                 config=config,
    #             ),
    #         "dnn1":
    #             tf.estimator.DNNEstimator(
    #                 head=head,
    #                 feature_columns=feature_columns,
    #                 optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
    #                 hidden_units=[512, 256, 128],
    #                 config=config,
    #             ),
    #         "dnn2":
    #             tf.estimator.DNNEstimator(
    #                 head=head,
    #                 feature_columns=feature_columns,
    #                 optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
    #                 hidden_units=[256, 128],
    #                 config=config,
    #             ),
    #         "dnn_linear":
    #             tf.estimator.DNNLinearCombinedEstimator(
    #                 head=head,
    #                 dnn_feature_columns=feature_columns,
    #                 linear_feature_columns=feature_columns,
    #                 dnn_hidden_units=[512, 256, 128],
    #                 config=config,
    #             )
    #     },
    #     max_iteration_steps=100,
    # )

    cur_e = estimator

    # Intentionally return nothing; just run the computation.
    tf.estimator.train_and_evaluate(
        cur_e,
        train_spec=tf.estimator.TrainSpec(input_fn=new_input_fn("train", True),
                                          max_steps=TRAIN_STEPS),
        # In a distributed setting this Eval spec actually has no effect.
        eval_spec=tf.estimator.EvalSpec(
            input_fn=new_input_fn("test", False),
            steps=None,
            start_delay_secs=1,
            throttle_secs=30,
        ),
    )

    # The last round only trains; parameters are saved to model.ckpt and no
    # preparation is made for a next round.
    # The export style below requires the input to be an Example proto,
    # which does not suit the DSP input:
    # feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
    # serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

    def serving_input_receiver_fn():
        """Serving receiver taking sparse (indices, values, dense_shape) inputs."""
        indices = tf.placeholder(dtype=tf.int64,
                                 shape=[None, None],
                                 name='indices')
        values = tf.placeholder(dtype=tf.float32, shape=[None], name='values')
        shape = tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='dense_shape')
        receiver_input = {
            'indices': indices,
            'values': values,
            'dense_shape': shape
        }
        # Build a SparseTensor first, then sparse_to_dense.
        sparse = tf.SparseTensor(indices, values, shape)
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}

        return tf.estimator.export.ServingInputReceiver(
            features, receiver_input)

    # Cannot be executed under RoundRobinStrategy.
    if ctx.job_name == "chief":
        # Running evaluate is fairly slow, so it is skipped.

        # predictions = cur_e.predict(new_input_fn("test", False))
        # result = cur_e.evaluate(new_input_fn("test", False))
        # with tf.gfile.GFile("{}/evaluate".format(log_dir), 'w') as f:
        #     f.write(str(result))
        #     f.write('\n')
        # Run predictions, for the test and train sets respectively.
        # print('export test result')
        # predictions = estimator.predict(new_input_fn("test", False))
        # print('Writing Predictions to {}'.format(pred_dir))
        # tf.gfile.MakeDirs(pred_dir)
        # with tf.gfile.GFile("{}/test".format(pred_dir), 'w') as f:
        #     for pred in predictions:
        #         f.write(str(pred))
        #         f.write('\n')
        # print('export train result')
        # predictions = estimator.predict(new_input_fn("train", False))
        # print('Writing Predictions to {}'.format(pred_dir))
        # tf.gfile.MakeDirs(pred_dir)
        # with tf.gfile.GFile("{}/train".format(pred_dir), 'w') as f:
        #     for pred in predictions:
        #         f.write(pred['classes'][0])
        #         f.write('\n')
        # Export the model.
        # 191204: exporting this way gives no means to specify the serving
        # outputs.
        cur_e.export_saved_model(export_dir, serving_input_receiver_fn)
Beispiel #11
0
def map_fun_v2(args, ctx):
    """Distributed training entry point: fit an AdaNet ensemble on MNIST
    inside a ps/worker TensorFlow cluster.

    Args:
        args: parsed command-line namespace; fields read here: rdma,
            batch_size, model, mode.
        ctx: cluster context (presumably a TensorFlowOnSpark TFNodeContext;
            confirm against the caller). Provides worker_num, job_name,
            task_index, cluster_spec, absolute_path() and
            start_cluster_server().

    Side effects:
        Writes checkpoints/summaries under args.model and a per-task "done"
        marker file used to synchronise shutdown across workers
        (workaround for tensorflow#21745).
    """
    from datetime import datetime
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    # NOTE(review): IMAGE_PIXELS and hidden_units are defined but unused in
    # this function (likely leftovers from an earlier non-AdaNet version).
    IMAGE_PIXELS = 28
    hidden_units = 128
    # Fix Random Seed
    RANDOM_SEED = 42

    # Loads MNIST on every node; assumes the dataset is cached or the node
    # has network access — TODO confirm for the target cluster.
    (x_train, y_train), (x_test,
                         y_test) = (tf.keras.datasets.mnist.load_data())

    FEATURES_KEY = "images"

    NUM_CLASSES = 10

    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE

    # 10-class classification head (digits 0-9).
    head = tf.contrib.estimator.multi_class_head(NUM_CLASSES,
                                                 loss_reduction=loss_reduction)

    feature_columns = [
        tf.feature_column.numeric_column(FEATURES_KEY, shape=[28, 28, 1])
    ]

    # Get TF cluster and server instances
    cluster, server = ctx.start_cluster_server(1, args.rdma)

    def generator(images, labels):
        """Returns a generator that returns image-label pairs."""
        def _gen():
            for image, label in zip(images, labels):
                yield image, label

        return _gen

    def preprocess_image(image, label):
        """Preprocesses an image for an `Estimator`."""
        # Scale uint8 pixels to [0, 1] and add a channel dimension.
        image = image / 255.
        image = tf.reshape(image, [28, 28, 1])
        features = {FEATURES_KEY: image}
        return features, label

    def input_fn(partition, training):
        """Generate an input_fn for the Estimator."""
        def _input_fn():
            if partition == "train":
                dataset = tf.data.Dataset.from_generator(
                    generator(x_train, y_train), (tf.float32, tf.int32),
                    ((28, 28), ()))
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator(x_test, y_test), (tf.float32, tf.int32),
                    ((28, 28), ()))

            if training:
                # Seeded shuffle keeps runs reproducible; repeat() makes the
                # stream infinite so max_steps controls training length.
                dataset = dataset.shuffle(10 * args.batch_size,
                                          seed=RANDOM_SEED).repeat()

            dataset = dataset.map(preprocess_image).batch(args.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels

        return _input_fn

    if job_name == "ps":
        # Parameter servers block here forever serving variables.
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default
        # NOTE: stdout on executors is usually not surfaced to the driver.
        message = ""
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            print("========= Start Training")
            LEARNING_RATE = 0.003
            TRAIN_STEPS = 5000
            BATCH_SIZE = 64
            ADANET_ITERATIONS = 2

            logdir = ctx.absolute_path(args.model)

            config = tf.estimator.RunConfig(save_checkpoints_steps=50000,
                                            save_summary_steps=50000,
                                            tf_random_seed=RANDOM_SEED,
                                            model_dir=logdir)

            # Baseline linear model kept for comparison:
            # estimator = tf.estimator.LinearClassifier(
            #     feature_columns=feature_columns,
            #     n_classes=NUM_CLASSES,
            #     optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            #     loss_reduction=loss_reduction,
            #     config=config
            # )

            # Each AdaNet iteration trains candidate subnetworks for
            # TRAIN_STEPS // ADANET_ITERATIONS steps before selecting one.
            estimator = adanet.Estimator(
                head=head,
                subnetwork_generator=simple_dnn.Generator(
                    feature_columns=feature_columns,
                    optimizer=tf.train.RMSPropOptimizer(
                        learning_rate=LEARNING_RATE),
                    seed=RANDOM_SEED),
                max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
                evaluator=adanet.Evaluator(input_fn=input_fn("train",
                                                             training=False),
                                           steps=None),
                config=config)

            # NOTE(review): in distributed mode train_and_evaluate may return
            # None on non-chief workers; the unpacking below assumes a
            # (metrics, export_results) tuple — confirm for this TF version.
            results, _ = tf.estimator.train_and_evaluate(
                estimator,
                train_spec=tf.estimator.TrainSpec(input_fn=input_fn(
                    "train", training=True),
                                                  max_steps=TRAIN_STEPS),
                eval_spec=tf.estimator.EvalSpec(input_fn=input_fn(
                    "test", training=False),
                                                steps=None))

            print("Accuracy:", results["accuracy"])
            print("Loss:", results["average_loss"])
            message = "Accuracy: {}; Loss: {}".format(results["accuracy"],
                                                      results["average_loss"])
            print("==============================================")

        print("{} stopping MonitoredTrainingSession".format(
            datetime.now().isoformat()))

        # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745
        # wait for all other nodes to complete (via done files)
        done_dir = "{}/{}/done".format(ctx.absolute_path(args.model),
                                       args.mode)
        print("Writing done file to: {}".format(done_dir))
        tf.gfile.MakeDirs(done_dir)
        with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index),
                            'w') as done_file:
            done_file.write("done")
            done_file.write(message)

        # Poll for up to 60 seconds until every worker has written its done
        # file, then exit.
        for i in range(60):
            if len(tf.gfile.ListDirectory(done_dir)) < len(
                    ctx.cluster_spec['worker']):
                print("{} Waiting for other nodes {}".format(
                    datetime.now().isoformat(), i))
                time.sleep(1)
            else:
                print("{} All nodes done".format(datetime.now().isoformat()))
                break
Beispiel #12
0
def map_fun(args, ctx):
    """Distributed training entry point: fit an AdaNet binary classifier on
    sparse libsvm-formatted CTR data inside a ps/worker TensorFlow cluster.

    Args:
        args: parsed command-line namespace; fields read here: input_dim,
            batch_size, log_dir, export_dir, data_dir, format, epochs,
            rdma, mode.
        ctx: cluster context (presumably a TensorFlowOnSpark TFNodeContext;
            confirm against the caller). Provides worker_num, job_name,
            task_index, cluster_spec, absolute_path() and
            start_cluster_server().

    Side effects:
        Writes checkpoints/summaries under args.log_dir and a per-task
        "done" marker file used to synchronise shutdown across workers
        (workaround for tensorflow#21745).
    """
    from datetime import datetime
    import tensorflow as tf
    import os
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    message = 'worker_num: {0}, job_name: {1}, task_index: {2}'.format(
        worker_num, job_name, task_index)
    # FIX: arch_result used to be assigned only inside the worker branch but
    # is written to the done file unconditionally at the end of this
    # function, which raised a NameError for any role other than "worker"
    # ("ps" blocks in server.join(), but e.g. an unexpected role would
    # crash). Give it a safe default here.
    arch_result = ""

    input_dim = int(args.input_dim)
    batch_size = args.batch_size

    # Fix random seed for reproducibility.
    RANDOM_SEED = 42

    FEATURES_KEY = "ctr"

    NUM_CLASSES = 2

    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE

    # head = tf.contrib.estimator.multi_class_head(NUM_CLASSES, loss_reduction=loss_reduction)
    head = tf.contrib.estimator.binary_classification_head(
        loss_reduction=loss_reduction)

    # numeric_column does not accept SparseTensor input, so the parser below
    # densifies each sparse row before handing it to the feature column.
    feature_columns = [
        tf.feature_column.numeric_column(FEATURES_KEY, shape=[input_dim])
    ]

    log_dir = ctx.absolute_path(args.log_dir)
    export_dir = ctx.absolute_path(args.export_dir)
    print("tensorflow log path: {0}".format(log_dir))
    print("tensorflow export path: {0}".format(export_dir))

    # Get TF cluster and server instances.
    cluster, server = ctx.start_cluster_server(1, args.rdma)

    def generator(ln):
        """Parse one libsvm line ("label idx:val idx:val ...") into a
        (features, label) pair with a dense [1, input_dim] feature row."""
        splits = tf.string_split([ln], delimiter=' ')
        label = splits.values[0]
        label = tf.string_to_number(label, tf.float64)
        # Binarise the label: anything >= 1.0 is the positive class.
        label = tf.cond(label >= 1.0,
                        lambda: tf.constant(1, shape=[1], dtype=tf.float32),
                        lambda: tf.constant(0, shape=[1], dtype=tf.float32))

        # Split the "idx:val" tokens; even positions are (1-based) column
        # indices, odd positions are feature values.
        col_val = tf.string_split(splits.values[1::], delimiter=':')
        col = tf.string_to_number(col_val.values[0::2], tf.int64) - 1

        vals = col_val.values[1::2]
        vals = tf.string_to_number(vals, tf.float32)

        # Drop features whose index falls outside the configured input_dim.
        vals = tf.boolean_mask(vals, col < input_dim)
        col = tf.boolean_mask(col, col < input_dim)

        row = tf.cast(tf.fill(tf.shape(col), 0), tf.int64, name='row_cast')
        row_col = tf.transpose(tf.stack([row, col]), name='row_col_transpose')

        sparse = tf.SparseTensor(row_col, vals, (1, input_dim))

        # Densify for numeric_column.
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}

        return features, label

    def input_fn(partition):
        """Generate an input_fn for the Estimator."""

        def _input_fn():
            num_workers = len(ctx.cluster_spec['worker'])

            data_dir = ctx.absolute_path(args.data_dir)
            file_pattern = os.path.join(data_dir, 'part-*')
            ds = tf.data.Dataset.list_files(file_pattern)
            # Each worker reads a disjoint shard of the input files.
            ds = ds.shard(num_workers, task_index).repeat(args.epochs)

            if args.format == 'libsvm':
                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                parse_fn = generator
            else:
                # FIX: an unsupported format previously fell through and
                # raised a confusing NameError on parse_fn below; fail fast
                # with an explicit error instead.
                raise ValueError(
                    "unsupported input format: {}".format(args.format))

            if partition == "train":
                ds = ds.map(parse_fn,
                            num_parallel_calls=5).shuffle(batch_size * 5)
            else:
                ds = ds.map(parse_fn, num_parallel_calls=5)

            # Drop the last partial batch so every batch has a static shape.
            ds = ds.apply(
                tf.contrib.data.batch_and_drop_remainder(
                    batch_size)).prefetch(100)
            iterator = ds.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels

        return _input_fn

    if job_name == "ps":
        # Parameter servers block here forever serving variables.
        server.join()
    elif job_name == "worker":
        # Assign ops to the local worker by default.
        # NOTE: stdout on executors is usually not surfaced to the driver.
        message = ""
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
            print("========= Start Training")
            LEARNING_RATE = 0.003
            TRAIN_STEPS = 1000
            ADANET_ITERATIONS = 2

            # So far this performs worse than the linear baseline.
            logdir = ctx.absolute_path(args.log_dir)

            config = tf.estimator.RunConfig(
                save_checkpoints_steps=50000,
                save_summary_steps=50000,
                tf_random_seed=RANDOM_SEED,
                model_dir=logdir
            )

            # Baseline linear model kept for comparison:
            # estimator = tf.estimator.LinearClassifier(
            #     feature_columns=feature_columns,
            #     n_classes=NUM_CLASSES,
            #     optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            #     loss_reduction=loss_reduction,
            #     config=config
            # )

            # AdaNet over simple DNN subnetworks; each of the
            # ADANET_ITERATIONS iterations trains candidates for
            # TRAIN_STEPS // ADANET_ITERATIONS steps.
            estimator = adanet.Estimator(
                head=head,
                subnetwork_generator=simple_dnn.Generator(
                    layer_size=128,
                    initial_num_layers=3,
                    dropout=0.2,
                    feature_columns=feature_columns,
                    optimizer=tf.train.RMSPropOptimizer(
                        learning_rate=LEARNING_RATE),
                    seed=RANDOM_SEED),
                max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
                evaluator=adanet.Evaluator(
                    input_fn=input_fn("train"),
                    steps=None
                ),
                config=config
            )

            # NOTE(review): in distributed mode train_and_evaluate may return
            # None on non-chief workers; the unpacking below assumes a
            # (metrics, export_results) tuple — confirm for this TF version.
            results, _ = tf.estimator.train_and_evaluate(
                estimator,
                train_spec=tf.estimator.TrainSpec(
                    input_fn=input_fn("train"),
                    max_steps=TRAIN_STEPS),
                eval_spec=tf.estimator.EvalSpec(
                    input_fn=input_fn("test"),
                    steps=None)
            )

            print("Accuracy:", results["accuracy"])
            print("Loss:", results["average_loss"])
            message = "Accuracy: {}; Loss: {}".format(
                results["accuracy"], results["average_loss"])
            # Decode the ensemble-architecture summary so it can be written
            # into the done file alongside the metrics.
            # NOTE(review): string_val[0] is bytes in Python 3 — writing it
            # to a text-mode GFile may need a .decode(); confirm runtime.
            arch = results["architecture/adanet/ensembles"]
            summary_proto = tf.summary.Summary.FromString(arch)
            arch_result = summary_proto.value[0].tensor.string_val[0]
            print("==============================================")


    print("{} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))

    # WORKAROUND for https://github.com/tensorflow/tensorflow/issues/21745
    # wait for all other nodes to complete (via done files)
    done_dir = "{}/{}/done".format(ctx.absolute_path(args.log_dir), args.mode)
    print("Writing done file to: {}".format(done_dir))
    tf.gfile.MakeDirs(done_dir)
    with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as done_file:
        done_file.write(message)
        done_file.write(arch_result)

    # Poll for up to 30 seconds until every worker has written its done
    # file, then exit.
    for i in range(30):
        if len(tf.gfile.ListDirectory(done_dir)) < len(ctx.cluster_spec['worker']):
            print("{} Waiting for other nodes {}".format(datetime.now().isoformat(), i))
            time.sleep(1)
        else:
            print("{} All nodes done".format(datetime.now().isoformat()))
            break
Beispiel #13
0
def dnn_ada():
    """Run one full AdaNet-DNN train/evaluate cycle on Criteo.

    Trains an AdaNet ensemble of simple DNN subnetworks, then writes a
    metrics report, the train/test datasets, and per-example predictions
    under RESULT_DIR. Relies on module-level globals: LR, LS, TRAIN_STEPS,
    ADANET_ITERATIONS, BATCH_SIZE, RANDOM_SEED, LOG_DIR, RESULT_DIR, head,
    feature_columns, input_fn, train, test, time_str.
    """

    def _export_predictions(pred_iter, path_prefix):
        # One stringified prediction dict per line.
        with open('{}.txt'.format(path_prefix), 'w') as out:
            for pred in pred_iter:
                out.write(str(pred))
                out.write('\n')

    print("==============================================")
    t_begin = datetime.datetime.now()
    print("Start Train Adanet with [DNN Model] on Criteo at %s" % time_str(t_begin))
    print("- - - - - - - - - - - - - - - - - - - - - - - -")

    # Hyper-parameters follow the paper's settings.
    learning_rate = LR

    # All artifacts for this run share one timestamp.
    stamp = time_str(t_begin)
    model_dir = os.path.join(LOG_DIR, "dnn_%s" % stamp)
    result_file = os.path.join(RESULT_DIR, "dnn_%s" % stamp)
    valid_file = os.path.join(RESULT_DIR, "valid_%s" % stamp)
    test_file = os.path.join(RESULT_DIR, "test_%s" % stamp)
    tpred_file = os.path.join(RESULT_DIR, "tpred_%s" % stamp)
    vpred_file = os.path.join(RESULT_DIR, "vpred_%s" % stamp)

    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=50000,
        save_summary_steps=50000,
        tf_random_seed=RANDOM_SEED,
        model_dir=model_dir)

    # layer size 125 256 512
    estimator = adanet.Estimator(
        head=head,
        subnetwork_generator=simple_dnn.Generator(
            feature_columns=feature_columns,
            layer_size=LS,
            optimizer=tf.train.RMSPropOptimizer(learning_rate=learning_rate),
            seed=RANDOM_SEED),
        max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
        evaluator=adanet.Evaluator(
            input_fn=input_fn("train"),
            steps=None),
        config=run_config)

    results, _ = tf.estimator.train_and_evaluate(
        estimator,
        train_spec=tf.estimator.TrainSpec(
            input_fn=input_fn("train"),
            max_steps=TRAIN_STEPS),
        eval_spec=tf.estimator.EvalSpec(
            input_fn=input_fn("test"),
            steps=None))

    print("Accuracy:", results["accuracy"])
    print("AUC", results["auc"])
    print("Loss:", results["average_loss"])

    # Re-evaluate on both partitions for the written report.
    metrics_train = estimator.evaluate(input_fn=input_fn("train"))
    metrics_test = estimator.evaluate(input_fn=input_fn("test"))

    t_end = datetime.datetime.now()
    print("Training end at %s" % time_str(t_end))
    print("Time Spend %s" % str(t_end - t_begin))
    print("==============================================")

    # Assemble and dump the run report in one pass.
    report_lines = [
        'Train Configs:\n',
        '[Layer Size] {}\n'.format(LS),
        '[Learning Rate] {}\n'.format(LR),
        '[BATCH SIZE] {}\n'.format(BATCH_SIZE),
        '[Train Step] {}\n'.format(TRAIN_STEPS),
        '[Adanet Iteration] {}\n'.format(ADANET_ITERATIONS),
        '\nResults:\n',
        '[Accurary] {}\n'.format(results["accuracy"]),
        '[AUC] {}\n'.format(results["auc"]),
        '[Loss] {}\n'.format(results["average_loss"]),
        '[Time Spend] {}\n'.format(str(t_end - t_begin)),
        '[Train Spec] {}\n'.format(str(metrics_train)),
        '[Test Spec] {}\n'.format(str(metrics_test)),
    ]
    with open('{}.txt'.format(result_file), 'w') as report:
        report.writelines(report_lines)

    # Dump the raw datasets used for this run.
    print("export test data")
    test.to_csv('{}.txt'.format(test_file))
    print("export train data")
    train.to_csv('{}.txt'.format(valid_file))

    # Export per-example predictions for the test and validation partitions.
    _export_predictions(estimator.predict(input_fn=input_fn("test")),
                        tpred_file)
    _export_predictions(estimator.predict(input_fn=input_fn("valid")),
                        vpred_file)