Example #1
		def run_eval(steps):
			import _pickle as pkl
			eval_features = tf_data_utils.eval_input_fn(
										parse_folder(FLAGS.dev_file),
										_decode_record, name_to_features, params)
			eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
			sess.run(tf.local_variables_initializer())
			eval_final_dict = eval_fn(eval_dict)
			if hvd.rank() == 0:
				pkl.dump(eval_final_dict, open(FLAGS.model_output+"/eval_dict_{}.pkl".format(steps), "wb"))
			return eval_final_dict
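
run_eval dumps one pickle per evaluation pass, and only from rank 0, so other Horovod workers never write. Reading such a dump back for inspection is short (a sketch; the path below is illustrative and depends on FLAGS.model_output and the step passed in):

    import _pickle as pkl

    with open("model_output/eval_dict_1000.pkl", "rb") as f:  # hypothetical path
        eval_dict = pkl.load(f)
    print(sorted(eval_dict.keys()))  # metric names collected by eval_fn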
Example #2
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        for name in ["input_ids", "input_mask", "segment_ids"]:
            example[name] = tf.reshape(example[name], [-1, max_seq_length])
        return example

    params = Bunch({})
    params.epoch = FLAGS.epoch
    params.batch_size = FLAGS.batch_size
    train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
                                                  _decode_record,
                                                  name_to_features, params)
    eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file, _decode_record,
                                                name_to_features, params)

    [train_op, train_loss, train_per_example_loss,
     train_logits] = model_train_fn(train_features, [],
                                    tf.estimator.ModeKeys.TRAIN)
    [_, eval_loss, eval_per_example_loss,
     eval_logits] = model_eval_fn(eval_features, [],
                                  tf.estimator.ModeKeys.EVAL)
    result = metric_fn(eval_features, eval_logits, eval_loss)

    model_io_fn.set_saver()

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
def train_eval_fn(FLAGS, init_checkpoint, train_file, dev_file, checkpoint_dir,
                  **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        config = model_config_parser(FLAGS)

        train_size = int(FLAGS.train_size)
        init_lr = FLAGS.init_lr

        distillation_config = Bunch(
            json.load(tf.gfile.Open(FLAGS.multi_task_config)))

        if FLAGS.use_tpu:
            warmup_ratio = config.get('warmup', 0.1)

            num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)

            num_warmup_steps = int(num_train_steps * warmup_ratio)

            print('==num warmup steps==', num_warmup_steps)

            print(" model type {}".format(FLAGS.model_type))

            print(num_train_steps, num_warmup_steps, "=============",
                  kargs.get('num_gpus', 1), '==number of gpus==')
            tf.logging.info("***** Running evaluation *****")
            tf.logging.info("***** train steps : %d", num_train_steps)
            max_eval_steps = int(int(FLAGS.eval_size) / FLAGS.batch_size)

            clip_norm_scale = 1.0
            lr_scale = 1.0
            lr = init_lr

            opt_config = Bunch({
                "init_lr": lr,
                "num_train_steps": num_train_steps,
                "num_warmup_steps": num_warmup_steps,
                "train_op": kargs.get("train_op", "adam"),
                "decay": kargs.get("decay", "no"),
                "warmup": kargs.get("warmup", "no"),
                "clip_norm": config.get("clip_norm", 1.0),
                "grad_clip": config.get("grad_clip", "global_norm"),
                "use_tpu": 1
            })

        else:
            warmup_ratio = config.get('warmup', 0.1)
            worker_count = kargs.get('worker_count', 1)
            task_index = kargs.get('task_index', 0)
            is_chief = kargs.get('is_chief', 0)

            if FLAGS.if_shard == "0":
                train_size = FLAGS.train_size
                epoch = int(FLAGS.epoch / worker_count)
            elif FLAGS.if_shard == "1":
                print("==number of gpus==", kargs.get('num_gpus', 1))
                train_size = int(FLAGS.train_size / worker_count /
                                 kargs.get('num_gpus', 1))
                # train_size = int(FLAGS.train_size)
                epoch = FLAGS.epoch
            else:
                train_size = int(FLAGS.train_size / worker_count)
                epoch = FLAGS.epoch

            num_train_steps = int(train_size / FLAGS.batch_size * epoch)
            if config.get('ln_type', 'postln') == 'postln':
                num_warmup_steps = int(num_train_steps * warmup_ratio)
            elif config.get('ln_type', 'postln') == 'preln':
                # pre-LN models train stably without warmup
                num_warmup_steps = 0
            else:
                num_warmup_steps = int(num_train_steps * warmup_ratio)
            print('==num warmup steps==', num_warmup_steps)

            num_storage_steps = min(
                [int(train_size / FLAGS.batch_size), 10000])
            if num_storage_steps <= 100:
                num_storage_steps = 500

            num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

            print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".format(
                num_train_steps, num_eval_steps, num_storage_steps))

            print(" model type {}".format(FLAGS.model_type))

            print(num_train_steps, num_warmup_steps, "=============",
                  kargs.get('num_gpus', 1), '==number of gpus==')

            if worker_count * kargs.get("num_gpus", 1) >= 2:
                clip_norm_scale = 1.0
                lr_scale = 0.8
            else:
                clip_norm_scale = 1.0
                lr_scale = 1.0
            lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
            if lr >= 1e-3:
                lr = 1e-3
            print('==init lr==', lr)
            if FLAGS.opt_type == "hvd" and hvd:
                checkpoint_dir = checkpoint_dir if task_index == 0 else None
            elif FLAGS.opt_type == "all_reduce":
                checkpoint_dir = checkpoint_dir
            elif FLAGS.opt_type == "collective_reduce":
                checkpoint_dir = checkpoint_dir if task_index == 0 else None
            elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
                checkpoint_dir = checkpoint_dir if task_index == 0 else None

            opt_config = Bunch({
                "init_lr": lr,
                "num_train_steps": num_train_steps,
                "num_warmup_steps": num_warmup_steps,
                "worker_count": worker_count,
                "gpu_count": worker_count * kargs.get("num_gpus", 1),
                "opt_type": FLAGS.opt_type,
                "is_chief": is_chief,
                "train_op": kargs.get("train_op", "adam"),
                "decay": kargs.get("decay", "no"),
                "warmup": kargs.get("warmup", "no"),
                "clip_norm": config.get("clip_norm", 1.0),
                "grad_clip": config.get("grad_clip", "global_norm"),
                "epoch": FLAGS.epoch,
                "strategy": FLAGS.distribution_strategy,
                "use_tpu": 0
            })

        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes

        model_config_dict = {}
        num_labels_dict = {}
        init_checkpoint_dict = {}
        load_pretrained_dict = {}
        exclude_scope_dict = {}
        not_storage_params_dict = {}
        target_dict = {}

        for task_type in FLAGS.multi_task_type.split(","):
            print("==task type==", task_type)
            model_config_dict[task_type] = model_config_parser(
                Bunch(distillation_config[task_type]))
            print(task_type, distillation_config[task_type],
                  '=====task model config======')
            num_labels_dict[task_type] = distillation_config[task_type][
                "num_labels"]
            init_checkpoint_dict[task_type] = os.path.join(
                FLAGS.buckets,
                distillation_config[task_type]["init_checkpoint"])
            load_pretrained_dict[task_type] = distillation_config[task_type][
                "load_pretrained"]
            exclude_scope_dict[task_type] = distillation_config[task_type][
                "exclude_scope"]
            not_storage_params_dict[task_type] = distillation_config[
                task_type]["not_storage_params"]
            target_dict[task_type] = distillation_config[task_type]["target"]

        tf.logging.info("***** use tpu ***** %s", str(FLAGS.use_tpu))
        model_fn = classifier_model_fn_builder(
            model_config_dict,
            num_labels_dict,
            init_checkpoint_dict,
            load_pretrained_dict,
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            use_tpu=FLAGS.use_tpu,
            **kargs)

        if FLAGS.use_tpu:
            from data_generator import tf_data_utils
            estimator = tf.contrib.tpu.TPUEstimator(
                use_tpu=True,
                model_fn=model_fn,
                config=kargs.get('run_config', {}),
                train_batch_size=FLAGS.batch_size,
                eval_batch_size=FLAGS.batch_size)
            tf.logging.info("****** do train ******* %s", str(FLAGS.do_train))
            if FLAGS.do_train:
                tf.logging.info("***** Running training *****")
                tf.logging.info("  Batch size = %d", FLAGS.batch_size)
                input_features = tf_data_utils.electra_input_fn_builder(
                    train_file,
                    FLAGS.max_length,
                    FLAGS.max_predictions_per_seq,
                    True,
                    num_cpu_threads=4)
                estimator.train(input_fn=input_features,
                                max_steps=num_train_steps)
            else:
                tf.logging.info("***** Running evaluation *****")
                tf.logging.info("  Batch size = %d", FLAGS.batch_size)
                eval_input_fn = tf_data_utils.electra_input_fn_builder(
                    input_files=dev_file,
                    max_seq_length=FLAGS.max_length,
                    max_predictions_per_seq=FLAGS.max_predictions_per_seq,
                    is_training=False)
                tf.logging.info("***** Begining Running evaluation *****")
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=max_eval_steps)
                output_eval_file = os.path.join(checkpoint_dir,
                                                "eval_results.txt")
                with tf.gfile.GFile(output_eval_file, "w") as writer:
                    tf.logging.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        tf.logging.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
        else:
            from data_generator import distributed_tf_data_utils as tf_data_utils
            name_to_features = {
                "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "input_ori_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
                "masked_lm_positions":
                    tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_ids":
                    tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.int64),
                "masked_lm_weights":
                    tf.FixedLenFeature([FLAGS.max_predictions_per_seq], tf.float32),
                "next_sentence_labels": tf.FixedLenFeature([], tf.int64),
            }

            def _decode_record(record, name_to_features):
                """Decodes a record to a TensorFlow example.
				"""
                example = tf.parse_single_example(record, name_to_features)

                # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
                # So cast all int64 to int32.
                for name in list(example.keys()):
                    t = example[name]
                    if t.dtype == tf.int64:
                        t = tf.to_int32(t)
                    example[name] = t

                return example

            def _decode_batch_record(record, name_to_features):
                example = tf.parse_example(record, name_to_features)
                # for name in list(example.keys()):
                # 	t = example[name]
                # 	if t.dtype == tf.int64:
                # 		t = tf.to_int32(t)
                # 	example[name] = t

                return example
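            # Note: _decode_record decodes one serialized tf.Example at a time via
            # tf.parse_single_example, while _decode_batch_record lets the dataset
            # batch the raw records first and decodes the whole batch with a single
            # tf.parse_example call, which amortizes parsing overhead per record.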

            params = Bunch({})
            params.epoch = FLAGS.epoch
            params.batch_size = FLAGS.batch_size

            if kargs.get("run_config", None):
                if kargs.get("parse_type", "parse_single") == "parse_single":
                    train_features = lambda: tf_data_utils.all_reduce_train_input_fn(
                        train_file,
                        _decode_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                    eval_features = lambda: tf_data_utils.all_reduce_eval_input_fn(
                        dev_file,
                        _decode_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                elif kargs.get("parse_type", "parse_single") == "parse_batch":
                    print("==apply parse example==")
                    train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
                        train_file,
                        _decode_batch_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)
                    eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
                        dev_file,
                        _decode_batch_record,
                        name_to_features,
                        params,
                        if_shard=FLAGS.if_shard,
                        worker_count=worker_count,
                        task_index=task_index)

            else:
                train_features = lambda: tf_data_utils.train_input_fn(
                    train_file,
                    _decode_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)

                eval_features = lambda: tf_data_utils.eval_input_fn(
                    dev_file,
                    _decode_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)

            train_hooks = []
            eval_hooks = []

            sess_config = tf.ConfigProto(allow_soft_placement=False,
                                         log_device_placement=False)
            if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
                print("==no need for hook==")
            elif FLAGS.opt_type == "pai_soar" and pai:
                print("no need for hook")
            elif FLAGS.opt_type == "hvd" and hvd:
                sess_config.gpu_options.allow_growth = True
                sess_config.gpu_options.visible_device_list = str(
                    hvd.local_rank())
                print("==no need fo hook==")
            else:
                print("==no need for hooks==")

            if kargs.get("run_config", None):
                run_config = kargs.get("run_config", None)
                run_config = run_config.replace(
                    save_checkpoints_steps=num_storage_steps)
                print("==run config==", run_config.save_checkpoints_steps)
            else:
                run_config = tf.estimator.RunConfig(
                    model_dir=checkpoint_dir,
                    save_checkpoints_steps=num_storage_steps,
                    session_config=sess_config)

            if kargs.get("profiler", "profiler") == "profiler":
                if checkpoint_dir:
                    hooks = tf.train.ProfilerHook(
                        save_steps=100,
                        save_secs=None,
                        output_dir=os.path.join(checkpoint_dir, "profiler"),
                    )
                    train_hooks.append(hooks)
                    print("==add profiler hooks==")

            model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                     model_dir=checkpoint_dir,
                                                     config=run_config)

            train_begin_time = time.time()
            tf.logging.info("==training distribution_strategy=={}".format(
                kargs.get("distribution_strategy", "MirroredStrategy")))
            if kargs.get("distribution_strategy",
                         "MirroredStrategy") == "MirroredStrategy":
                print("==apply single machine multi-card training==")

                train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                    max_steps=num_train_steps)

                eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                                  steps=num_eval_steps)

                model_estimator.train(input_fn=train_features,
                                      max_steps=num_train_steps,
                                      hooks=train_hooks)
                # tf.estimator.train(model_estimator, train_spec)

                train_end_time = time.time()
                print("==training time==", train_end_time - train_begin_time)
                tf.logging.info("==training time=={}".format(train_end_time -
                                                             train_begin_time))
                eval_results = model_estimator.evaluate(input_fn=eval_features,
                                                        steps=num_eval_steps)
                print(eval_results)

            elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                    "ParameterServerStrategy", "CollectiveAllReduceStrategy"
            ]:
                print("==apply multi-machine machine multi-card training==")
                try:
                    print(os.environ['TF_CONFIG'], "==tf_run_config==")
                except KeyError:
                    print("==no TF_CONFIG found==")
                train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                    max_steps=num_train_steps)

                eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                                  steps=num_eval_steps)

                # tf.estimator.train(model_estimator, train_spec) # tf 1.12 doesn't need evaluate

                tf.estimator.train_and_evaluate(model_estimator, train_spec,
                                                eval_spec)
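
The snippet above buries the core Estimator workflow under configuration plumbing. Below is a self-contained sketch of the same tf.estimator.train_and_evaluate pattern (TF 1.x API); the toy model_fn and input_fn are stand-ins for the real ones, not the repository's code:

    import numpy as np
    import tensorflow as tf

    def input_fn():
        # Toy dataset: 4-dim features, binary labels.
        x = np.random.randn(64, 4).astype(np.float32)
        y = (x.sum(axis=1) > 0).astype(np.int32)
        return tf.data.Dataset.from_tensor_slices(({"x": x}, y)).batch(8)

    def model_fn(features, labels, mode, params):
        logits = tf.layers.dense(features["x"], 2)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer(1e-3).minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
        return tf.estimator.EstimatorSpec(mode, loss=loss)

    estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="/tmp/toy_ckpt")
    train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=100)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn, steps=10)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

train_and_evaluate also honors TF_CONFIG for the multi-worker strategies the example selects, which is why the distributed branch prefers it over calling train() and evaluate() separately.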
Example #4
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        for name in ["input_ids", "input_mask", "segment_ids"]:
            example[name] = tf.reshape(example[name], [-1, max_seq_length])
        return example

    params = Bunch({})
    params.epoch = 5
    params.batch_size = 6
    train_features = tf_data_utils.train_input_fn(
        "/data/xuht/concat/data/train.tfrecords", _decode_record,
        name_to_features, params)
    eval_features = tf_data_utils.eval_input_fn(
        "/data/xuht/concat/data/test.tfrecords", _decode_record,
        name_to_features, params)

    [train_op, train_loss, train_per_example_loss,
     train_logits] = model_train_fn(train_features, [],
                                    tf.estimator.ModeKeys.TRAIN)
    [_, eval_loss, eval_per_example_loss,
     eval_logits] = model_eval_fn(eval_features, [],
                                  tf.estimator.ModeKeys.EVAL)
    result = metric_fn(eval_features, eval_logits, eval_loss)

    model_io_fn.set_saver()

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
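
Every snippet here passes hyperparameters through a Bunch object with both dict-style and attribute-style access. The repository ships its own implementation; a minimal stand-in that matches the usage seen above (an assumption, not the actual class) is:

    class Bunch(dict):
        """dict subclass allowing params.epoch as well as params["epoch"]."""
        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError:
                raise AttributeError(name)
        def __setattr__(self, name, value):
            self[name] = value

    params = Bunch({})
    params.epoch = 5             # attribute write stores a plain dict key
    assert params["epoch"] == 5  # dict read sees the same value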
Example #5
def main(_):
    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json

        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session()

        config = json.load(open(FLAGS.config_file, "r"))

        student_config = json.load(open(FLAGS.student_config_file, "r"))

        student_config = Bunch(student_config)
        # student_config.use_one_hot_embeddings = True
        # student_config.scope = "student/bert"
        # student_config.dropout_prob = 0.1
        # student_config.label_type = "single_label"
        # student_config.init_checkpoint = FLAGS.student_init_checkpoint

        temperature = student_config.temperature
        distill_ratio = student_config.distill_ratio

        # json.dump(student_config, open(FLAGS.model_output+"/student_config.json", "w"))

        teacher_config = Bunch(config)
        teacher_config.use_one_hot_embeddings = True
        teacher_config.scope = "teacher/bert"
        teacher_config.dropout_prob = 0.1
        teacher_config.label_type = "single_label"
        teacher_config.init_checkpoint = FLAGS.teacher_init_checkpoint

        # json.dump(teacher_config, open(FLAGS.model_output+"/teacher_config.json", "w"))

        model_config_dict = {
            "student": student_config,
            "teacher": teacher_config
        }
        init_checkpoint_dict = {
            "student": FLAGS.student_init_checkpoint,
            "teacher": FLAGS.teacher_init_checkpoint
        }

        print("==student checkpoint=={}".format(FLAGS.student_init_checkpoint))

        num_train_steps = int(FLAGS.train_size / FLAGS.batch_size *
                              FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.1)

        num_storage_steps = int(FLAGS.train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": 1e-5,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps
        })

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        model_eval_fn = distillation.distillation_model_fn(
            model_config_dict=model_config_dict,
            num_labels=num_choice,
            init_checkpoint_dict=init_checkpoint_dict,
            model_reuse=None,
            load_pretrained={
                "teacher": True,
                "student": True
            },
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            student_input_name=["a", "b"],
            teacher_input_name=["a", "b"],
            unlabel_input_name=["ua", "ub"],
            temperature=temperature,
            exclude_scope_dict={
                "student": "",
                "teacher": "teacher"
            },
            not_storage_params=["adam_m", "adam_v"],
            distillation_weight={
                "label": distill_ratio,
                "unlabel": distill_ratio
            },
            if_distill_unlabeled=False)

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {
                "accuracy": accuracy,
                "loss": loss,
                "pred_label": pred_label,
                "label_ids": features["label_ids"],
                "pred_prob": prob
            }

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
			"""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size
        # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
        #                             _decode_record, name_to_features, params)
        # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
        #                             _decode_record, name_to_features, params)

        eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
                                                    _decode_record,
                                                    name_to_features, params)

        [_, eval_loss, eval_per_example_loss,
         eval_logits] = model_eval_fn(eval_features, [],
                                      tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        def eval_fn(result):
            i = 0
            total_accuracy = 0
            total_loss = 0.0
            pred_prob = []
            label, label_id = [], []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    total_loss += eval_result["loss"]
                    pred_prob.extend(eval_result["pred_prob"])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            f1 = f1_score(label_id, label, average="macro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy {} accuracy {} loss {} f1 {}".format(
                total_accuracy / i, accuracy, total_loss / i, f1))
            return accuracy, f1, pred_prob

        print("===========begin to eval============")
        accuracy, f1, label = eval_fn(result)
        print("==accuracy {} f1 {} size {}==".format(accuracy, f1, len(label)))
Example #6
def main(_):
    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():

        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                               do_lower_case=True)

        classifier_data_api = classifier_processor.PiarOrderProcessor()

        eval_examples = classifier_data_api.get_test_examples(
            FLAGS.eval_data_file, FLAGS.lang)
        print(len(eval_examples), eval_examples[0:10])

        label_id = json.load(open(FLAGS.label_id, "r"))

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        write_to_tfrecords.convert_pair_order_classifier_examples_to_features(
            eval_examples, label_id["label2id"], max_seq_length, tokenizer,
            FLAGS.output_file)

        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session()

        config = json.load(open(FLAGS.config_file, "r"))

        student_config = json.load(open(FLAGS.student_config_file, "r"))

        student_config = Bunch(student_config)
        # student_config.use_one_hot_embeddings = True
        # student_config.scope = "student/bert"
        # student_config.dropout_prob = 0.1
        # student_config.label_type = "single_label"
        # student_config.init_checkpoint = FLAGS.student_init_checkpoint

        temperature = student_config.temperature
        distill_ratio = student_config.distill_ratio

        # json.dump(student_config, open(FLAGS.model_output+"/student_config.json", "w"))

        teacher_config = Bunch(config)
        teacher_config.use_one_hot_embeddings = True
        teacher_config.scope = "teacher/bert"
        teacher_config.dropout_prob = 0.1
        teacher_config.label_type = "single_label"
        teacher_config.init_checkpoint = FLAGS.teacher_init_checkpoint

        # json.dump(teacher_config, open(FLAGS.model_output+"/teacher_config.json", "w"))

        model_config_dict = {
            "student": student_config,
            "teacher": teacher_config
        }
        init_checkpoint_dict = {
            "student": FLAGS.student_init_checkpoint,
            "teacher": FLAGS.teacher_init_checkpoint
        }

        num_train_steps = int(FLAGS.train_size / FLAGS.batch_size *
                              FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.1)

        num_storage_steps = int(FLAGS.train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": 1e-5,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps
        })

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        model_eval_fn = distillation.distillation_model_fn(
            model_config_dict=model_config_dict,
            num_labels=num_choice,
            init_checkpoint_dict=init_checkpoint_dict,
            model_reuse=None,
            load_pretrained={
                "teacher": True,
                "student": True
            },
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            student_input_name=["a", "b"],
            teacher_input_name=["a", "b"],
            unlabel_input_name=["ua", "ub"],
            temperature=temperature,
            exclude_scope_dict={
                "student": "",
                "teacher": "teacher"
            },
            not_storage_params=["adam_m", "adam_v"],
            distillation_weight={
                "label": distill_ratio,
                "unlabel": distill_ratio
            },
            if_distill_unlabeled=False)

        def metric_fn(features, logits):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.exp(tf.nn.log_softmax(logits))
            return {
                "pred_label": pred_label,
                "qas_id": features["qas_id"],
                "prob": prob
            }

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
            "qas_id": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
			"""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size
        # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
        #                             _decode_record, name_to_features, params)
        # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
        #                             _decode_record, name_to_features, params)

        eval_features = tf_data_utils.eval_input_fn(FLAGS.output_file,
                                                    _decode_record,
                                                    name_to_features, params)

        [_, eval_loss, eval_per_example_loss,
         eval_logits] = model_eval_fn(eval_features, [],
                                      tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        def eval_fn(result):
            i = 0
            pred_label, qas_id, prob = [], [], []
            while True:
                try:
                    eval_result = sess.run(result)
                    pred_label.extend(eval_result["pred_label"].tolist())
                    qas_id.extend(eval_result["qas_id"].tolist())
                    prob.extend(eval_result["prob"].tolist())
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            return pred_label, qas_id, prob

        print("===========begin to eval============")
        [pred_label, qas_id, prob] = eval_fn(result)
        result = dict(zip(qas_id, pred_label))

        print(FLAGS.result_file.split("."))
        tmp_output = FLAGS.result_file.split(".")[0] + ".json"
        print(tmp_output, "===temp output===")
        json.dump({
            "id": qas_id,
            "label": pred_label,
            "prob": prob
        }, open(tmp_output, "w"))

        print(len(result), "=====valid result======")

        import pandas as pd
        df = pd.read_csv(FLAGS.eval_data_file)

        output = {}
        for index in range(df.shape[0]):
            output[df.loc[index]["id"]] = ""

        final_output = []

        cnt = 0
        for key in output:
            if key in result:
                final_output.append({
                    "Id": key,
                    "Category": label_id["id2label"][str(result[key])]
                })
                cnt += 1
            else:
                final_output.append({"Id": key, "Category": "unrelated"})

        df_out = pd.DataFrame(final_output)
        df_out.to_csv(FLAGS.result_file)

        print(len(output), cnt, len(final_output),
              "======num of results from model==========")
Example #7
def main(_):

	hvd.init()

	sess_config = tf.ConfigProto()
	sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

	graph = tf.Graph()
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
	with graph.as_default():
		import json
		
		# config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))
		
		config = json.load(open(FLAGS.config_file, "r"))

		init_checkpoint = FLAGS.init_checkpoint
		print("===init checkoutpoint==={}".format(init_checkpoint))

		config = Bunch(config)
		config.use_one_hot_embeddings = True
		config.scope = "bert"
		config.dropout_prob = 0.1
		config.label_type = "single_label"
		config.lm_ratio = 0.0
		config.task_ratio = 1.0

		json.dump(config, open(FLAGS.model_output+"/config.json", "w"))

		init_lr = 1e-5

		if FLAGS.if_shard == "0":
			train_size = FLAGS.train_size
			epoch = int(FLAGS.epoch / hvd.size())
		elif FLAGS.if_shard == "1":
			train_size = int(FLAGS.train_size/hvd.size())
			epoch = FLAGS.epoch

		sess = tf.Session(config=sess_config)

		num_train_steps = int(
			train_size / FLAGS.batch_size * epoch)
		num_warmup_steps = int(num_train_steps * 0.1)

		num_storage_steps = int(train_size / FLAGS.batch_size)

		print(num_train_steps, num_warmup_steps, "=============")
		
		opt_config = Bunch({"init_lr":init_lr/(hvd.size()), 
							"num_train_steps":num_train_steps,
							"num_warmup_steps":num_warmup_steps})

		model_io_config = Bunch({"fix_lm":False})
		
		model_io_fn = model_io.ModelIO(model_io_config)

		optimizer_fn = optimizer.Optimizer(opt_config)
		
		num_choice = FLAGS.num_classes
		max_seq_length = FLAGS.max_length
		max_predictions_per_seq = FLAGS.max_predictions_per_seq

		model_train_fn = classifier_fn.classifier_model_fn_builder(config, 
												num_choice, init_checkpoint, 
												reuse=None, 
												load_pretrained=True,
												model_io_fn=model_io_fn,
												optimizer_fn=optimizer_fn,
												model_io_config=model_io_config, 
												opt_config=opt_config)


		model_eval_fn = classifier_fn.classifier_model_fn_builder(config, 
												num_choice, init_checkpoint, 
												reuse=True, 
												load_pretrained=True,
												model_io_fn=model_io_fn,
												optimizer_fn=optimizer_fn,
												model_io_config=model_io_config, 
												opt_config=opt_config)
		
		name_to_features = {
				"input_ids":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_mask":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"segment_ids":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"masked_lm_positions":
					tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
				"masked_lm_ids":
					tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
				"masked_lm_weights":
					tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
				"label_ids":
					tf.FixedLenFeature([], tf.int64),
				}

		def _decode_record(record, name_to_features):
			"""Decodes a record to a TensorFlow example.
			"""
			example = tf.parse_single_example(record, name_to_features)

			# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
			# So cast all int64 to int32.
			for name in list(example.keys()):
				t = example[name]
				if t.dtype == tf.int64:
					t = tf.to_int32(t)
				example[name] = t
			return example 

		params = Bunch({})
		params.epoch = epoch
		params.batch_size = FLAGS.batch_size

		def parse_folder(path):
			files = os.listdir(path)
			output = []
			for file_name in files:
				output.append(os.path.join(path, file_name))
			random.shuffle(output)
			return output

		train_features = tf_data_utils.train_input_fn(
									parse_folder(FLAGS.train_file),
									_decode_record, name_to_features, params)
		train_dict = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN)

		eval_features = tf_data_utils.eval_input_fn(
										parse_folder(FLAGS.dev_file),
										_decode_record, name_to_features, params)
		eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)

		model_io_fn.set_saver()
		
		init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
		sess.run(init_op)

		sess.run(hvd.broadcast_global_variables(0))
		
		def eval_fn(op_dict):
			i = 0
			eval_total_dict = {}
			while True:
				try:
					eval_result = sess.run(op_dict)
					for key in eval_result:
						if key in ["probabilities", "label_ids"]:
							eval_total_dict.setdefault(key, []).extend(eval_result[key])
					i += 1
				except tf.errors.OutOfRangeError:
					print("End of dataset")
					break

			for key in eval_result:
				if key not in ["probabilities", "label_ids"]:
					eval_total_dict[key] = eval_result[key]

			label_id = eval_total_dict["label_ids"]
			label = np.argmax(np.array(eval_total_dict["probabilities"]), axis=-1)

			macro_f1 = f1_score(label_id, label, average="macro")
			micro_f1 = f1_score(label_id, label, average="micro")
			accuracy = accuracy_score(label_id, label)

			print("test accuracy {} macro_f1 score {} micro_f1 {} masked_lm_accuracy {} sentence_f {}".format(accuracy, 
																		macro_f1,  micro_f1, 
																		eval_total_dict["masked_lm_accuracy"],
																		eval_total_dict["sentence_f"]))
			return eval_total_dict

		def run_eval(steps):
			import _pickle as pkl
			eval_features = tf_data_utils.eval_input_fn(
										parse_folder(FLAGS.dev_file),
										_decode_record, name_to_features, params)
			eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
			sess.run(tf.local_variables_initializer())
			eval_final_dict = eval_fn(eval_dict)
			if hvd.rank() == 0:
				pkl.dump(eval_final_dict, open(FLAGS.model_output+"/eval_dict_{}.pkl".format(steps), "wb"))
			return eval_final_dict
		
		def train_fn(op_dict):
			i = 0
			cnt = 0
			loss_dict = {}
			monitoring_train = []
			monitoring_eval = []
			while True:
				try:
					train_result = sess.run(op_dict)
					for key in train_result:
						if key == "train_op":
							continue
						else:
							if np.isnan(train_result[key]):
								print(key, "got nan loss")
								break
							else:
								if key in loss_dict:
									loss_dict[key] += train_result[key]
								else:
									loss_dict[key] = train_result[key]
					
					i += 1
					cnt += 1
					
					if np.mod(i, num_storage_steps) == 0:
						string = ""
						for key in loss_dict:
							tmp = key + " " + str(loss_dict[key]/cnt) + "\t"
							string += tmp
						print(string)
						monitoring_train.append(loss_dict)

						eval_final_dict = run_eval(int(i/num_storage_steps))
						monitoring_eval.append(eval_final_dict)

						for key in loss_dict:
							loss_dict[key] = 0.0
						if hvd.rank() == 0:
							model_io_fn.save_model(sess, FLAGS.model_output+"/model_{}.ckpt".format(int(i/num_storage_steps)))
							print("==successful storing model=={}".format(int(i/num_storage_steps)))
						cnt = 0

				except tf.errors.OutOfRangeError:
					if hvd.rank() == 0:
						import _pickle as pkl
						pkl.dump({"train":monitoring_train,
							"eval":monitoring_eval}, open(FLAGS.model_output+"/monitoring.pkl", "wb"))

					break
		print("===========begin to train============")        
		train_fn(train_dict)
		if hvd.rank() == 0:
			model_io_fn.save_model(sess, FLAGS.model_output+"/model.ckpt")
			print("===========begin to eval============")
			eval_finial_dict = run_eval("final")
Example #8
def main(_):

    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json

        # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))

        config = json.load(open(FLAGS.config_file, "r"))

        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkoutpoint==={}".format(init_checkpoint))

        # init_checkpoint = "/data/xuht/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
        # init_checkpoint = "/data/xuht/concat/model_1/oqmrc.ckpt"
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        # config.loss = "focal_loss"

        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session()

        num_train_steps = int(FLAGS.train_size / FLAGS.batch_size *
                              FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.1)

        num_storage_steps = int(FLAGS.train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": 1e-5,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps
        })

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        model_train_fn = bert_order_classifier.classifier_model_fn_builder_v1(
            config,
            num_choice,
            init_checkpoint,
            model_reuse=None,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            input_name=["a", "b"])

        model_eval_fn = bert_order_classifier.classifier_model_fn_builder_v1(
            config,
            num_choice,
            init_checkpoint,
            model_reuse=True,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            input_name=["a", "b"])

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {
                "accuracy": accuracy,
                "loss": loss,
                "pred_label": pred_label,
                "label_ids": features["label_ids"]
            }

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
            """
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size
        # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
        #                             _decode_record, name_to_features, params)
        # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
        #                             _decode_record, name_to_features, params)

        train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
                                                      _decode_record,
                                                      name_to_features, params)
        eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
                                                    _decode_record,
                                                    name_to_features, params)

        [train_op, train_loss, train_per_example_loss,
         train_logits] = model_train_fn(train_features, [],
                                        tf.estimator.ModeKeys.TRAIN)
        [_, eval_loss, eval_per_example_loss,
         eval_logits] = model_eval_fn(eval_features, [],
                                      tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)

        model_io_fn.set_saver()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        def eval_fn(result):
            i = 0
            total_accuracy = 0
            label, label_id = [], []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            f1 = f1_score(label_id, label, average="macro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy accuracy {} {} f1 {}".format(
                total_accuracy / i, accuracy, f1))
            return total_accuracy / i, f1

        def train_fn(op, loss):
            i = 0
            cnt = 0
            total_loss = 0.0
            while True:
                try:
                    [_, train_loss] = sess.run([op, loss])
                    total_loss += train_loss
                    i += 1
                    cnt += 1
                    if np.mod(i, num_storage_steps) == 0:
                        print(total_loss / cnt)
                        # model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc_{}.ckpt".format(int(i/8000)))
                        model_io_fn.save_model(
                            sess, FLAGS.model_output + "/oqmrc_{}.ckpt".format(
                                int(i / num_storage_steps)))

                        print("==successful storing model=={}".format(
                            int(i / num_storage_steps)))
                        total_loss = 0
                        cnt = 0
                except tf.errors.OutOfRangeError:
                    break

        print("===========begin to train============")
        train_fn(train_op, train_loss)
        print("===========begin to eval============")
        accuracy, f1 = eval_fn(result)
        print("==accuracy {} f1 {}==".format(accuracy, f1))
        # model_io_fn.save_model(sess, "/data/xuht/wsdm19/data/model_11_15_focal_loss/oqmrc.ckpt")

        model_io_fn.save_model(sess, FLAGS.model_output + "/oqmrc.ckpt")