Example #1
 def __init__(self,
              model_config,
              num_labels,
              init_checkpoint,
              load_pretrained=True,
              model_io_config=None,   # None instead of {} avoids shared mutable defaults
              opt_config=None,
              exclude_scope="",
              not_storage_params=None,
              target="a",
              label_lst=None,
              output_type="sess",
              **kargs):
     self.model_config = model_config
     self.num_labels = num_labels
     self.init_checkpoint = init_checkpoint
     self.load_pretrained = load_pretrained
      self.model_io_config = model_io_config if model_io_config is not None else {}
      self.opt_config = opt_config if opt_config is not None else {}
     self.exclude_scope = exclude_scope
      self.not_storage_params = not_storage_params if not_storage_params is not None else []
     self.target = target
     self.label_lst = label_lst
     self.output_type = output_type
     self.kargs = kargs
     self.model_io_fn = model_io.ModelIO(self.model_io_config)
     self.optimizer_fn = optimizer.Optimizer(self.opt_config)
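
A minimal sketch of how this constructor might be wired up, assuming the snippet belongs to a model-fn builder class; the class name ModelFnBuilder and the Bunch config container are assumptions, not shown in this example:

# Hypothetical instantiation; ModelFnBuilder is an assumed name for the class above.
model_config = Bunch({"scope": "bert", "dropout_prob": 0.1})
builder = ModelFnBuilder(model_config,
                         num_labels=2,
                         init_checkpoint="/path/to/bert.ckpt",
                         load_pretrained=True,
                         model_io_config=Bunch({"fix_lm": False}),
                         opt_config=Bunch({"init_lr": 2e-5,
                                           "num_train_steps": 10000}),
                         output_type="estimator")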
Example #2
    def model_fn(features, labels, mode, params):

        model_api = model_zoo(model_config)

        model = model_api(model_config,
                          features,
                          labels,
                          mode,
                          target,
                          reuse=tf.AUTO_REUSE,
                          **kargs)

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
        else:
            dropout_prob = 0.0

        if model_io_config.fix_lm:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        logits = global_discriminator_logits(model_config,
                                             model.get_pooled_output(),
                                             reuse=tf.AUTO_REUSE,
                                             **kargs)

        model_io_fn = model_io.ModelIO(model_io_config)

        pretrained_tvars = model_io_fn.get_params(
            model_config.scope, not_storage_params=not_storage_params)
        global_prediction_tvars = model_io_fn.get_params(
            "cls/seq_global", not_storage_params=not_storage_params)

        pretrained_tvars.extend(global_prediction_tvars)
        tvars = pretrained_tvars

        print('==discriminator parameters==', tvars)

        if load_pretrained == "yes":
            use_tpu = 1 if kargs.get('use_tpu', False) else 0
            scaffold_fn = model_io_fn.load_pretrained(
                tvars,
                init_checkpoint,
                exclude_scope=exclude_scope,
                use_tpu=use_tpu)
        else:
            scaffold_fn = None

        if mode == tf.estimator.ModeKeys.PREDICT:
            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={"probs": tf.nn.softmax(logits)},
                export_outputs={
                    "output":
                    tf.estimator.export.PredictOutput(
                        {"probs": tf.nn.softmax(logits)})
                })
            return estimator_spec
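
The PREDICT branch above exports the softmax probabilities under a single "output" signature, which an Estimator also registers as the default serving signature. A hedged TF 1.x client sketch, assuming an export built with a raw serving input receiver like the one in Example #3 (path and shapes are placeholders):

# Hypothetical client; the feed keys come from the serving input receiver.
import numpy as np
from tensorflow.contrib import predictor  # TF 1.x API

predict_fn = predictor.from_saved_model("/path/to/export_dir")
probs = predict_fn({
    "input_ids": np.zeros((1, 128), dtype=np.int32),
    "input_mask": np.zeros((1, 128), dtype=np.int32),
    "segment_ids": np.zeros((1, 128), dtype=np.int32),
    "label_ids": np.zeros((1,), dtype=np.int32)
})["probs"]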
Example #3
def export_model_v2(config):

	opt_config = Bunch({"init_lr":2e-5, "num_train_steps":1e30, "cycle":False})
	model_io_config = Bunch({"fix_lm":False})

	bert_config = json.load(open(config["config_file"], "r"))
	model_config = Bunch(bert_config)

	model_config.use_one_hot_embeddings = True
	model_config.scope = "bert"
	model_config.dropout_prob = 0.1
	model_config.label_type = "single_label"

	with open(config["label2id"], "r") as frobj:
		label_dict = json.load(frobj)

	num_classes = len(label_dict["id2label"])
	max_seq_length = config["max_length"]

	def serving_input_receiver_fn():
		label_ids = tf.placeholder(tf.int32, [None], name='label_ids')

		input_ids = tf.placeholder(tf.int32, [None, max_seq_length], name='input_ids')
		input_mask = tf.placeholder(tf.int32, [None, max_seq_length], name='input_mask')
		segment_ids = tf.placeholder(tf.int32, [None, max_seq_length], name='segment_ids')

		input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
			'label_ids': label_ids,
			'input_ids': input_ids,
			'input_mask': input_mask,
			'segment_ids': segment_ids
		})()
		return input_fn

	model_io_fn = model_io.ModelIO(model_io_config)

	model_fn = bert_classifier_estimator.classifier_model_fn_builder(
									model_config, 
									num_classes, 
									config["init_checkpoint"], 
									reuse=None, 
									load_pretrained=True,
									model_io_fn=model_io_fn,
									model_io_config=model_io_config, 
									opt_config=opt_config)

	estimator = tf.estimator.Estimator(
				model_fn=model_fn,
				model_dir=config["model_dir"])

	export_dir = estimator.export_savedmodel(config["export_path"], 
									serving_input_receiver_fn,
									checkpoint_path=config["init_checkpoint"])

	print("===Succeeded in exporting saved model==={}".format(export_dir))
Example #4
    def model_fn(features, labels, mode):

        model_api = model_zoo(model_config)

        model = model_api(model_config,
                          features,
                          labels,
                          mode,
                          target,
                          reuse=model_reuse)

        label_ids = features["label_ids"]

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
        else:
            dropout_prob = 0.0

        if model_io_config.fix_lm:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        with tf.variable_scope(scope):
            (loss, per_example_loss,
             logits) = classifier.classifier(model_config,
                                             model.get_pooled_output(),
                                             num_labels, label_ids,
                                             dropout_prob)

        model_io_fn = model_io.ModelIO(model_io_config)

        tvars = model_io_fn.get_params(model_config.scope,
                                       not_storage_params=not_storage_params)

        try:
            params_size = model_io_fn.count_params(model_config.scope)
            print("==total params==", params_size)
        except Exception:
            print("==not count params==")
        if load_pretrained == "yes":
            model_io_fn.load_pretrained(tvars,
                                        init_checkpoint,
                                        exclude_scope='teacher')
        return_dict = {
            "loss": loss,
            "logits": logits,
            "tvars": tvars,
            "model": model,
            "per_example_loss": per_example_loss
        }
        return return_dict
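
Unlike Example #2, this model_fn returns a plain dict of tensors rather than an EstimatorSpec, which lets an outer model_fn compose several sub-models; a minimal sketch of that pattern, with student_model_fn/teacher_model_fn as assumed stand-ins for builders like the one above:

# Hypothetical composition of two dict-returning model_fns (see the
# distillation examples below for the real student/teacher variant).
def combined_model_fn(features, labels, mode, params):
    st_dict = student_model_fn(features, labels, mode)  # assumed builder output
    ta_dict = teacher_model_fn(features, labels, mode)  # assumed builder output
    total_loss = st_dict["loss"]  # distillation terms from ta_dict would be added here
    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss)  # TRAIN also needs a train_op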
Example #5
	def model_fn(features, labels, mode, params):

		model_api = model_zoo(model_config)

		model = model_api(model_config, features, labels,
							mode, target, reuse=tf.AUTO_REUSE,
							**kargs)

		if mode == tf.estimator.ModeKeys.TRAIN:
			dropout_prob = model_config.dropout_prob
		else:
			dropout_prob = 0.0

		if model_io_config.fix_lm:
			scope = model_config.scope + "_finetuning"
		else:
			scope = model_config.scope

		logits = global_discriminator_logits(model_config, 
											model.get_pooled_output(), 
											reuse=tf.AUTO_REUSE, **kargs)

		model_io_fn = model_io.ModelIO(model_io_config)

		pretrained_tvars = model_io_fn.get_params(model_config.scope, 
										not_storage_params=not_storage_params)
		global_prediction_tvars = model_io_fn.get_params("cls/seq_global", 
									not_storage_params=not_storage_params)

		pretrained_tvars.extend(global_prediction_tvars)
		tvars = pretrained_tvars

		print('==discriminator parameters==', tvars)

		if load_pretrained == "yes":
			use_tpu = 1 if kargs.get('use_tpu', False) else 0
			scaffold_fn = model_io_fn.load_pretrained(tvars, 
											init_checkpoint,
											exclude_scope=exclude_scope,
											use_tpu=use_tpu,
											restore_var_name=model_config.get('restore_var_name', []))
		else:
			scaffold_fn = None
		
		return_dict = {
					"logits":logits,
					"tvars":tvars,
					"model":model
				}
		return return_dict
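
The scaffold_fn returned above follows the TPU convention: a callable that builds a tf.train.Scaffold restoring init_checkpoint on the TPU host. A hedged sketch of where it plugs in; loss and train_op are placeholders not built by this snippet:

# Hypothetical TPU wiring for the scaffold_fn above (TF 1.x contrib API).
estimator_spec = tf.contrib.tpu.TPUEstimatorSpec(
    mode=mode,
    loss=total_loss,          # placeholder
    train_op=train_op,        # placeholder
    scaffold_fn=scaffold_fn)  # restores the checkpoint inside the TPU graph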
Example #6
    def init_model(self):

        self.graph = tf.Graph()
        with self.graph.as_default():

            init_checkpoint = self.config["init_checkpoint"]
            bert_config = json.load(open(self.config["bert_config"], "r"))

            self.model_config = Bunch(bert_config)
            self.model_config.use_one_hot_embeddings = True
            self.model_config.scope = "bert"
            self.model_config.dropout_prob = 0.1
            self.model_config.label_type = "single_label"

            self.input_queue = Queue(maxsize=self.config.get("batch_size", 20))
            self.output_queue = Queue(
                maxsize=self.config.get("batch_size", 20))

            opt_config = Bunch({
                "init_lr": 2e-5,
                "num_train_steps": 1e30,
                "cycle": False
            })
            model_io_config = Bunch({"fix_lm": False})

            self.num_classes = len(self.label_dict["id2label"])
            self.max_seq_length = self.config["max_length"]

            self.tokenizer = tokenization.FullTokenizer(
                vocab_file=self.config["bert_vocab"], do_lower_case=True)

            self.sess = tf.Session()
            self.model_io_fn = model_io.ModelIO(model_io_config)

            model_fn = bert_classifier_estimator.classifier_model_fn_builder(
                self.model_config,
                self.num_classes,
                init_checkpoint,
                reuse=None,
                load_pretrained=True,
                model_io_fn=self.model_io_fn,
                model_io_config=model_io_config,
                opt_config=opt_config)

            self.estimator = tf.estimator.Estimator(
                model_fn=model_fn, model_dir=self.config["model_dir"])
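
init_model reads only a handful of keys from self.config; a sketch of the expected layout, with placeholder paths (self.label_dict is assumed to be loaded elsewhere, e.g. from a label2id file as in Example #3):

# Keys below are the ones init_model actually reads; values are placeholders.
config = {
    "init_checkpoint": "/path/to/model.ckpt",
    "bert_config": "/path/to/bert_config.json",
    "bert_vocab": "/path/to/vocab.txt",
    "max_length": 128,
    "model_dir": "/path/to/model_dir",
    "batch_size": 20
}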
Example #7
	def model_fn(features, labels, mode):

		original_loss = tf.constant(0.0)
		distilled_loss = tf.constant(0.0)

		st_model = st_model_fn(model_config_dict['student'],
		 			num_labels_dict['student'],
					init_checkpoint_dict['student'],
					model_reuse=None,
					load_pretrained=load_pretrained_dict['student'],
					model_io_config=model_io_config,
					opt_config=opt_config,
					exclude_scope=exclude_scope_dict.get('student', ""),
					not_storage_params=not_storage_params_dict.get('student', []),
					target=target_dict['student'],
					**kargs)
		st_dict = st_model(features, labels, mode)

		ta_model = ta_model_fn(model_config_dict['teacher'],
		 			num_labels_dict['teacher'],
					init_checkpoint_dict['teacher'],
					model_reuse=None,
					load_pretrained=load_pretrained_dict['teacher'],
					model_io_config=model_io_config,
					opt_config=opt_config,
					exclude_scope=exclude_scope_dict.get('teacher', ""),
					not_storage_params=not_storage_params_dict.get('teacher', []),
					target=target_dict['teacher'],
					**kargs)
		ta_dict = ta_model(features, labels, mode)

		student_logit = st_dict['logits']
		teacher_logit = ta_dict['logits']

		model_io_fn = model_io.ModelIO(model_io_config)

		feature_flag = False

		original_loss += st_dict['loss'] * (distillation_config.get('ce_loss', 1.0))
		print(distillation_config.get('ce_loss', 1.0), '===ce_loss===')

		
Example #8
    sess = tf.Session()

    num_train_steps = int(FLAGS.train_size / FLAGS.batch_size * FLAGS.epoch)
    num_warmup_steps = int(num_train_steps * 0.1)

    num_storage_steps = int(FLAGS.train_size / FLAGS.batch_size)

    opt_config = Bunch({
        "init_lr": 1e-5,
        "num_train_steps": num_train_steps,
        "num_warmup_steps": num_warmup_steps
    })

    model_io_config = Bunch({"fix_lm": False})

    model_io_fn = model_io.ModelIO(model_io_config)

    num_choice = FLAGS.num_classes
    max_seq_length = FLAGS.max_length

    model_train_fn = bert_classifier.multichoice_model_fn_builder(
        config,
        num_choice,
        init_checkpoint,
        reuse=None,
        load_pretrained=True,
        model_io_fn=model_io_fn,
        model_io_config=model_io_config,
        opt_config=opt_config)

    model_eval_fn = bert_classifier.multichoice_model_fn_builder(
        config,
        num_choice,
        init_checkpoint,
        reuse=True,  # assumed continuation: mirrors the train-time builder above
        load_pretrained=True,
        model_io_fn=model_io_fn,
        model_io_config=model_io_config,
        opt_config=opt_config)
Example #9
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug, **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        # config = json.load(open(FLAGS.config_file, "r"))

        # config = Bunch(config)
        # config.use_one_hot_embeddings = True
        # config.scope = "bert"
        # config.dropout_prob = 0.1
        # config.label_type = "single_label"

        # config.model = FLAGS.model_type

        config = model_config_parser(FLAGS)

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            print("==number of gpus==", kargs.get('num_gpus', 1))
            train_size = int(FLAGS.train_size / worker_count /
                             kargs.get('num_gpus', 1))
            # train_size = int(FLAGS.train_size)
            epoch = FLAGS.epoch
        else:
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = FLAGS.init_lr

        distillation_dict = json.load(tf.gfile.Open(FLAGS.distillation_config))
        distillation_config = Bunch(
            json.load(tf.gfile.Open(FLAGS.multi_task_config)))

        warmup_ratio = config.get('warmup', 0.1)

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        if config.get('ln_type', 'postln') == 'postln':
            num_warmup_steps = int(num_train_steps * warmup_ratio)
        elif config.get('ln_type', 'postln') == 'preln':
            num_warmup_steps = 0
        else:
            num_warmup_steps = int(num_train_steps * warmup_ratio)
        print('==num warmup steps==', num_warmup_steps)

        num_storage_steps = min([int(train_size / FLAGS.batch_size), 10000])
        if num_storage_steps <= 100:
            num_storage_steps = 500

        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10
        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".
              format(num_train_steps, num_eval_steps, num_storage_steps))

        print(" model type {}".format(FLAGS.model_type))

        print(num_train_steps, num_warmup_steps, "=============",
              kargs.get('num_gpus', 1), '==number of gpus==')

        if worker_count * kargs.get("num_gpus", 1) >= 2:
            clip_norm_scale = 1.0
            lr_scale = 0.8
        else:
            clip_norm_scale = 1.0
            lr_scale = 1.0
        lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
        if lr >= 1e-3:
            lr = 1e-3
        print('==init lr==', lr)

        opt_config = Bunch({
            "init_lr": lr,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "gpu_count": worker_count * kargs.get("num_gpus", 1),
            "opt_type": FLAGS.opt_type,
            "is_chief": is_chief,
            "train_op": kargs.get("train_op", "adam"),
            "decay": kargs.get("decay", "no"),
            "warmup": kargs.get("warmup", "no"),
            "clip_norm": config.get("clip_norm", 1.0),
            "grad_clip": config.get("grad_clip", "global_norm"),
            "epoch": FLAGS.epoch,
            "strategy": FLAGS.distribution_strategy
        })

        anneal_config = Bunch({
            "initial_value": 1.0,
            "num_train_steps": num_train_steps
        })

        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes

        if FLAGS.opt_type == "hvd" and hvd:
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "all_reduce":
            checkpoint_dir = checkpoint_dir
        elif FLAGS.opt_type == "collective_reduce":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir, is_chief)

        model_config_dict = {}
        num_labels_dict = {}
        init_checkpoint_dict = {}
        load_pretrained_dict = {}
        exclude_scope_dict = {}
        not_storage_params_dict = {}
        target_dict = {}

        for task_type in FLAGS.multi_task_type.split(","):
            print("==task type==", task_type)
            model_config_dict[task_type] = model_config_parser(
                Bunch(distillation_config[task_type]))
            print(task_type, distillation_config[task_type],
                  '=====task model config======')
            num_labels_dict[task_type] = distillation_config[task_type][
                "num_labels"]
            init_checkpoint_dict[task_type] = os.path.join(
                FLAGS.buckets,
                distillation_config[task_type]["init_checkpoint"])
            load_pretrained_dict[task_type] = distillation_config[task_type][
                "load_pretrained"]
            exclude_scope_dict[task_type] = distillation_config[task_type][
                "exclude_scope"]
            not_storage_params_dict[task_type] = distillation_config[
                task_type]["not_storage_params"]
            target_dict[task_type] = distillation_config[task_type]["target"]

        model_fn = distillation_model_fn(
            model_config_dict,
            num_labels_dict,
            init_checkpoint_dict,
            load_pretrained_dict,
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            output_type="estimator",
            distillation_config=distillation_dict,
            **kargs)

        name_to_features = data_interface(FLAGS)

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
			"""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t

            return example

        def _decode_batch_record(record, name_to_features):
            example = tf.parse_example(record, name_to_features)
            # for name in list(example.keys()):
            # 	t = example[name]
            # 	if t.dtype == tf.int64:
            # 		t = tf.to_int32(t)
            # 	example[name] = t

            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        if kargs.get("run_config", None):
            if kargs.get("parse_type", "parse_single") == "parse_single":
                train_features = lambda: tf_data_utils.all_reduce_train_input_fn(
                    train_file,
                    _decode_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)
                eval_features = lambda: tf_data_utils.all_reduce_eval_input_fn(
                    dev_file,
                    _decode_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)
            elif kargs.get("parse_type", "parse_single") == "parse_batch":
                print("==apply parse example==")
                train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
                    train_file,
                    _decode_batch_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)
                eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
                    dev_file,
                    _decode_batch_record,
                    name_to_features,
                    params,
                    if_shard=FLAGS.if_shard,
                    worker_count=worker_count,
                    task_index=task_index)

        else:
            train_features = lambda: tf_data_utils.train_input_fn(
                train_file,
                _decode_record,
                name_to_features,
                params,
                if_shard=FLAGS.if_shard,
                worker_count=worker_count,
                task_index=task_index)

            eval_features = lambda: tf_data_utils.eval_input_fn(
                dev_file,
                _decode_record,
                name_to_features,
                params,
                if_shard=FLAGS.if_shard,
                worker_count=worker_count,
                task_index=task_index)

        train_hooks = []
        eval_hooks = []

        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)
        if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            print("==no need for hook==")
        elif FLAGS.opt_type == "pai_soar" and pai:
            print("no need for hook")
        elif FLAGS.opt_type == "hvd" and hvd:
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            print("==no need fo hook==")
        else:
            print("==no need for hooks==")

        if kargs.get("run_config", None):
            run_config = kargs.get("run_config", None)
            run_config = run_config.replace(
                save_checkpoints_steps=num_storage_steps)
            print("==run config==", run_config.save_checkpoints_steps)
        else:
            run_config = tf.estimator.RunConfig(
                model_dir=checkpoint_dir,
                save_checkpoints_steps=num_storage_steps,
                session_config=sess_config)

        if kargs.get("profiler", "profiler") == "profiler":
            if checkpoint_dir:
                hooks = tf.train.ProfilerHook(
                    save_steps=100,
                    save_secs=None,
                    output_dir=os.path.join(checkpoint_dir, "profiler"),
                )
                train_hooks.append(hooks)
                print("==add profiler hooks==")

        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 model_dir=checkpoint_dir,
                                                 config=run_config)

        train_begin_time = time.time()
        tf.logging.info("==training distribution_strategy=={}".format(
            kargs.get("distribution_strategy", "MirroredStrategy")))
        if kargs.get("distribution_strategy",
                     "MirroredStrategy") == "MirroredStrategy":
            print("==apply single machine multi-card training==")

            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)

            model_estimator.train(input_fn=train_features,
                                  max_steps=num_train_steps,
                                  hooks=train_hooks)
            # tf.estimator.train(model_estimator, train_spec)

            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
            tf.logging.info("==training time=={}".format(train_end_time -
                                                         train_being_time))
            eval_results = model_estimator.evaluate(input_fn=eval_features,
                                                    steps=num_eval_steps)
            print(eval_results)

        elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                "ParameterServerStrategy", "CollectiveAllReduceStrategy"
        ]:
            print("==apply multi-machine machine multi-card training==")
            try:
                print(os.environ['TF_CONFIG'], "==tf_run_config==")
            except KeyError:
                print("==no TF_CONFIG set==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)

            # tf.estimator.train(model_estimator, train_spec) # tf 1.12 doesn't need evaluate

            tf.estimator.train_and_evaluate(model_estimator, train_spec,
                                            eval_spec)
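
For the ParameterServerStrategy/CollectiveAllReduceStrategy branch, tf.estimator.train_and_evaluate reads the cluster layout from the TF_CONFIG environment variable printed above. A sketch of the standard TF 1.x layout; the hosts and ports are placeholders:

# Standard TF_CONFIG layout consumed by tf.estimator.train_and_evaluate.
import json, os
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "chief": ["host0:2222"],
        "worker": ["host1:2222", "host2:2222"],
        "ps": ["host3:2222"]  # only needed for ParameterServerStrategy
    },
    "task": {"type": "worker", "index": 0}
})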
Example #10
    def model_fn(features, labels, mode):

        original_loss = tf.constant(0.0)
        distilled_loss = tf.constant(0.0)

        st_model = st_model_fn(
            model_config_dict['student'],
            num_labels_dict['student'],
            init_checkpoint_dict['student'],
            model_reuse=None,
            load_pretrained=load_pretrained_dict['student'],
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope=exclude_scope_dict.get('student', ""),
            not_storage_params=not_storage_params_dict.get('student', []),
            target=target_dict['student'],
            **kargs)
        st_dict = st_model(features, labels, mode)

        ta_model = ta_model_fn(
            model_config_dict['teacher'],
            num_labels_dict['teacher'],
            init_checkpoint_dict['teacher'],
            model_reuse=None,
            load_pretrained=load_pretrained_dict['teacher'],
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope=exclude_scope_dict.get('teacher', ""),
            not_storage_params=not_storage_params_dict.get('teacher', []),
            target=target_dict['teacher'],
            **kargs)
        ta_dict = ta_model(features, labels, mode)

        student_logit = st_dict['logits']
        teacher_logit = ta_dict['logits']

        model_io_fn = model_io.ModelIO(model_io_config)

        feature_flag = False

        original_loss += st_dict['loss'] * (distillation_config.get(
            'ce_loss', 1.0))
        print(distillation_config.get('ce_loss', 1.0), '===ce_loss===')
        tf.summary.scalar("ce_loss", st_dict['loss'])

        if 'kl_logits' in distillation_config.get('distillation_type',
                                                  ['kl_logits']):
            temperature = distillation_config.get('kl_temperature', 2.0)
            distilled_teacher_logit = tf.nn.log_softmax(
                (teacher_logit + 1e-10) / temperature)  # log_softmax logits
            distilled_student_logit = tf.nn.log_softmax(
                (studnet_logit + 1e-10) / temperature)  # log_softmax logits

            kl_distilled_loss = tf.reduce_mean(
                distillation_utils.kd(distilled_teacher_logit,
                                      distilled_student_logit))

            tf.summary.scalar("kl_logits_loss", kl_distilled_loss)
            tf.logging.info(
                "***** with knowledge distillation %s tenperature *****",
                str(temperature))

            # kl_distilled_loss *= np.power(temperature, 2)
            distilled_loss += kl_distilled_loss * distillation_config.get(
                'kl_logits_ratio', 0.9)
            print(distillation_config.get('kl_logits_ratio', 0.9),
                  '===kl_logits_ratio===')

        if 'rkd' in distillation_config.get('distillation_type',
                                            ['kl_logits']):
            source = ta_dict['model'].get_pooled_output()
            target = st_dict['model'].get_pooled_output()
            print("==apply rkd==")
            with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
                rkd_loss = repo_distillation_utils.RKD(source,
                                                       target,
                                                       l=[25, 50])
            tf.summary.scalar("rkd_loss", rkd_loss)
            distilled_loss += rkd_loss * distillation_config.get(
                "rkd_ratio", 0.1)

        if "attention_score_uniform" in distillation_config.get(
                'distillation_type', ['kl_logits']):
            source_attention_score = ta_dict['model'].get_multihead_attention()
            target_attention_score = st_dict['model'].get_multihead_attention()

            print("==apply attention_score_uniform==")

            with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
                attention_loss = uniform_mapping.attention_score_matching(
                    source_attention_score, target_attention_score,
                    features['input_mask'], 0)
            tf.summary.scalar("attention_score_uniform_loss", attention_loss)
            feature_flag = True
            distilled_loss += attention_loss * distillation_config.get(
                "attention_score_uniform", 0.1)

            print(distillation_config.get('attention_score_uniform', 0.1),
                  '===attention_score_uniform===')

        if "hidden_uniform" in distillation_config.get('distillation_type',
                                                       ['kl_logits']):
            source_hidden = ta_dict['model'].get_all_encoder_layers()
            target_hidden = st_dict['model'].get_all_encoder_layers()

            print("==apply hidden_uniform==")

            with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
                hidden_loss = uniform_mapping.hidden_matching(
                    source_hidden, target_hidden, features['input_mask'], 0)
            tf.summary.scalar("hidden_uniform_loss", hidden_loss)
            distilled_loss += hidden_loss * distillation_config.get(
                "hidden_uniform", 0.1)
            feature_flag = True

            print(distillation_config.get('hidden_uniform', 0.1),
                  '===hidden_uniform===')

        if "hidden_cls_uniform" in distillation_config.get(
                'distillation_type', ['kl_logits']):
            source_hidden = ta_dict['model'].get_all_encoder_layers()
            target_hidden = st_dict['model'].get_all_encoder_layers()

            print("==apply hidden_cls_uniform==")
            with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
                hidden_cls_loss = uniform_mapping.hidden_cls_matching(
                    source_hidden, target_hidden, 0)
            tf.summary.scalar("hidden_cls_uniform_loss", hidden_cls_loss)
            distilled_loss += hidden_cls_loss * distillation_config.get(
                "hidden_uniform", 0.1)
            feature_flag = True

        if "mdd" in distillation_config.get('distillation_type', ['mdd']):
            source = ta_dict['model'].get_pooled_output()
            target = st_dict['model'].get_pooled_output()

            print("==apply mdd==")

        if "cpc" in distillation_config.get('distillation_type', ['mdd']):
            source_hidden = ta_dict['model'].get_all_encoder_layers()
            target_hidden = st_dict['model'].get_all_encoder_layers()
            with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
                cpc_loss = cpc_utils.CPC_Hidden(target_hidden, source_hidden,
                                                features['input_mask'])
            tf.summary.scalar("hidden_cpc_loss", cpc_loss)
            distilled_loss += cpc_loss * distillation_config.get(
                "cpc_hidden", 0.1)

        if "wpc" in distillation_config.get('distillation_type', ['mdd']):
            source_hidden = ta_dict['model'].get_all_encoder_layers()
            target_hidden = st_dict['model'].get_all_encoder_layers()
            with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
                wpc_loss = cpc_utils.WPC_Hidden(target_hidden, source_hidden,
                                                features['input_mask'])
            tf.summary.scalar("hidden_wpc_loss", wpc_loss)
            distilled_loss += wpc_loss * distillation_config.get(
                "wpc_hidden", 0.1)

        total_loss = distilled_loss + original_loss

        tvars = []
        tvars.extend(st_dict['tvars'])

        if feature_flag:
            distillation_vars = model_io_fn.get_params('distillation',
                                                       not_storage_params=[])
            tvars.extend(distillation_vars)

        if mode == tf.estimator.ModeKeys.TRAIN:

            optimizer_fn = optimizer.Optimizer(opt_config)

            model_io_fn.print_params(tvars, string=", trainable params")
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            print("==update_ops==", update_ops)

            print('==total trainable vars==', list(tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer_fn.get_train_op(
                    total_loss, list(set(tvars)), opt_config.init_lr,
                    opt_config.num_train_steps, **kargs)

                model_io_fn.set_saver()

                if kargs.get("task_index", 1) == 0 and kargs.get(
                        "run_config", None):
                    training_hooks = []
                elif kargs.get("task_index", 1) == 0:
                    model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                          kargs.get("num_storage_steps", 1000))

                    training_hooks = model_io_fn.checkpoint_hook
                else:
                    training_hooks = []

                if len(optimizer_fn.distributed_hooks) >= 1:
                    training_hooks.extend(optimizer_fn.distributed_hooks)
                print(training_hooks, "==training_hooks==", "==task_index==",
                      kargs.get("task_index", 1))

                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    training_hooks=training_hooks)
                if output_type == "sess":
                    return {
                        "train": {
                            "loss": total_loss,
                            "logits": studnet_logit,
                            "train_op": train_op
                        },
                        "hooks": training_hooks
                    }
                elif output_type == "estimator":
                    return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, logits, label_ids, model_type):
                """Computes the loss and accuracy of the model."""
                sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
                sentence_predictions = tf.argmax(logits,
                                                 axis=-1,
                                                 output_type=tf.int32)
                sentence_labels = tf.reshape(label_ids, [-1])
                sentence_accuracy = tf.metrics.accuracy(
                    labels=label_ids, predictions=sentence_predictions)
                sentence_mean_loss = tf.metrics.mean(values=per_example_loss)
                sentence_f = tf_metrics.f1(label_ids,
                                           sentence_predictions,
                                           num_labels_dict['student'],
                                           None,
                                           average="macro")

                eval_metric_ops = {
                    "{}_f1".format(model_type): sentence_f,
                    "{}_acc".format(model_type): sentence_accuracy
                }

                return eval_metric_ops

            if output_type == "sess":
                return {
                    "eval": {
                        "per_example_loss":
                        st_dict['logits']['per_example_loss'],
                        "logits":
                        studnet_logit,
                        "loss":
                        tf.reduce_mean(st_dict['logits']['per_example_loss']),
                        "feature":
                        st_dict['model'].get_pooled_output()
                    }
                }
            elif output_type == "estimator":
                eval_metric_ops = metric_fn(st_dict['per_example_loss'],
                                            student_logit,
                                            features['label_ids'], "student")
                teacher_eval_metric_ops = metric_fn(
                    ta_dict['per_example_loss'], teacher_logit,
                    features['label_ids'], "teacher")

                eval_metric_ops.update(teacher_eval_metric_ops)

                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    eval_metric_ops=eval_metric_ops)
                return estimator_spec
        else:
            raise NotImplementedError()
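
Every distillation branch above is switched on through distillation_config; collecting the keys the code reads gives a sketch of a config enabling logit distillation plus hidden-state matching (values mirror the defaults in the code, not tuned recommendations):

# Each key below is read by the model_fn above; values mirror its defaults.
distillation_config = {
    "distillation_type": ["kl_logits", "hidden_uniform"],
    "ce_loss": 1.0,                  # weight on the student's CE loss
    "kl_temperature": 2.0,           # softmax temperature for kl_logits
    "kl_logits_ratio": 0.9,
    "rkd_ratio": 0.1,                # used when "rkd" is enabled
    "attention_score_uniform": 0.1,  # used when enabled
    "hidden_uniform": 0.1,
    "cpc_hidden": 0.1,               # used when "cpc" is enabled
    "wpc_hidden": 0.1                # used when "wpc" is enabled
}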
Example #11
    def model_fn(features, labels, mode):
        model = bert_encoder(model_config,
                             features,
                             labels,
                             mode,
                             target,
                             reuse=model_reuse)

        label_ids = features["label_ids"]

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
        else:
            dropout_prob = 0.0

        if model_io_config.fix_lm:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        with tf.variable_scope(scope, reuse=model_reuse):
            (loss, per_example_loss,
             logits) = classifier.classifier(model_config,
                                             model.get_pooled_output(),
                                             num_labels, label_ids,
                                             dropout_prob)

        model_io_fn = model_io.ModelIO(model_io_config)

        tvars = model_io_fn.get_params(model_config.scope,
                                       not_storage_params=not_storage_params)
        if load_pretrained:
            model_io_fn.load_pretrained(tvars,
                                        init_checkpoint,
                                        exclude_scope=exclude_scope)

        model_io_fn.set_saver(var_lst=tvars)

        if mode == tf.estimator.ModeKeys.TRAIN:

            optimizer_fn = optimizer.Optimizer(opt_config)

            model_io_fn.print_params(tvars, string=", trainable params")
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer_fn.get_train_op(
                    loss, tvars, opt_config.init_lr,
                    opt_config.num_train_steps)

                estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                            loss=loss,
                                                            train_op=train_op)
                if output_type == "sess":
                    return {
                        "train": {
                            "loss": loss,
                            "logits": logits,
                            "train_op": train_op
                        }
                    }
                elif output_type == "estimator":
                    return estimator_spec

        elif mode == tf.estimator.ModeKeys.PREDICT:
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            max_prob = tf.reduce_max(prob, axis=-1)

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={
                    'pred_label': pred_label,
                    "max_prob": max_prob
                },
                export_outputs={
                    "output":
                    tf.estimator.export.PredictOutput({
                        'pred_label': pred_label,
                        "max_prob": max_prob
                    })
                })
            return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, logits, label_ids):
                """Computes the loss and accuracy of the model."""
                sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
                sentence_predictions = tf.argmax(logits,
                                                 axis=-1,
                                                 output_type=tf.int32)
                sentence_labels = tf.reshape(label_ids, [-1])
                sentence_accuracy = tf.metrics.accuracy(
                    labels=label_ids, predictions=sentence_predictions)
                sentence_mean_loss = tf.metrics.mean(values=per_example_loss)
                sentence_f = tf_metrics.f1(label_ids,
                                           sentence_predictions,
                                           num_labels,
                                           label_lst,
                                           average="macro")

                eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy}

                return eval_metric_ops

            eval_metric_ops = metric_fn(per_example_loss, logits, label_ids)

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

            if output_type == "sess":
                return {
                    "eval": {
                        "per_example_loss": per_example_loss,
                        "logits": logits,
                        "loss": tf.reduce_mean(per_example_loss)
                    }
                }
            elif output_type == "estimator":
                return estimator_spec
        else:
            raise NotImplementedError()
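
With output_type == "sess", the TRAIN branch above returns raw tensors instead of an EstimatorSpec. A minimal session-mode loop consuming that dict; building features/labels and num_train_steps is assumed to happen in the data pipeline:

# Hypothetical session-driven training with the "sess" output above.
output = model_fn(features, labels, tf.estimator.ModeKeys.TRAIN)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(num_train_steps):
        loss_v, _ = sess.run([output["train"]["loss"],
                              output["train"]["train_op"]])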
Example #12
    def model_fn(features, labels, mode):

        task_type = kargs.get("task_type", "cls")
        num_task = kargs.get('num_task', 1)
        temp = kargs.get('temp', 0.1)

        print("==task_type==", task_type)

        model_io_fn = model_io.ModelIO(model_io_config)
        label_ids = tf.cast(features["{}_label_ids".format(task_type)],
                            dtype=tf.int32)

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
            is_training = True
        else:
            dropout_prob = 0.0
            is_training = False

        if model_io_config.fix_lm:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        if kargs.get("get_pooled_output", "pooled_output") == "pooled_output":
            pooled_feature = model.get_pooled_output()
        elif kargs.get("get_pooled_output", "task_output") == "task_output":
            pooled_feature_dict = model.get_task_output()
            pooled_feature = pooled_feature_dict['pooled_feature']

        shape_list = bert_utils.get_shape_list(
            pooled_feature_dict['feature_a'], expected_rank=[2])
        batch_size = shape_list[0]

        if kargs.get('apply_head_proj', False):
            with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
                feature_a = simclr_utils.projection_head(
                    pooled_feature_dict['feature_a'],
                    is_training,
                    head_proj_dim=128,
                    num_nlh_layers=1,
                    head_proj_mode='nonlinear',
                    name='head_contrastive')
                pooled_feature_dict['feature_a'] = feature_a

            with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
                feature_b = simclr_utils.projection_head(
                    pooled_feature_dict['feature_b'],
                    is_training,
                    head_proj_dim=128,
                    num_nlh_layers=1,
                    head_proj_mode='nonlinear',
                    name='head_contrastive')
                pooled_feature_dict['feature_b'] = feature_b
            tf.logging.info(
                "****** apply contrastive feature projection *******")
        else:
            feature_a = pooled_feature_dict['feature_a']
            feature_b = pooled_feature_dict['feature_b']
            tf.logging.info("****** not apply projection *******")

        # feature_a = tf.nn.l2_normalize(feature_a, axis=-1)
        # feature_b = tf.nn.l2_normalize(feature_b, axis=-1)
        # [batch_size, batch_size]
        if kargs.get("task_seperate_proj", False):
            if task_type == 'xquad' or task_type == 'wsdm':
                # for passage representation
                with tf.variable_scope(
                        scope + "/{}/feature_output_b".format(task_type),
                        reuse=tf.AUTO_REUSE):
                    feature_b = tf.layers.dense(
                        feature_b,
                        128,
                        use_bias=True,
                        activation=tf.tanh,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=0.01))
                tf.logging.info("****** apply passage projection *******")
            if task_type == 'afqmc':
                # for anchor representation
                with tf.variable_scope(
                        scope + "/{}/feature_output_a".format(task_type),
                        reuse=tf.AUTO_REUSE):
                    feature_a = tf.layers.dense(
                        feature_a,
                        128,
                        use_bias=True,
                        activation=tf.tanh,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=0.01))
                # for successor representation
                with tf.variable_scope(
                        scope + "/{}/feature_output_b".format(task_type),
                        reuse=tf.AUTO_REUSE):
                    feature_b = tf.layers.dense(
                        feature_b,
                        128,
                        use_bias=True,
                        activation=tf.tanh,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=0.01))
                tf.logging.info(
                    "****** apply cpc anchor, successor projection *******")

        cosine_score = tf.matmul(feature_a,
                                 tf.transpose(feature_b)) / model_config.get(
                                     'temperature', 0.5)
        print("==cosine_score shape==", cosine_score.get_shape())
        loss_mask = tf.cast(features["{}_loss_multipiler".format(task_type)],
                            tf.float32)

        if task_type == 'xquad':
            neg_true_mask = tf.cast(
                triplet_loss_utils._get_anchor_negative_triplet_mask(
                    label_ids), tf.float32)
            pos_true_mask = (1.0 - neg_true_mask) * tf.expand_dims(
                loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
            neg_true_mask = neg_true_mask * tf.expand_dims(
                loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
        elif task_type == 'wsdm':
            pos_label_mask = tf.cast(
                features["{}_label_ids".format(task_type)], dtype=tf.float32)
            loss_mask *= pos_label_mask
            pos_label_mask = tf.expand_dims(pos_label_mask,
                                            axis=-1)  # batch x batch
            score_shape = bert_utils.get_shape_list(cosine_score,
                                                    expected_rank=[2, 3])
            pos_true_mask = pos_label_mask * tf.eye(score_shape[0])
            neg_true_mask = tf.ones_like(cosine_score) - pos_true_mask
            pos_true_mask = pos_true_mask * tf.expand_dims(
                loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
            neg_true_mask = neg_true_mask * tf.expand_dims(
                loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
        elif task_type == 'afqmc':
            score_shape = bert_utils.get_shape_list(cosine_score,
                                                    expected_rank=[2, 3])
            not_equal_mask = get_labels_of_similarity(features['input_ids_a'],
                                                      features['input_ids_b'])
            pos_true_mask = tf.expand_dims(loss_mask, axis=-1) * tf.eye(
                score_shape[0])
            neg_true_mask = not_equal_mask * tf.expand_dims(
                loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)

        cosine_score_neg = neg_true_mask * cosine_score
        cosine_score_pos = -pos_true_mask * cosine_score

        y_pred_neg = cosine_score_neg - (1 - neg_true_mask) * 1e12
        y_pred_pos = cosine_score_pos - (1 - pos_true_mask) * 1e12

        # add circle-loss without margin and scale-factor
        joint_neg_loss = tf.reduce_logsumexp(y_pred_neg, axis=-1)
        joint_pos_loss = tf.reduce_logsumexp(y_pred_pos, axis=-1)
        logits = tf.nn.softplus(joint_neg_loss + joint_pos_loss)

        loss = tf.reduce_sum(
            logits * loss_mask) / (1e-10 + tf.reduce_sum(loss_mask))
        task_loss = loss
        params_size = model_io_fn.count_params(model_config.scope)
        print("==total encoder params==", params_size)

        if kargs.get("feature_distillation", True):
            universal_feature_a = features.get("input_ids_a_features", None)
            universal_feature_b = features.get("input_ids_b_features", None)

            if universal_feature_a is None or universal_feature_b is None:
                tf.logging.info(
                    "****** not apply feature distillation *******")
                feature_loss = tf.constant(0.0)
            else:
                feature_a = pooled_feature_dict['feature_a']
                feature_a_shape = bert_utils.get_shape_list(
                    feature_a, expected_rank=[2, 3])
                pretrain_feature_a_shape = bert_utils.get_shape_list(
                    universal_feature_a, expected_rank=[2, 3])
                if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                    with tf.variable_scope(scope + "/feature_proj",
                                           reuse=tf.AUTO_REUSE):
                        proj_feature_a = tf.layers.dense(
                            feature_a, pretrain_feature_a_shape[-1])
                    # with tf.variable_scope(scope+"/feature_rec", reuse=tf.AUTO_REUSE):
                    # 	proj_feature_a_rec = tf.layers.dense(proj_feature_a, feature_a_shape[-1])
                    # loss += tf.reduce_mean(tf.reduce_sum(tf.square(proj_feature_a_rec-feature_a), axis=-1))/float(num_task)
                    tf.logging.info(
                        "****** apply auto-encoder for feature compression *******"
                    )
                else:
                    proj_feature_a = feature_a
                feature_a_norm = tf.stop_gradient(
                    tf.sqrt(
                        tf.reduce_sum(tf.pow(proj_feature_a, 2),
                                      axis=-1,
                                      keepdims=True)) + 1e-20)
                proj_feature_a /= feature_a_norm

                feature_b = pooled_feature_dict['feature_b']
                if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                    with tf.variable_scope(scope + "/feature_proj",
                                           reuse=tf.AUTO_REUSE):
                        proj_feature_b = tf.layers.dense(
                            feature_b, pretrain_feature_a_shape[-1])
                    # with tf.variable_scope(scope+"/feature_rec", reuse=tf.AUTO_REUSE):
                    # 	proj_feature_b_rec = tf.layers.dense(proj_feature_b, feature_a_shape[-1])
                    # loss += tf.reduce_mean(tf.reduce_sum(tf.square(proj_feature_b_rec-feature_b), axis=-1))/float(num_task)
                    tf.logging.info(
                        "****** apply auto-encoder for feature compression *******"
                    )
                else:
                    proj_feature_b = feature_b

                feature_b_norm = tf.stop_gradient(
                    tf.sqrt(
                        tf.reduce_sum(tf.pow(proj_feature_b, 2),
                                      axis=-1,
                                      keepdims=True)) + 1e-20)
                proj_feature_b /= feature_b_norm

                feature_a_distillation = tf.reduce_mean(
                    tf.square(universal_feature_a - proj_feature_a), axis=-1)
                feature_b_distillation = tf.reduce_mean(
                    tf.square(universal_feature_b - proj_feature_b), axis=-1)

                feature_loss = tf.reduce_mean(
                    (feature_a_distillation + feature_b_distillation) /
                    2.0) / float(num_task)
                loss += feature_loss
                tf.logging.info(
                    "****** apply prertained feature distillation *******")

        if kargs.get("embedding_distillation", True):
            word_embed = model.emb_mat
            random_embed_shape = bert_utils.get_shape_list(
                word_embed, expected_rank=[2, 3])
            print("==random_embed_shape==", random_embed_shape)
            pretrained_embed = kargs.get('pretrained_embed', None)
            if pretrained_embed is None:
                tf.logging.info(
                    "****** not apply prertained feature distillation *******")
                embed_loss = tf.constant(0.0)
            else:
                pretrain_embed_shape = bert_utils.get_shape_list(
                    pretrained_embed, expected_rank=[2, 3])
                print("==pretrain_embed_shape==", pretrain_embed_shape)
                if random_embed_shape[-1] != pretrain_embed_shape[-1]:
                    with tf.variable_scope(scope + "/embedding_proj",
                                           reuse=tf.AUTO_REUSE):
                        proj_embed = tf.layers.dense(word_embed,
                                                     pretrain_embed_shape[-1])
                else:
                    proj_embed = word_embed

                embed_loss = tf.reduce_mean(
                    tf.reduce_mean(tf.square(proj_embed - pretrained_embed),
                                   axis=-1)) / float(num_task)
                loss += embed_loss
                tf.logging.info(
                    "****** apply prertained feature distillation *******")

        if mode == tf.estimator.ModeKeys.TRAIN:
            multi_task_config = kargs.get("multi_task_config", {})
            if multi_task_config.get(task_type,
                                     {}).get("lm_augumentation", False):
                print("==apply lm_augumentation==")
                masked_lm_positions = features["masked_lm_positions"]
                masked_lm_ids = features["masked_lm_ids"]
                masked_lm_weights = features["masked_lm_weights"]
                (masked_lm_loss, masked_lm_example_loss,
                 masked_lm_log_probs) = pretrain.get_masked_lm_output(
                     model_config,
                     model.get_sequence_output(),
                     model.get_embedding_table(),
                     masked_lm_positions,
                     masked_lm_ids,
                     masked_lm_weights,
                     reuse=model_reuse)

                masked_lm_loss_mask = tf.expand_dims(loss_mask, -1) * tf.ones(
                    (1,
                     multi_task_config[task_type]["max_predictions_per_seq"]))
                masked_lm_loss_mask = tf.reshape(masked_lm_loss_mask, (-1, ))

                masked_lm_label_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_loss_mask *= tf.cast(masked_lm_label_weights,
                                               tf.float32)

                masked_lm_example_loss *= masked_lm_loss_mask  # multiply task_mask
                masked_lm_loss = tf.reduce_sum(masked_lm_example_loss) / (
                    1e-10 + tf.reduce_sum(masked_lm_loss_mask))
                loss += multi_task_config[task_type][
                    "masked_lm_loss_ratio"] * masked_lm_loss

                masked_lm_label_ids = tf.reshape(masked_lm_ids, [-1])

                print(masked_lm_log_probs.get_shape(),
                      "===masked lm log probs===")
                print(masked_lm_label_ids.get_shape(), "===masked lm ids===")
                print(masked_lm_label_weights.get_shape(),
                      "===masked lm mask===")

                lm_acc = build_accuracy(masked_lm_log_probs,
                                        masked_lm_label_ids,
                                        masked_lm_loss_mask)

        if kargs.get("task_invariant", "no") == "yes":
            print("==apply task adversarial training==")
            with tf.variable_scope(scope + "/dann_task_invariant",
                                   reuse=model_reuse):
                (_, task_example_loss,
                 task_logits) = distillation_utils.feature_distillation(
                     model.get_pooled_output(), 1.0, features["task_id"],
                     kargs.get("num_task", 7), dropout_prob, True)
                masked_task_example_loss = loss_mask * task_example_loss
                masked_task_loss = tf.reduce_sum(masked_task_example_loss) / (
                    1e-10 + tf.reduce_sum(loss_mask))
                loss += kargs.get("task_adversarial", 1e-2) * masked_task_loss

        tvars = model_io_fn.get_params(model_config.scope,
                                       not_storage_params=not_storage_params)

        if mode == tf.estimator.ModeKeys.TRAIN:
            multi_task_config = kargs.get("multi_task_config", {})
            if multi_task_config.get(task_type,
                                     {}).get("lm_augumentation", False):
                print("==apply lm_augumentation==")
                masked_lm_pretrain_tvars = model_io_fn.get_params(
                    "cls/predictions", not_storage_params=not_storage_params)
                tvars.extend(masked_lm_pretrain_tvars)

        try:
            params_size = model_io_fn.count_params(model_config.scope)
            print("==total params==", params_size)
        except Exception:
            print("==failed to count params==")
        # print(tvars)
        if load_pretrained == "yes":
            model_io_fn.load_pretrained(tvars,
                                        init_checkpoint,
                                        exclude_scope=exclude_scope)

        if mode == tf.estimator.ModeKeys.TRAIN:

            # acc = build_accuracy(logits,
            # 					label_ids,
            # 					loss_mask,
            # 					loss_type=kargs.get('loss', 'contrastive_loss'))

            return_dict = {
                "loss": loss,
                "logits": logits,
                "task_num": tf.reduce_sum(loss_mask),
                "{}_pos_num".format(task_type): tf.reduce_sum(pos_true_mask),
                "{}_neg_num".format(task_type): tf.reduce_sum(neg_true_mask),
                "tvars": tvars
            }
            # return_dict["{}_acc".format(task_type)] = acc
            if kargs.get("task_invariant", "no") == "yes":
                return_dict["{}_task_loss".format(
                    task_type)] = masked_task_loss
                task_acc = build_accuracy(task_logits, features["task_id"],
                                          loss_mask)
                return_dict["{}_task_acc".format(task_type)] = task_acc
            if multi_task_config.get(task_type,
                                     {}).get("lm_augumentation", False):
                return_dict["{}_masked_lm_loss".format(
                    task_type)] = masked_lm_loss
                return_dict["{}_masked_lm_acc".format(task_type)] = lm_acc
            if kargs.get("embedding_distillation", True):
                return_dict["embed_loss"] = embed_loss * float(num_task)
            else:
                return_dict["embed_loss"] = task_loss
            if kargs.get("feature_distillation", True):
                return_dict["feature_loss"] = feature_loss * float(num_task)
            else:
                return_dict["feature_loss"] = task_loss
            return_dict["task_loss"] = task_loss
            return return_dict
        elif mode == tf.estimator.ModeKeys.EVAL:
            eval_dict = {
                "loss": loss,
                "logits": logits,
                "feature": model.get_pooled_output()
            }
            if kargs.get("adversarial", "no") == "adversarial":
                eval_dict["task_logits"] = task_logits
            return eval_dict
Exemple #13
0
    def model_fn(features, labels, mode, params):

        model_api = model_zoo(model_config)

        if kargs.get('random_generator', '1') == '1':
            if mode == tf.estimator.ModeKeys.TRAIN:
                input_ori_ids = features['input_ori_ids']

                # [output_ids,
                # sampled_binary_mask] = random_input_ids_generation(model_config,
                # 							features['input_ori_ids'],
                # 							features['input_mask'],
                # 							mask_probability=0.2,
                # 							replace_probability=0.1,
                # 							original_probability=0.1,
                # 							**kargs)

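                # HMM-driven masking: the transition matrices govern which
                # positions get masked (mask rate 0.2, no random replacement or
                # keep-original), replacing the independent per-token scheme
                # commented out above.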
                [output_ids, sampled_binary_mask] = hmm_input_ids_generation(
                    model_config,
                    features['input_ori_ids'],
                    features['input_mask'], [
                        tf.cast(tf.constant(hmm_tran_prob), tf.float32)
                        for hmm_tran_prob in hmm_tran_prob_list
                    ],
                    mask_probability=0.2,
                    replace_probability=0.0,
                    original_probability=0.0,
                    mask_prior=tf.constant(mask_prior, tf.float32),
                    **kargs)

                features['input_ids'] = output_ids
                tf.logging.info("****** do random generator *******")
            else:
                sampled_binary_mask = None
        else:
            sampled_binary_mask = None

        model = model_api(model_config,
                          features,
                          labels,
                          mode,
                          target,
                          reuse=tf.AUTO_REUSE,
                          **kargs)

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
        else:
            dropout_prob = 0.0

        if model_io_config.fix_lm == True:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        (nsp_loss, nsp_per_example_loss,
         nsp_log_prob) = pretrain.get_next_sentence_output(
             model_config,
             model.get_pooled_output(),
             features['next_sentence_labels'],
             reuse=tf.AUTO_REUSE,
             scope=generator_scope_prefix)

        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]

        if model_config.model_type == 'bert':
            masked_lm_fn = pretrain.get_masked_lm_output
            seq_masked_lm_fn = pretrain.seq_mask_masked_lm_output
            print("==apply bert masked lm==")
        elif model_config.model_type == 'albert':
            masked_lm_fn = pretrain_albert.get_masked_lm_output
            seq_masked_lm_fn = pretrain_albert.seq_mask_masked_lm_output
            print("==apply albert masked lm==")
        else:
            masked_lm_fn = pretrain.get_masked_lm_output
            seq_masked_lm_fn = pretrain.seq_mask_masked_lm_output
            print("==apply bert masked lm==")

        if sampled_binary_mask is not None:
            (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs,
             masked_lm_mask) = seq_masked_lm_fn(
                 model_config,
                 model.get_sequence_output(),
                 model.get_embedding_table(),
                 features['input_mask'],
                 features['input_ori_ids'],
                 features['input_ids'],
                 sampled_binary_mask,
                 reuse=tf.AUTO_REUSE,
                 embedding_projection=model.get_embedding_projection_table(),
                 scope=generator_scope_prefix)
            masked_lm_ids = features['input_ori_ids']
        else:
            (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs,
             masked_lm_mask) = masked_lm_fn(
                 model_config,
                 model.get_sequence_output(),
                 model.get_embedding_table(),
                 masked_lm_positions,
                 masked_lm_ids,
                 masked_lm_weights,
                 reuse=tf.AUTO_REUSE,
                 embedding_projection=model.get_embedding_projection_table(),
                 scope=generator_scope_prefix)
        print(model_config.lm_ratio, '==mlm lm_ratio==')
        loss = model_config.lm_ratio * masked_lm_loss + 0.0 * nsp_loss

        if kargs.get("resample_discriminator", False):
            input_ori_ids = features['input_ori_ids']

            [output_ids, sampled_binary_mask
             ] = random_input_ids_generation(model_config,
                                             features['input_ori_ids'],
                                             features['input_mask'],
                                             mask_probability=0.2,
                                             replace_probability=0.1,
                                             original_probability=0.1)

            resample_features = {}
            for key in features:
                resample_features[key] = features[key]

            resample_features['input_ids'] = tf.identity(output_ids)
            model_resample = model_api(model_config,
                                       resample_features,
                                       labels,
                                       mode,
                                       target,
                                       reuse=tf.AUTO_REUSE,
                                       **kargs)

            tf.logging.info("**** apply discriminator resample **** ")
        else:
            model_resample = model
            resample_features = features
            tf.logging.info("**** not apply discriminator resample **** ")

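        # Sample replacement tokens from the MLM head at masked positions only
        # ('only_mask'); the resulting sampled_ids are returned below for
        # downstream (e.g. discriminator) use.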
        sampled_ids = token_generator(model_config,
                                      model_resample.get_sequence_output(),
                                      model_resample.get_embedding_table(),
                                      resample_features['input_ids'],
                                      resample_features['input_ori_ids'],
                                      resample_features['input_mask'],
                                      embedding_projection=model_resample.
                                      get_embedding_projection_table(),
                                      scope=generator_scope_prefix,
                                      mask_method='only_mask',
                                      use_tpu=kargs.get('use_tpu', True))

        if model_config.get('gen_sample', 1) == 1:
            input_ids = features['input_ori_ids']
            input_mask = features['input_mask']
            segment_ids = features['segment_ids']
        else:
            input_ids = tf.expand_dims(features['input_ori_ids'], axis=-1)
            # batch x seq_length x 1
            input_ids = tf.einsum(
                'abc,cd->abd', input_ids,
                tf.ones((1, model_config.get('gen_sample', 1))))
            input_ids = tf.cast(input_ids, tf.int32)

            input_shape_list = bert_utils.get_shape_list(input_ids,
                                                         expected_rank=3)
            batch_size = input_shape_list[0]
            seq_length = input_shape_list[1]
            gen_sample = input_shape_list[2]

            sampled_ids = tf.reshape(sampled_ids,
                                     [batch_size * gen_sample, seq_length])
            input_ids = tf.reshape(input_ids,
                                   [batch_size * gen_sample, seq_length])

            input_mask = tf.expand_dims(features['input_mask'], axis=-1)
            input_mask = tf.einsum(
                'abc,cd->abd', input_mask,
                tf.ones((1, model_config.get('gen_sample', 1))))
            input_mask = tf.cast(input_mask, tf.int32)

            segment_ids = tf.expand_dims(features['segment_ids'], axis=-1)
            segment_ids = tf.einsum(
                'abc,cd->abd', segment_ids,
                tf.ones((1, model_config.get('gen_sample', 1))))
            segment_ids = tf.cast(segment_ids, tf.int32)

            segment_ids = tf.reshape(segment_ids,
                                     [batch_size * gen_sample, seq_length])
            input_mask = tf.reshape(input_mask,
                                    [batch_size * gen_sample, seq_length])

        model_io_fn = model_io.ModelIO(model_io_config)

        pretrained_tvars = model_io_fn.get_params(
            model_config.scope, not_storage_params=not_storage_params)

        if generator_scope_prefix:
            """
			"generator/cls/predictions"
			"""
            lm_pretrain_tvars = model_io_fn.get_params(
                generator_scope_prefix + "/cls/predictions",
                not_storage_params=not_storage_params)

            nsp_pretrain_vars = model_io_fn.get_params(
                generator_scope_prefix + "/cls/seq_relationship",
                not_storage_params=not_storage_params)
        else:
            lm_pretrain_tvars = model_io_fn.get_params(
                "cls/predictions", not_storage_params=not_storage_params)

            nsp_pretrain_vars = model_io_fn.get_params(
                "cls/seq_relationship", not_storage_params=not_storage_params)

        if model_config.get('embedding_scope', None) is not None:
            embedding_tvars = model_io_fn.get_params(
                model_config.get('embedding_scope', 'bert') + "/embeddings",
                not_storage_params=not_storage_params)
            pretrained_tvars.extend(embedding_tvars)

        pretrained_tvars.extend(lm_pretrain_tvars)
        pretrained_tvars.extend(nsp_pretrain_vars)
        tvars = pretrained_tvars

        print('==generator parameters==', tvars)

        if load_pretrained == "yes":
            use_tpu = 1 if kargs.get('use_tpu', False) else 0
            scaffold_fn = model_io_fn.load_pretrained(
                tvars,
                init_checkpoint,
                exclude_scope=exclude_scope,
                use_tpu=use_tpu,
                restore_var_name=model_config.get('restore_var_name', []))
        else:
            scaffold_fn = None
        tf.add_to_collection("generator_loss", loss)
        return_dict = {
            "loss": loss,
            "tvars": tvars,
            "model": model,
            "sampled_ids": sampled_ids,  # batch x gen_sample, seg_length
            "sampled_input_ids": input_ids,  # batch x gen_sample, seg_length,
            "sampled_input_mask": input_mask,
            "sampled_segment_ids": segment_ids,
            "masked_lm_ids": masked_lm_ids,
            "masked_lm_weights": masked_lm_mask,
            "masked_lm_log_probs": masked_lm_log_probs,
            "masked_lm_example_loss": masked_lm_example_loss,
            "next_sentence_example_loss": nsp_per_example_loss,
            "next_sentence_log_probs": nsp_log_prob,
            "next_sentence_labels": features['next_sentence_labels'],
            "sampled_binary_mask": sampled_binary_mask
        }
        return return_dict
Exemple #14
0
    def model_fn(features, labels, mode, params):

        train_op_type = kargs.get('train_op_type', 'joint')
        gen_disc_type = kargs.get('gen_disc_type', 'all_disc')
        mask_method = kargs.get('mask_method', 'only_mask')
        use_tpu = 1 if kargs.get('use_tpu', False) else 0
        print(train_op_type, "===train op type===", gen_disc_type,
              "===generator loss type===")
        if mask_method == 'only_mask':
            tf.logging.info(
                "****** generator token generation mask type:%s with only masked token *******",
                mask_method)
        elif mask_method == 'all_mask':
            tf.logging.info(
                "****** generator token generation mask type:%s with all token *******",
                mask_method)
        else:
            mask_method = 'only_mask'
            tf.logging.info(
                "****** generator token generation mask type:%s with only masked token *******",
                mask_method)

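        # 'grl' flips the generator's gradients (gradient reversal) and forces
        # joint training; 'minmax' leaves gradients intact so generator and
        # discriminator can be optimized adversarially.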
        if kargs.get('optimization_type', 'grl') == 'grl':
            if_flip_grad = True
            train_op_type = 'joint'
        elif kargs.get('optimization_type', 'grl') == 'minmax':
            if_flip_grad = False
        else:
            if_flip_grad = True
            train_op_type = 'joint'
        generator_fn = generator(
            model_config_dict['generator'],
            num_labels_dict['generator'],
            init_checkpoint_dict['generator'],
            model_reuse=None,
            load_pretrained=load_pretrained_dict['generator'],
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope=exclude_scope_dict.get('generator', ""),
            not_storage_params=not_storage_params_dict.get('generator', []),
            target=target_dict['generator'],
            if_flip_grad=if_flip_grad,
            # mask_method='only_mask',
            **kargs)

        tf.logging.info("****** train_op_type:%s *******", train_op_type)
        tf.logging.info("****** optimization_type:%s *******",
                        kargs.get('optimization_type', 'grl'))
        generator_dict = generator_fn(features, labels, mode, params)

        discriminator_fn = discriminator_generator(
            model_config_dict['discriminator'],
            num_labels_dict['discriminator'],
            init_checkpoint_dict['discriminator'],
            model_reuse=None,
            load_pretrained=load_pretrained_dict['discriminator'],
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope=exclude_scope_dict.get('discriminator', ""),
            not_storage_params=not_storage_params_dict.get(
                'discriminator', []),
            target=target_dict['discriminator'],
            loss='cross_entropy',
            **kargs)

        discriminator_features = {}
        # minmax_mode in ['masked', 'corrupted']
        minmax_mode = kargs.get('minmax_mode', 'corrupted')
        tf.logging.info("****** minmax mode for discriminator: %s *******",
                        minmax_mode)
        if minmax_mode == 'corrupted':
            tf.logging.info("****** gumbel 3-D sampled_ids *******")
        elif minmax_mode == 'masked':
            discriminator_features['ori_sampled_ids'] = generator_dict[
                'output_ids']
            discriminator_features['sampled_binary_mask'] = generator_dict[
                'sampled_binary_mask']
            tf.logging.info("****** conditional sampled_ids *******")
        discriminator_features['input_ids'] = generator_dict['sampled_ids']
        discriminator_features['input_mask'] = generator_dict[
            'sampled_input_mask']
        discriminator_features['segment_ids'] = generator_dict[
            'sampled_segment_ids']
        discriminator_features['input_ori_ids'] = generator_dict[
            'sampled_input_ids']
        discriminator_features['next_sentence_labels'] = features[
            'next_sentence_labels']
        discriminator_features['ori_input_ids'] = generator_dict['sampled_ids']

        discriminator_dict = discriminator_fn(discriminator_features, labels,
                                              mode, params)

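        # Compute the discriminator loss directly from generator outputs, then
        # split the per-token loss by whether the sampled token equals the
        # original ("equal" vs "not_equal" positions) via modified_loss.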
        [disc_loss, disc_logits, disc_per_example_loss
         ] = optimal_discriminator(model_config_dict['discriminator'],
                                   generator_dict,
                                   features,
                                   discriminator_dict,
                                   discriminator_features,
                                   use_tpu=use_tpu)

        [
            equal_per_example_loss, equal_loss_all, equal_loss_self,
            not_equal_per_example_loss, not_equal_loss_all, not_equal_loss_self
        ] = modified_loss(disc_per_example_loss,
                          disc_logits,
                          discriminator_features['input_ori_ids'],
                          discriminator_features['ori_input_ids'],
                          discriminator_features['input_mask'],
                          sampled_binary_mask=discriminator_features.get(
                              'sampled_binary_mask', None),
                          **kargs)
        output_dict = {}
        output_dict['logits'] = disc_logits
        output_dict['per_example_loss'] = disc_per_example_loss
        output_dict['loss'] = disc_loss + 0.0 * discriminator_dict["loss"]
        output_dict["equal_per_example_loss"] = equal_per_example_loss,
        output_dict["equal_loss_all"] = equal_loss_all,
        output_dict["equal_loss_self"] = equal_loss_self,
        output_dict["not_equal_per_example_loss"] = not_equal_per_example_loss,
        output_dict["not_equal_loss_all"] = not_equal_loss_all,
        output_dict["not_equal_loss_self"] = not_equal_loss_self
        output_dict['tvars'] = discriminator_dict['tvars']

        model_io_fn = model_io.ModelIO(model_io_config)

        tvars = []

        loss = kargs.get('dis_loss', 1.0) * output_dict['loss']

        tvars.extend(discriminator_dict['tvars'])

        if kargs.get('joint_train', '1') == '1':
            tf.logging.info(
                "****** joint generator and discriminator training *******")
            tvars.extend(generator_dict['tvars'])
            loss += generator_dict['loss']
        tvars = list(set(tvars))

        var_checkpoint_dict_list = []
        for key in init_checkpoint_dict:
            if load_pretrained_dict[key] == "yes":
                if key == 'generator':
                    tmp = {
                        "tvars":
                        generator_dict['tvars'],
                        "init_checkpoint":
                        init_checkpoint_dict['generator'],
                        "exclude_scope":
                        exclude_scope_dict[key],
                        "restore_var_name":
                        model_config_dict['generator'].get(
                            'restore_var_name', [])
                    }
                    if kargs.get("sharing_mode", "none") != "none":
                        tmp['exclude_scope'] = ''
                    var_checkpoint_dict_list.append(tmp)
                elif key == 'discriminator':
                    tmp = {
                        "tvars":
                        discriminator_dict['tvars'],
                        "init_checkpoint":
                        init_checkpoint_dict['discriminator'],
                        "exclude_scope":
                        exclude_scope_dict[key],
                        "restore_var_name":
                        model_config_dict['discriminator'].get(
                            'restore_var_name', [])
                    }
                    var_checkpoint_dict_list.append(tmp)

        use_tpu = 1 if kargs.get('use_tpu', False) else 0

        if len(var_checkpoint_dict_list) >= 1:
            scaffold_fn = model_io_fn.load_multi_pretrained(
                var_checkpoint_dict_list, use_tpu=use_tpu)
        else:
            scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:

            if not kargs.get('use_tpu', False):
                metric_dict = discriminator_metric_train(
                    output_dict['per_example_loss'], output_dict['logits'],
                    generator_dict['sampled_input_ids'],
                    generator_dict['sampled_ids'],
                    generator_dict['sampled_input_mask'])

                for key in metric_dict:
                    tf.summary.scalar(key, metric_dict[key])
                tf.summary.scalar("generator_loss", generator_dict['loss'])
                tf.summary.scalar("discriminator_loss",
                                  discriminator_dict['loss'])

            if kargs.get('use_tpu', False):
                optimizer_fn = optimizer.Optimizer(opt_config)
                use_tpu = 1
            else:
                optimizer_fn = distributed_optimizer.Optimizer(opt_config)
                use_tpu = 0

            model_io_fn.print_params(tvars, string=", trainable params")

            train_op = get_train_op(generator_dict,
                                    output_dict,
                                    optimizer_fn,
                                    opt_config,
                                    model_config_dict['generator'],
                                    model_config_dict['discriminator'],
                                    use_tpu=use_tpu,
                                    train_op_type=train_op_type,
                                    gen_disc_type=gen_disc_type)

            # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            # with tf.control_dependencies(update_ops):
            # 	train_op = optimizer_fn.get_train_op(loss, list(set(tvars)),
            # 					opt_config.init_lr,
            # 					opt_config.num_train_steps,
            # 					use_tpu=use_tpu)

            if kargs.get('use_tpu', False):
                estimator_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn
                    # training_hooks=[logging_hook]
                )
            else:
                estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                            loss=loss,
                                                            train_op=train_op)

            return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:

            if kargs.get('joint_train', '0') == '1':

                def joint_metric(masked_lm_example_loss, masked_lm_log_probs,
                                 masked_lm_ids, masked_lm_weights,
                                 next_sentence_example_loss,
                                 next_sentence_log_probs, next_sentence_labels,
                                 per_example_loss, logits, input_ori_ids,
                                 input_ids, input_mask):
                    generator_metric = generator_metric_fn_eval(
                        masked_lm_example_loss, masked_lm_log_probs,
                        masked_lm_ids, masked_lm_weights,
                        next_sentence_example_loss, next_sentence_log_probs,
                        next_sentence_labels)
                    discriminator_metric = discriminator_metric_eval(
                        per_example_loss, logits, input_ori_ids, input_ids,
                        input_mask)
                    generator_metric.update(discriminator_metric)
                    return generator_metric

                tpu_eval_metrics = (joint_metric, [
                    generator_dict['masked_lm_example_loss'],
                    generator_dict['masked_lm_log_probs'],
                    generator_dict['masked_lm_ids'],
                    generator_dict['masked_lm_weights'],
                    generator_dict.get('next_sentence_example_loss', None),
                    generator_dict.get('next_sentence_log_probs', None),
                    generator_dict.get('next_sentence_labels', None),
                    discriminator_dict['per_example_loss'],
                    discriminator_dict['logits'],
                    generator_dict['sampled_input_ids'],
                    generator_dict['sampled_ids'],
                    generator_dict['sampled_input_mask']
                ])
                gpu_eval_metrics = joint_metric(
                    generator_dict['masked_lm_example_loss'],
                    generator_dict['masked_lm_log_probs'],
                    generator_dict['masked_lm_ids'],
                    generator_dict['masked_lm_weights'],
                    generator_dict.get('next_sentence_example_loss', None),
                    generator_dict.get('next_sentence_log_probs', None),
                    generator_dict.get('next_sentence_labels', None),
                    discriminator_dict['per_example_loss'],
                    discriminator_dict['logits'],
                    generator_dict['sampled_input_ids'],
                    generator_dict['sampled_ids'],
                    generator_dict['sampled_input_mask'])
            else:
                gpu_eval_metrics = discriminator_metric_eval(
                    discriminator_dict['per_example_loss'],
                    discriminator_dict['logits'],
                    generator_dict['sampled_input_ids'],
                    generator_dict['sampled_ids'],
                    generator_dict['sampled_input_mask'])
                tpu_eval_metrics = (discriminator_metric_eval, [
                    discriminator_dict['per_example_loss'],
                    discriminator_dict['logits'],
                    generator_dict['sampled_input_ids'],
                    generator_dict['sampled_ids'],
                    generator_dict['sampled_input_mask']
                ])

            if kargs.get('use_tpu', False):
                estimator_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    eval_metrics=tpu_eval_metrics,
                    scaffold_fn=scaffold_fn)
            else:
                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=gpu_eval_metrics)

            return estimator_spec
        else:
            raise NotImplementedError()
Exemple #15
0
	def model_fn(features, labels, mode):

		if model_io_config.fix_lm == True:
			scope = model_config.scope + "_finetuning"
		else:
			scope = model_config.scope

		if mode == tf.estimator.ModeKeys.TRAIN:
			dropout_prob = model_config.dropout_prob
		else:
			dropout_prob = 0.0

		label_ids = features["label_ids"]

		model_lst = []
		for index, name in enumerate(target):
			if index > 0:
				reuse = True
			else:
				reuse = model_reuse
			model_lst.append(bert_encoding(model_config, features, labels, 
												mode, name,
												scope, dropout_prob, 
												reuse=reuse))

		[input_mask_a, repres_a] = model_lst[0]
		[input_mask_b, repres_b] = model_lst[1]

		output_a, output_b = alignment_aggerate(model_config, 
				repres_a, repres_b, 
				input_mask_a, 
				input_mask_b, 
				scope, 
				reuse=model_reuse)

		if model_config.pooling == "ave_max_pooling":
			pooling_fn = ave_max_pooling
		elif model_config.pooling == "multihead_pooling":
			pooling_fn = multihead_pooling

		repres_a = pooling_fn(model_config, output_a, 
					input_mask_a, 
					scope, 
					dropout_prob, 
					reuse=model_reuse)

		repres_b = pooling_fn(model_config, output_b,
					input_mask_b,
					scope, 
					dropout_prob,
					reuse=True)

		pair_repres = tf.concat([repres_a, repres_b,
					tf.abs(repres_a-repres_b),
					repres_b*repres_a], axis=-1)

		with tf.variable_scope(scope, reuse=model_reuse):
			(loss, 
				per_example_loss, 
				logits) = classifier.classifier(model_config,
											pair_repres,
											num_labels,
											label_ids,
											dropout_prob)

		model_io_fn = model_io.ModelIO(model_io_config)

		tvars = model_io_fn.get_params(model_config.scope, 
										not_storage_params=not_storage_params)
		if load_pretrained:
			model_io_fn.load_pretrained(tvars, 
										init_checkpoint,
										exclude_scope=exclude_scope)

		if mode == tf.estimator.ModeKeys.TRAIN:

			optimizer_fn = optimizer.Optimizer(opt_config)

			model_io_fn.print_params(tvars, string=", trainable params")
			update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
			with tf.control_dependencies(update_ops):
				train_op = optimizer_fn.get_train_op(loss, tvars, 
								opt_config.init_lr, 
								opt_config.num_train_steps)

				model_io_fn.set_saver()

				if kargs.get("task_index", 1) == 0:
					model_io_fn.get_hooks(kargs.get("checkpoint_dir", None), 
										kargs.get("num_storage_steps", 1000))

					training_hooks = model_io_fn.checkpoint_hook
				else:
					training_hooks = []

				if len(optimizer_fn.distributed_hooks) >= 1:
					training_hooks.extend(optimizer_fn.distributed_hooks)
				print(training_hooks)

				estimator_spec = tf.estimator.EstimatorSpec(mode=mode, 
								loss=loss, train_op=train_op,
								training_hooks=training_hooks)

				if output_type == "sess":
					return {
						"train":{
										"loss":loss, 
										"logits":logits,
										"train_op":train_op
									},
						"hooks":training_hooks
					}
				elif output_type == "estimator":
					return estimator_spec
		elif mode == tf.estimator.ModeKeys.PREDICT:
			print(logits.get_shape(), "===logits shape===")
			pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
			prob = tf.nn.softmax(logits)
			max_prob = tf.reduce_max(prob, axis=-1)
			
			estimator_spec = tf.estimator.EstimatorSpec(
									mode=mode,
									predictions={
												'pred_label':pred_label,
												"max_prob":max_prob
								  	},
									export_outputs={
										"output":tf.estimator.export.PredictOutput(
													{
														'pred_label':pred_label,
														"max_prob":max_prob
													}
												)
								  	}
						)
			return estimator_spec

		elif mode == tf.estimator.ModeKeys.EVAL:
			def metric_fn(per_example_loss,
						logits, 
						label_ids):
				"""Computes the loss and accuracy of the model."""
				sentence_log_probs = tf.reshape(
					logits, [-1, logits.shape[-1]])
				sentence_predictions = tf.argmax(
					logits, axis=-1, output_type=tf.int32)
				sentence_labels = tf.reshape(label_ids, [-1])
				sentence_accuracy = tf.metrics.accuracy(
					labels=label_ids, predictions=sentence_predictions)
				sentence_mean_loss = tf.metrics.mean(
					values=per_example_loss)
				sentence_f = tf_metrics.f1(label_ids, 
										sentence_predictions, 
										num_labels, 
										label_lst, average="macro")

				eval_metric_ops = {
									"f1": sentence_f,
									"loss": sentence_mean_loss,
									"acc":sentence_accuracy
								}

				return eval_metric_ops

			eval_metric_ops = metric_fn( 
							per_example_loss,
							logits, 
							label_ids)
			
			estimator_spec = tf.estimator.EstimatorSpec(mode=mode, 
								loss=loss,
								eval_metric_ops=eval_metric_ops)
			if output_type == "sess":
				return {
					"eval":{
							"per_example_loss":per_example_loss,
							"logits":logits,
							"loss":tf.reduce_mean(per_example_loss)
						}
				}
			elif output_type == "estimator":
				return estimator_spec
		else:
			raise NotImplementedError()
Exemple #16
0
    def model_fn(features, labels, mode):

        train_ops = []
        train_hooks = []
        logits_dict = {}
        losses_dict = {}
        features_dict = {}
        tvars = []
        task_num_dict = {}
        multi_task_config = kargs.get('multi_task_config', {})

        total_loss = tf.constant(0.0)

        task_num = 0

        encoder = {}
        hook_dict = {}

        print(task_type_dict.keys(), "==task type dict==")
        num_task = len(task_type_dict)

        from data_generator import load_w2v
        flags = kargs.get('flags', Bunch({}))
        print(flags.pretrained_w2v_path, "===pretrain vocab path===")
        w2v_path = os.path.join(flags.buckets, flags.pretrained_w2v_path)
        vocab_path = os.path.join(flags.buckets, flags.vocab_file)

        # [w2v_embed, token2id,
        # id2token, is_extral_symbol, use_pretrained] = load_w2v.load_pretrained_w2v(vocab_path, w2v_path)

        # pretrained_embed = tf.cast(tf.constant(w2v_embed), tf.float32)
        pretrained_embed = None

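        # Build each task head in turn: encoders are cached in `encoder` and
        # shared across tasks with the same model_type, and every task adds
        # its loss * loss_ratio to total_loss.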
        for index, task_type in enumerate(task_type_dict.keys()):
            if model_config_dict[task_type].model_type in model_type_lst:
                reuse = True
            else:
                reuse = None
                model_type_lst.append(model_config_dict[task_type].model_type)

            if model_config_dict[task_type].model_type not in encoder:
                model_api = model_zoo(model_config_dict[task_type])

                model = model_api(model_config_dict[task_type],
                                  features,
                                  labels,
                                  mode,
                                  target_dict[task_type],
                                  reuse=reuse,
                                  cnn_type=model_config_dict[task_type].get(
                                      'cnn_type', 'bi_dgcnn'))
                encoder[model_config_dict[task_type].model_type] = model

                # vae_kl_model = vae_model_fn(encoder[model_config_dict[task_type].model_type],
                # 			model_config_dict[task_type],
                # 			num_labels_dict[task_type],
                # 			init_checkpoint_dict[task_type],
                # 			reuse,
                # 			load_pretrained_dict[task_type],
                # 			model_io_config,
                # 			opt_config,
                # 			exclude_scope=exclude_scope_dict[task_type],
                # 			not_storage_params=not_storage_params_dict[task_type],
                # 			target=target_dict[task_type],
                # 			label_lst=None,
                # 			output_type=output_type,
                # 			task_layer_reuse=task_layer_reuse,
                # 			task_type=task_type,
                # 			num_task=num_task,
                # 			task_adversarial=1e-2,
                # 			get_pooled_output='task_output',
                # 			feature_distillation=False,
                # 			embedding_distillation=False,
                # 			pretrained_embed=pretrained_embed,
                # 			**kargs)
                # vae_result_dict = vae_kl_model(features, labels, mode)
                # tvars.extend(vae_result_dict['tvars'])
                # total_loss += vae_result_dict["loss"]
                # for key in vae_result_dict:
                # 	if key in ['perplexity', 'token_acc', 'kl_div']:
                # 		hook_dict[key] = vae_result_dict[key]
            print(encoder, "==encode==")

            if task_type_dict[task_type] == "cls_task":
                task_model_fn = cls_model_fn(
                    encoder[model_config_dict[task_type].model_type],
                    model_config_dict[task_type],
                    num_labels_dict[task_type],
                    init_checkpoint_dict[task_type],
                    reuse,
                    load_pretrained_dict[task_type],
                    model_io_config,
                    opt_config,
                    exclude_scope=exclude_scope_dict[task_type],
                    not_storage_params=not_storage_params_dict[task_type],
                    target=target_dict[task_type],
                    label_lst=None,
                    output_type=output_type,
                    task_layer_reuse=task_layer_reuse,
                    task_type=task_type,
                    num_task=num_task,
                    task_adversarial=1e-2,
                    get_pooled_output='task_output',
                    feature_distillation=False,
                    embedding_distillation=False,
                    pretrained_embed=pretrained_embed,
                    **kargs)
                result_dict = task_model_fn(features, labels, mode)
                tf.logging.info("****** task: *******",
                                task_type_dict[task_type], task_type)
            elif task_type_dict[task_type] == "embed_task":
                task_model_fn = embed_model_fn(
                    encoder[model_config_dict[task_type].model_type],
                    model_config_dict[task_type],
                    num_labels_dict[task_type],
                    init_checkpoint_dict[task_type],
                    reuse,
                    load_pretrained_dict[task_type],
                    model_io_config,
                    opt_config,
                    exclude_scope=exclude_scope_dict[task_type],
                    not_storage_params=not_storage_params_dict[task_type],
                    target=target_dict[task_type],
                    label_lst=None,
                    output_type=output_type,
                    task_layer_reuse=task_layer_reuse,
                    task_type=task_type,
                    num_task=num_task,
                    task_adversarial=1e-2,
                    get_pooled_output='task_output',
                    feature_distillation=False,
                    embedding_distillation=False,
                    pretrained_embed=pretrained_embed,
                    loss='contrastive_loss',
                    apply_head_proj=False,
                    **kargs)
                result_dict = task_model_fn(features, labels, mode)
                tf.logging.info("****** task: *******",
                                task_type_dict[task_type], task_type)
                # cpc_model_fn = embed_cpc_model_fn(encoder[model_config_dict[task_type].model_type],
                # 								model_config_dict[task_type],
                # 								num_labels_dict[task_type],
                # 								init_checkpoint_dict[task_type],
                # 								reuse,
                # 								load_pretrained_dict[task_type],
                # 								model_io_config,
                # 								opt_config,
                # 								exclude_scope=exclude_scope_dict[task_type],
                # 								not_storage_params=not_storage_params_dict[task_type],
                # 								target=target_dict[task_type],
                # 								label_lst=None,
                # 								output_type=output_type,
                # 								task_layer_reuse=task_layer_reuse,
                # 								task_type=task_type,
                # 								num_task=num_task,
                # 								task_adversarial=1e-2,
                # 								get_pooled_output='task_output',
                # 								feature_distillation=False,
                # 								embedding_distillation=False,
                # 								pretrained_embed=pretrained_embed,
                # 								loss='contrastive_loss',
                # 								apply_head_proj=False,
                # 								**kargs)

                # cpc_result_dict = cpc_model_fn(features, labels, mode)
                # result_dict['loss'] += cpc_result_dict['loss']
                # result_dict['tvars'].extend(cpc_result_dict['tvars'])
                # hook_dict["{}_all_neg_loss".format(task_type)] = cpc_result_dict['loss']
                # hook_dict["{}_all_neg_num".format(task_type)] = cpc_result_dict['task_num']

            elif task_type_dict[task_type] == "cpc_task":
                task_model_fn = embed_cpc_v1_model_fn(
                    encoder[model_config_dict[task_type].model_type],
                    model_config_dict[task_type],
                    num_labels_dict[task_type],
                    init_checkpoint_dict[task_type],
                    reuse,
                    load_pretrained_dict[task_type],
                    model_io_config,
                    opt_config,
                    exclude_scope=exclude_scope_dict[task_type],
                    not_storage_params=not_storage_params_dict[task_type],
                    target=target_dict[task_type],
                    label_lst=None,
                    output_type=output_type,
                    task_layer_reuse=task_layer_reuse,
                    task_type=task_type,
                    num_task=num_task,
                    task_adversarial=1e-2,
                    get_pooled_output='task_output',
                    feature_distillation=False,
                    embedding_distillation=False,
                    pretrained_embed=pretrained_embed,
                    loss='contrastive_loss',
                    apply_head_proj=False,
                    task_seperate_proj=True,
                    **kargs)
                result_dict = task_model_fn(features, labels, mode)
                tf.logging.info("****** task: *******",
                                task_type_dict[task_type], task_type)

            elif task_type_dict[task_type] == "regression_task":
                task_model_fn = regression_model_fn(
                    encoder[model_config_dict[task_type].model_type],
                    model_config_dict[task_type],
                    num_labels_dict[task_type],
                    init_checkpoint_dict[task_type],
                    reuse,
                    load_pretrained_dict[task_type],
                    model_io_config,
                    opt_config,
                    exclude_scope=exclude_scope_dict[task_type],
                    not_storage_params=not_storage_params_dict[task_type],
                    target=target_dict[task_type],
                    label_lst=None,
                    output_type=output_type,
                    task_layer_reuse=task_layer_reuse,
                    task_type=task_type,
                    num_task=num_task,
                    task_adversarial=1e-2,
                    get_pooled_output='task_output',
                    feature_distillation=False,
                    embedding_distillation=False,
                    pretrained_embed=pretrained_embed,
                    loss='contrastive_loss',
                    apply_head_proj=False,
                    **kargs)
                result_dict = task_model_fn(features, labels, mode)
                tf.logging.info("****** task: *******",
                                task_type_dict[task_type], task_type)
            else:
                continue
            print("==SUCCEEDED IN LODING==", task_type)

            # result_dict = task_model_fn(features, labels, mode)
            logits_dict[task_type] = result_dict["logits"]
            losses_dict[task_type] = result_dict["loss"]  # task loss
            for key in [
                    "pos_num", "neg_num", "masked_lm_loss", "task_loss", "acc",
                    "task_acc", "masked_lm_acc"
            ]:
                name = "{}_{}".format(task_type, key)
                if name in result_dict:
                    hook_dict[name] = result_dict[name]
            hook_dict["{}_loss".format(task_type)] = result_dict["loss"]
            hook_dict["{}_num".format(task_type)] = result_dict["task_num"]
            print("==loss ratio==", task_type,
                  multi_task_config[task_type].get('loss_ratio', 1.0))
            total_loss += result_dict["loss"] * multi_task_config[
                task_type].get('loss_ratio', 1.0)
            hook_dict['embed_loss'] = result_dict["embed_loss"]
            hook_dict['feature_loss'] = result_dict["feature_loss"]
            hook_dict["{}_task_loss".format(
                task_type)] = result_dict["task_loss"]
            if 'positive_label' in result_dict:
                hook_dict["{}_task_positive_label".format(
                    task_type)] = result_dict["positive_label"]
            if mode == tf.estimator.ModeKeys.TRAIN:
                tvars.extend(result_dict["tvars"])
                task_num += result_dict["task_num"]
                task_num_dict[task_type] = result_dict["task_num"]
            elif mode == tf.estimator.ModeKeys.EVAL:
                features[task_type] = result_dict["feature"]

        hook_dict["total_loss"] = total_loss

        if mode == tf.estimator.ModeKeys.TRAIN:
            model_io_fn = model_io.ModelIO(model_io_config)

            optimizer_fn = optimizer.Optimizer(opt_config)

            model_io_fn.print_params(list(set(tvars)),
                                     string=", trainable params")
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            print("==update_ops==", update_ops)

            with tf.control_dependencies(update_ops):
                train_op = optimizer_fn.get_train_op(
                    total_loss, list(set(tvars)), opt_config.init_lr,
                    opt_config.num_train_steps, **kargs)

                model_io_fn.set_saver(optimizer_fn.opt)

                if kargs.get("task_index", 1) == 1 and kargs.get(
                        "run_config", None):
                    model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                          kargs.get("num_storage_steps", 1000))

                    training_hooks = model_io_fn.checkpoint_hook
                elif kargs.get("task_index", 1) == 1:
                    training_hooks = []
                else:
                    training_hooks = []

                if len(optimizer_fn.distributed_hooks) >= 1:
                    training_hooks.extend(optimizer_fn.distributed_hooks)
                print(training_hooks, "==training_hooks==", "==task_index==",
                      kargs.get("task_index", 1))

            if output_type == "sess":
                return {
                    "train": {
                        "total_loss": total_loss,
                        "loss": losses_dict,
                        "logits": logits_dict,
                        "train_op": train_op,
                        "task_num_dict": task_num_dict
                    },
                    "hooks": train_hooks
                }
            elif output_type == "estimator":

                hook_dict['learning_rate'] = optimizer_fn.learning_rate
                logging_hook = tf.train.LoggingTensorHook(hook_dict,
                                                          every_n_iter=100)
                training_hooks.append(logging_hook)

                print("==hook_dict==")

                print(hook_dict)

                for key in hook_dict:
                    tf.summary.scalar(key, hook_dict[key])
                    for index, task_type in enumerate(task_type_dict.keys()):
                        tmp = "{}_loss".format(task_type)
                        if tmp == key:
                            tf.summary.scalar(
                                "loss_gap_{}".format(task_type),
                                hook_dict["total_loss"] - hook_dict[key])
                for key in task_num_dict:
                    tf.summary.scalar(key + "_task_num", task_num_dict[key])

                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    training_hooks=training_hooks)
                return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:  # eval execute for each class solo

            def metric_fn(logits, label_ids):
                """Computes the loss and accuracy of the model."""
                sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
                sentence_predictions = tf.argmax(logits,
                                                 axis=-1,
                                                 output_type=tf.int32)
                sentence_labels = tf.reshape(label_ids, [-1])
                sentence_accuracy = tf.metrics.accuracy(
                    labels=label_ids, predictions=sentence_predictions)
                sentence_f = tf_metrics.f1(label_ids,
                                           sentence_predictions,
                                           num_labels,
                                           label_lst,
                                           average="macro")

                eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy}

                return eval_metric_ops

            if output_type == "sess":
                return {
                    "eval": {
                        "logits": logits_dict,
                        "total_loss": total_loss,
                        "feature": features,
                        "loss": losses_dict
                    }
                }
            elif output_type == "estimator":
                eval_metric_ops = {}
                for key in logits_dict:
                    eval_dict = metric_fn(logits_dict[key],
                                          features_task_dict[key]["label_ids"])
                    for sub_key in eval_dict.keys():
                        eval_key = "{}_{}".format(key, sub_key)
                        eval_metric_ops[eval_key] = eval_dict[sub_key]
                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss / task_num,
                    eval_metric_ops=eval_metric_ops)
                return estimator_spec
        else:
            raise NotImplementedError()
Exemple #17
0
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug, **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        config = json.load(open(FLAGS.config_file, "r"))

        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"

        config.model = FLAGS.model_type
        config.init_lr = 1e-4
        config.ln_type = FLAGS.ln_type

        config.loss = 'entropy'

        print('==init learning rate==', config.init_lr)

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch
        else:
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = config.init_lr

        label_dict = json.load(tf.gfile.Open(FLAGS.label_id))

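        # num_train_steps covers all epochs; 10% of it is reserved for LR
        # warmup, and a checkpoint is stored roughly once per epoch
        # (train_size / batch_size steps).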
        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)

        num_storage_steps = int(train_size / FLAGS.batch_size)

        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10
        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".
              format(num_train_steps, num_eval_steps, num_storage_steps))

        print(" model type {}".format(FLAGS.model_type))

        print(num_train_steps, num_warmup_steps, "=============")

        if worker_count * kargs.get("num_gpus", 1) >= 2:
            clip_norm_scale = 1.0
            lr_scale = 0.75
        else:
            clip_norm_scale = 1.0
            lr_scale = 1.0
        lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
        # if lr >= 1e-3:
        # 	lr = 1e-3
        lr = config.init_lr
        print('--training learning rate--', lr)

        opt_config = Bunch({
            "init_lr": lr,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "opt_type": FLAGS.opt_type,
            "is_chief": is_chief,
            "train_op": kargs.get("train_op", "adam"),
            "decay": kargs.get("decay", "no"),
            "warmup": kargs.get("warmup", "no"),
            "clip_norm": 1
        })

        anneal_config = Bunch({
            "initial_value": 1.0,
            "num_train_steps": num_train_steps
        })

        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes

        if FLAGS.opt_type == "hvd" and hvd:
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "all_reduce":
            checkpoint_dir = checkpoint_dir
        elif FLAGS.opt_type == "collective_reduce":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir, is_chief)

        model_fn = model_fn_builder(config,
                                    num_classes,
                                    init_checkpoint,
                                    model_reuse=None,
                                    load_pretrained=FLAGS.load_pretrained,
                                    model_io_config=model_io_config,
                                    opt_config=opt_config,
                                    model_io_fn=model_io_fn,
                                    exclude_scope="",
                                    not_storage_params=[],
                                    target=kargs.get("input_target", ""),
                                    output_type="estimator",
                                    checkpoint_dir=checkpoint_dir,
                                    num_storage_steps=num_storage_steps,
                                    task_index=task_index,
                                    anneal_config=anneal_config,
                                    **kargs)

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
			"""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t

            return example

        def _decode_batch_record(record, name_to_features):
            example = tf.parse_example(record, name_to_features)
            # for name in list(example.keys()):
            # 	t = example[name]
            # 	if t.dtype == tf.int64:
            # 		t = tf.to_int32(t)
            # 	example[name] = t

            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
            train_file,
            _decode_batch_record,
            name_to_features,
            params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)
        eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
            dev_file,
            _decode_batch_record,
            name_to_features,
            params,
            if_shard=FLAGS.if_shard,
            worker_count=worker_count,
            task_index=task_index)

        sess_config = tf.ConfigProto(allow_soft_placement=False,
                                     log_device_placement=False)
        if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            print("==no need for hook==")
        elif FLAGS.opt_type == "pai_soar" and pai:
            print("no need for hook")
        elif FLAGS.opt_type == "hvd" and hvd:
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            print("==no need fo hook==")
        else:
            print("==no need for hooks==")

        if kargs.get("run_config", None):
            run_config = kargs.get("run_config", None)
            run_config = run_config.replace(
                save_checkpoints_steps=num_storage_steps)
            print("==run config==", run_config.save_checkpoints_steps)
        else:
            run_config = tf.estimator.RunConfig(
                model_dir=checkpoint_dir,
                save_checkpoints_steps=num_storage_steps,
                session_config=sess_config)

        train_hooks = []
        if kargs.get("profiler", "profiler") == "profiler":
            if checkpoint_dir:
                hooks = tf.train.ProfilerHook(
                    save_steps=100,
                    save_secs=None,
                    output_dir=os.path.join(checkpoint_dir, "profiler"),
                )
                train_hooks.append(hooks)
                print("==add profiler hooks==")

        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 model_dir=checkpoint_dir,
                                                 config=run_config)

        train_begin_time = time.time()
        tf.logging.info("==training distribution_strategy=={}".format(
            kargs.get("distribution_strategy", "MirroredStrategy")))
        if kargs.get("distribution_strategy",
                     "MirroredStrategy") == "MirroredStrategy":
            print("==apply single machine multi-card training==")
            # model_estimator.train(input_fn=train_features,
            # 				max_steps=num_train_steps,
            # 				hooks=train_hooks)

            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)

            model_estimator.train(input_fn=train_features,
                                  max_steps=num_train_steps,
                                  hooks=train_hooks)
            # tf.estimator.train(model_estimator, train_spec)
            # tf.estimator.evaluate(model_estimator, eval_spec)

            train_end_time = time.time()
            print("==training time==", train_end_time - train_begin_time)
            tf.logging.info("==training time=={}".format(train_end_time -
                                                         train_begin_time))
            eval_results = model_estimator.evaluate(input_fn=eval_features,
                                                    steps=num_eval_steps)
            # print(eval_results)

        elif kargs.get("distribution_strategy", "MirroredStrategy") in [
                "ParameterServerStrategy", "CollectiveAllReduceStrategy"
        ]:
            print("==apply multi-machine machine multi-card training==")
            try:
                print(os.environ['TF_CONFIG'], "==tf_run_config==")
            except:
                print("==not tf config==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)

            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)

            tf.estimator.train_and_evaluate(model_estimator, train_spec,
                                            eval_spec)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_begin_time)
Exemple #18
0
    def model_fn(features, labels, mode, params):

        train_op_type = kargs.get('train_op_type', 'joint')

        ebm_noise_fce = EBM_NOISE_NCE(
            model_config_dict,
            num_labels_dict,
            init_checkpoint_dict,
            load_pretrained_dict,
            model_io_config=model_io_config,
            opt_config=opt_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            **kargs)

        model_io_fn = model_io.ModelIO(model_io_config)
        use_tpu = 1 if kargs.get('use_tpu', False) else 0

        if mode == tf.estimator.ModeKeys.TRAIN:

            if kargs.get('use_tpu', False):
                optimizer_fn = optimizer.Optimizer(opt_config)
                use_tpu = 1
            else:
                optimizer_fn = distributed_optimizer.Optimizer(opt_config)
                use_tpu = 0

            train_op = get_train_op(ebm_noise_fce,
                                    optimizer_fn,
                                    opt_config,
                                    model_config_dict['ebm_dist'],
                                    model_config_dict['noise_dist'],
                                    model_config_dict['generator'],
                                    features,
                                    labels,
                                    mode,
                                    params,
                                    use_tpu=use_tpu,
                                    train_op_type=train_op_type,
                                    alternate_order=['ebm', 'generator'])

            ebm_noise_fce.load_pretrained_model(**kargs)
            var_checkpoint_dict_list = ebm_noise_fce.var_checkpoint_dict_list
            loss = ebm_noise_fce.loss
            tvars = ebm_noise_fce.tvars

            if len(var_checkpoint_dict_list) >= 1:
                scaffold_fn = model_io_fn.load_multi_pretrained(
                    var_checkpoint_dict_list, use_tpu=use_tpu)
            else:
                scaffold_fn = None

            metric_dict = ebm_train_metric(
                ebm_noise_fce.true_ebm_dist_dict['logits'],
                ebm_noise_fce.fake_ebm_dist_dict['logits'])

            if not kargs.get('use_tpu', False):
                for key in metric_dict:
                    tf.summary.scalar(key, metric_dict[key])
                tf.summary.scalar("ebm_loss",
                                  ebm_noise_fce.ebm_opt_dict['ebm_loss'])
                tf.summary.scalar("mlm_loss",
                                  ebm_noise_fce.ebm_opt_dict['mlm_loss'])
                tf.summary.scalar("all_loss",
                                  ebm_noise_fce.ebm_opt_dict['all_loss'])

            model_io_fn.print_params(tvars, string=", trainable params")

            if kargs.get('use_tpu', False):
                estimator_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn)
            else:
                estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                            loss=loss,
                                                            train_op=train_op)

            return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:

            ebm_noise_fce.get_loss(features, labels, mode, params, **kargs)
            ebm_noise_fce.load_pretrained_model(**kargs)
            var_checkpoint_dict_list = ebm_noise_fce.var_checkpoint_dict_list
            loss = ebm_noise_fce.loss

            if len(var_checkpoint_dict_list) >= 1:
                scaffold_fn = model_io_fn.load_multi_pretrained(
                    var_checkpoint_dict_list, use_tpu=use_tpu)
            else:
                scaffold_fn = None

            tpu_eval_metrics = (ebm_eval_metric, [
                ebm_noise_fce.true_ebm_dist_dict['logits'],
                ebm_noise_fce.fake_ebm_dist_dict['logits']
            ])
            gpu_eval_metrics = ebm_eval_metric(
                ebm_noise_fce.true_ebm_dist_dict['logits'],
                ebm_noise_fce.fake_ebm_dist_dict['logits'])

            if kargs.get('use_tpu', False):
                estimator_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    eval_metrics=tpu_eval_metrics,
                    scaffold_fn=scaffold_fn)
            else:
                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=gpu_eval_metrics)

            return estimator_spec
        else:
            raise NotImplementedError()
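The TRAIN/EVAL branches above follow the usual TF 1.x convention for metrics: TPUEstimatorSpec takes eval_metrics as a (metric_fn, args) pair that is re-run on the host, while the plain EstimatorSpec takes the already-built metric-ops dict. A minimal sketch of that dispatch (pure Python; metric_fn stands in for something like ebm_eval_metric):

# Sketch of the TPU-vs-GPU eval-metrics dispatch used above.
def build_eval_metric_kwargs(metric_fn, metric_args, use_tpu):
    if use_tpu:
        # TPUEstimatorSpec(eval_metrics=(fn, args)): fn is called later.
        return {"eval_metrics": (metric_fn, metric_args)}
    # EstimatorSpec(eval_metric_ops=dict): the dict is built eagerly.
    return {"eval_metric_ops": metric_fn(*metric_args)}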
Exemple #19
0
	def model_fn(features, labels, mode, params):

		model_api = model_zoo(model_config)

		model = model_api(model_config, features, labels,
							mode, target, reuse=tf.AUTO_REUSE)

		if mode == tf.estimator.ModeKeys.TRAIN:
			dropout_prob = model_config.dropout_prob
		else:
			dropout_prob = 0.0

		if model_io_config.fix_lm == True:
			scope = model_config.scope + "_finetuning"
		else:
			scope = model_config.scope
		
		(nsp_loss, 
		 nsp_per_example_loss, 
		 nsp_log_prob) = pretrain.get_next_sentence_output(model_config,
										model.get_pooled_output(),
										features['next_sentence_labels'],
										reuse=tf.AUTO_REUSE)

		masked_lm_positions = features["masked_lm_positions"]
		masked_lm_ids = features["masked_lm_ids"]
		masked_lm_weights = features["masked_lm_weights"]

		if model_config.model_type == 'bert':
			masked_lm_fn = pretrain.get_masked_lm_output
			print("==apply bert masked lm==")
		elif model_config.model_type == 'albert':
			masked_lm_fn = pretrain_albert.get_masked_lm_output
			print("==apply albert masked lm==")
		else:
			masked_lm_fn = pretrain.get_masked_lm_output
			print("==apply bert masked lm==")

		(masked_lm_loss,
		masked_lm_example_loss, 
		masked_lm_log_probs,
		masked_lm_mask) = masked_lm_fn(
										model_config, 
										model.get_sequence_output(), 
										model.get_embedding_table(),
										masked_lm_positions, 
										masked_lm_ids, 
										masked_lm_weights,
										reuse=tf.AUTO_REUSE,
										embedding_projection=model.get_embedding_projection_table())
		print(model_config.lm_ratio, '==mlm lm_ratio==')
		loss = model_config.lm_ratio * masked_lm_loss #+ model_config.nsp_ratio * nsp_loss
		
		model_io_fn = model_io.ModelIO(model_io_config)

		pretrained_tvars = model_io_fn.get_params(model_config.scope, 
										not_storage_params=not_storage_params)

		lm_pretrain_tvars = model_io_fn.get_params("cls/predictions", 
									not_storage_params=not_storage_params)

		pretrained_tvars.extend(lm_pretrain_tvars)

		if load_pretrained == "yes":
			scaffold_fn = model_io_fn.load_pretrained(pretrained_tvars, 
											init_checkpoint,
											exclude_scope=exclude_scope,
											use_tpu=1)
		else:
			scaffold_fn = None
                print("******* scaffold fn *******", scaffold_fn)
		if mode == tf.estimator.ModeKeys.TRAIN:
						
			optimizer_fn = optimizer.Optimizer(opt_config)
						
			tvars = pretrained_tvars
			model_io_fn.print_params(tvars, string=", trainable params")
			
			# update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
			# with tf.control_dependencies(update_ops):
			print('==gpu count==', opt_config.get('gpu_count', 1))

			train_op = optimizer_fn.get_train_op(loss, tvars,
							opt_config.init_lr, 
							opt_config.num_train_steps,
							use_tpu=opt_config.use_tpu)

			train_metric_dict = train_metric_fn(
					masked_lm_example_loss, masked_lm_log_probs, 
					masked_lm_ids,
					masked_lm_weights, 
					nsp_per_example_loss,
					nsp_log_prob, 
					features['next_sentence_labels'],
					masked_lm_mask=masked_lm_mask
				)

			# for key in train_metric_dict:
			# 	tf.summary.scalar(key, train_metric_dict[key])
			# tf.summary.scalar('learning_rate', optimizer_fn.learning_rate)

			estimator_spec = tf.contrib.tpu.TPUEstimatorSpec(
							mode=mode,
							loss=loss,
							train_op=train_op,
							scaffold_fn=scaffold_fn)

			return estimator_spec

		elif mode == tf.estimator.ModeKeys.EVAL:

			def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
					masked_lm_weights, next_sentence_example_loss,
					next_sentence_log_probs, next_sentence_labels):
				"""Computes the loss and accuracy of the model."""
				masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
												 [-1, masked_lm_log_probs.shape[-1]])
				masked_lm_predictions = tf.argmax(
					masked_lm_log_probs, axis=-1, output_type=tf.int32)
				masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
				masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
				masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
				masked_lm_accuracy = tf.metrics.accuracy(
					labels=masked_lm_ids,
					predictions=masked_lm_predictions,
					weights=masked_lm_weights)
				masked_lm_mean_loss = tf.metrics.mean(
					values=masked_lm_example_loss, weights=masked_lm_weights)

				next_sentence_log_probs = tf.reshape(
					next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
				next_sentence_predictions = tf.argmax(
					next_sentence_log_probs, axis=-1, output_type=tf.int32)
				next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
				next_sentence_accuracy = tf.metrics.accuracy(
					labels=next_sentence_labels, predictions=next_sentence_predictions)
				next_sentence_mean_loss = tf.metrics.mean(
					values=next_sentence_example_loss)

				return {
					"masked_lm_accuracy": masked_lm_accuracy,
					"masked_lm_loss": masked_lm_mean_loss,
					"next_sentence_accuracy": next_sentence_accuracy,
					"next_sentence_loss": next_sentence_mean_loss
					}

			eval_metrics = (metric_fn, [
			  masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
			  masked_lm_weights, nsp_per_example_loss,
			  nsp_log_prob, features['next_sentence_labels']
			])

			estimator_spec = tf.contrib.tpu.TPUEstimatorSpec(
						  mode=mode,
						  loss=loss,
						  eval_metrics=eval_metrics,
						  scaffold_fn=scaffold_fn)

			return estimator_spec
		else:
			raise NotImplementedError()
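The metric_fn above flattens the masked-LM log-probs to [num_predictions, vocab] and weights accuracy by masked_lm_weights so padded prediction slots do not count. The same computation as a small numpy sketch with toy shapes:

import numpy as np

# 4 masked positions, vocab of 5; the last slot is padding (weight 0).
masked_lm_log_probs = np.log(np.full((4, 5), 0.2))
masked_lm_log_probs[0, 2] = 0.0           # position 0 now prefers token 2
masked_lm_ids = np.array([2, 1, 3, 0])
masked_lm_weights = np.array([1.0, 1.0, 1.0, 0.0])

predictions = masked_lm_log_probs.argmax(axis=-1)
correct = (predictions == masked_lm_ids).astype(np.float32)
accuracy = (correct * masked_lm_weights).sum() / masked_lm_weights.sum()
print(predictions, accuracy)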
Exemple #20
0
	def model_fn(features, labels, mode):

		# model = bert_encoder(model_config, features, labels,
		# 					mode, target, reuse=model_reuse)

		model = albert_encoder(model_config, features, labels,
							mode, target, reuse=model_reuse)

		label_ids = features["label_ids"]

		if mode == tf.estimator.ModeKeys.TRAIN:
			dropout_prob = model_config.dropout_prob
		else:
			dropout_prob = 0.0

		if model_io_config.fix_lm == True:
			scope = model_config.scope + "_finetuning"
		else:
			scope = model_config.scope

		with tf.variable_scope(scope, reuse=model_reuse):
			(loss, 
				per_example_loss, 
				logits) = classifier.classifier(model_config,
											model.get_pooled_output(),
											num_labels,
											label_ids,
											dropout_prob)

		model_io_fn = model_io.ModelIO(model_io_config)

		tvars = model_io_fn.get_params(model_config.scope, 
										not_storage_params=not_storage_params)

		if load_pretrained == "yes":
			model_io_fn.load_pretrained(tvars, 
										init_checkpoint,
										exclude_scope=exclude_scope)

		if mode == tf.estimator.ModeKeys.TRAIN:

			optimizer_fn = optimizer.Optimizer(opt_config)

			model_io_fn.print_params(tvars, string=", trainable params")
			
			update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
			print("==update_ops==", update_ops)

			with tf.control_dependencies(update_ops):
				train_op = optimizer_fn.get_train_op(loss, tvars, 
								opt_config.init_lr, 
								opt_config.num_train_steps,
								**kargs)
			# train_op, hooks = model_io_fn.get_ema_hooks(train_op, 
			# 					tvars,
			# 					kargs.get('params_moving_average_decay', 0.99),
			# 					scope, mode, 
			# 					first_stage_steps=opt_config.num_warmup_steps,
			# 					two_stage=True)

			model_io_fn.set_saver()

			estimator_spec = tf.estimator.EstimatorSpec(mode=mode, 
								loss=loss, train_op=train_op)
				
			return estimator_spec
		elif mode == tf.estimator.ModeKeys.EVAL:
			
			# _, hooks = model_io_fn.get_ema_hooks(None,
			# 							None,
			# 							kargs.get('params_moving_average_decay', 0.99), 
			# 							scope, mode)

			hooks = None

			def metric_fn(per_example_loss,
						logits, 
						label_ids):
				"""Computes the loss and accuracy of the model."""
				sentence_log_probs = tf.reshape(
					logits, [-1, logits.shape[-1]])
				sentence_predictions = tf.argmax(
					logits, axis=-1, output_type=tf.int32)
				sentence_labels = tf.reshape(label_ids, [-1])
				sentence_accuracy = tf.metrics.accuracy(
					labels=label_ids, predictions=sentence_predictions)
				sentence_mean_loss = tf.metrics.mean(
					values=per_example_loss)
				sentence_f = tf_metrics.f1(label_ids, 
										sentence_predictions, 
										num_labels, 
										label_lst, average="macro")

				eval_metric_ops = {
									"f1": sentence_f,
									"acc":sentence_accuracy
								}

				return eval_metric_ops

			eval_metric_ops = metric_fn( 
							per_example_loss,
							logits, 
							label_ids)

			eval_hooks = [hooks] if hooks else []
			
			estimator_spec = tf.estimator.EstimatorSpec(mode=mode, 
								loss=loss,
								eval_metric_ops=eval_metric_ops,
								evaluation_hooks=eval_hooks
								)

			if output_type == "sess":
				return {
					"eval":{
							"per_example_loss":per_example_loss,
							"logits":logits,
							"loss":tf.reduce_mean(per_example_loss)
						}
				}
			elif output_type == "estimator":
				return estimator_spec
		else:
			raise NotImplementedError()
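Like most model_fn builders in these examples, the one above returns either raw fetches for a hand-rolled session loop or an EstimatorSpec, depending on output_type. A minimal sketch of that dual-return convention (pure Python):

# Sketch of the output_type dispatch shared by these model_fn builders.
def dispatch_output(output_type, estimator_spec, sess_fetches):
    if output_type == "sess":
        # Caller drives training itself via sess.run(sess_fetches["train"]).
        return sess_fetches
    elif output_type == "estimator":
        # Caller hands the spec to tf.estimator.Estimator.
        return estimator_spec
    raise NotImplementedError(output_type)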
Exemple #21
0
    def model_fn(features, labels, mode):

        model_api = model_zoo(model_config)

        model = model_api(model_config,
                          features,
                          labels,
                          mode,
                          target,
                          reuse=model_reuse,
                          **kargs)

        label_ids = features["label_ids"]

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
        else:
            dropout_prob = 0.0

        if model_io_config.fix_lm == True:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        with tf.variable_scope(scope, reuse=model_reuse):
            (loss, per_example_loss,
             logits) = classifier.classifier(model_config,
                                             model.get_pooled_output(),
                                             num_labels, label_ids,
                                             dropout_prob)

        model_io_fn = model_io.ModelIO(model_io_config)

        tvars = model_io_fn.get_params(model_config.scope,
                                       not_storage_params=not_storage_params)

        try:
            params_size = model_io_fn.count_params(model_config.scope)
            print("==total params==", params_size)
        except:
            print("==not count params==")
        print(tvars)
        if load_pretrained == "yes":
            model_io_fn.load_pretrained(tvars,
                                        init_checkpoint,
                                        exclude_scope=exclude_scope)

        if mode == tf.estimator.ModeKeys.TRAIN:

            optimizer_fn = optimizer.Optimizer(opt_config)

            model_io_fn.print_params(tvars, string=", trainable params")
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            print("==update_ops==", update_ops)
            with tf.control_dependencies(update_ops):
                train_op = optimizer_fn.get_train_op(
                    loss, tvars, opt_config.init_lr,
                    opt_config.num_train_steps, **kargs)

                model_io_fn.set_saver()

                if kargs.get("task_index", 1) == 0 and kargs.get(
                        "run_config", None):
                    training_hooks = []
                elif kargs.get("task_index", 1) == 0:
                    model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                          kargs.get("num_storage_steps", 1000))

                    training_hooks = model_io_fn.checkpoint_hook
                else:
                    training_hooks = []

                if len(optimizer_fn.distributed_hooks) >= 1:
                    training_hooks.extend(optimizer_fn.distributed_hooks)
                print(training_hooks, "==training_hooks==", "==task_index==",
                      kargs.get("task_index", 1))

                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    training_hooks=training_hooks)
                print(tf.global_variables(), "==global_variables==")
                if output_type == "sess":
                    return {
                        "train": {
                            "loss": loss,
                            "logits": logits,
                            "train_op": train_op
                        },
                        "hooks": training_hooks
                    }
                elif output_type == "estimator":
                    return estimator_spec

        elif mode == tf.estimator.ModeKeys.PREDICT:
            # if model_config.get('label_type', 'single_label') == 'single_label':
            # 	print(logits.get_shape(), "===logits shape===")
            # 	pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            # 	prob = tf.nn.softmax(logits)
            # 	max_prob = tf.reduce_max(prob, axis=-1)

            # 	estimator_spec = tf.estimator.EstimatorSpec(
            # 							mode=mode,
            # 							predictions={
            # 										'pred_label':pred_label,
            # 										"max_prob":max_prob
            # 							},
            # 							export_outputs={
            # 								"output":tf.estimator.export.PredictOutput(
            # 											{
            # 												'pred_label':pred_label,
            # 												"max_prob":max_prob
            # 											}
            # 										)
            # 							}
            # 				)
            if model_config.get('label_type', 'single_label') == 'multi_label':
                prob = tf.nn.sigmoid(logits)
                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    predictions={
                        'pred_label': prob,
                        "max_prob": prob
                    },
                    export_outputs={
                        "output":
                        tf.estimator.export.PredictOutput({
                            'pred_label': prob,
                            "max_prob": prob
                        })
                    })
            elif model_config.get('label_type',
                                  'single_label') == "single_label":
                prob = tf.nn.softmax(logits)
                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    predictions={
                        'pred_label': prob,
                        "max_prob": prob
                    },
                    export_outputs={
                        "output":
                        tf.estimator.export.PredictOutput({
                            'pred_label': prob,
                            "max_prob": prob
                        })
                    })
            return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, logits, label_ids):
                """Computes the loss and accuracy of the model."""
                sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
                sentence_predictions = tf.argmax(logits,
                                                 axis=-1,
                                                 output_type=tf.int32)
                sentence_labels = tf.reshape(label_ids, [-1])
                sentence_accuracy = tf.metrics.accuracy(
                    labels=label_ids, predictions=sentence_predictions)
                sentence_mean_loss = tf.metrics.mean(values=per_example_loss)
                sentence_f = tf_metrics.f1(label_ids,
                                           sentence_predictions,
                                           num_labels,
                                           label_lst,
                                           average="macro")

                eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy}

                return eval_metric_ops

            if output_type == "sess":
                return {
                    "eval": {
                        "per_example_loss": per_example_loss,
                        "logits": logits,
                        "loss": tf.reduce_mean(per_example_loss),
                        "feature": model.get_pooled_output()
                    }
                }
            elif output_type == "estimator":
                eval_metric_ops = metric_fn(per_example_loss, logits,
                                            label_ids)

                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
                return estimator_spec
        else:
            raise NotImplementedError()
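The PREDICT branch above switches between sigmoid for multi_label (independent per-class probabilities) and softmax for single_label (a distribution over classes). A numpy sketch of the difference on the same logits:

import numpy as np

logits = np.array([2.0, -1.0, 0.5])

sigmoid = 1.0 / (1.0 + np.exp(-logits))          # multi_label: each in (0, 1)
softmax = np.exp(logits) / np.exp(logits).sum()  # single_label: sums to 1

print(sigmoid)   # ~[0.88 0.27 0.62] -- labels may co-occur
print(softmax)   # ~[0.79 0.04 0.18] -- exactly one label expected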
Exemple #22
0
    def model_fn(features, labels, mode):

        shape_lst_a = bert_utils.get_shape_list(features['input_ids_a'])
        batch_size_a = shape_lst_a[0]
        total_length_a = shape_lst_a[1]

        shape_lst_b = bert_utils.get_shape_list(features['input_ids_b'])
        batch_size_b = shape_lst_b[0]
        total_length_b = shape_lst_b[1]

        features['input_ids_a'] = tf.reshape(features['input_ids_a'],
                                             [-1, model_config.max_length])
        features['segment_ids_a'] = tf.reshape(features['segment_ids_a'],
                                               [-1, model_config.max_length])
        features['input_mask_a'] = tf.cast(
            tf.not_equal(features['input_ids_a'], kargs.get('[PAD]', 0)),
            tf.int64)

        features['input_ids_b'] = tf.reshape(
            features['input_ids_b'],
            [-1, model_config.max_predictions_per_seq])
        features['segment_ids_b'] = tf.reshape(
            features['segment_ids_b'],
            [-1, model_config.max_predictions_per_seq])
        features['input_mask_b'] = tf.cast(
            tf.not_equal(features['input_ids_b'], kargs.get('[PAD]', 0)),
            tf.int64)

        features['batch_size'] = batch_size_a
        features['total_length_a'] = total_length_a
        features['total_length_b'] = total_length_b

        model_dict = {}
        for target in ["a", "b"]:
            model = bert_encoder(model_config,
                                 features,
                                 labels,
                                 mode,
                                 target,
                                 reuse=tf.AUTO_REUSE)
            model_dict[target] = model

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
        else:
            dropout_prob = 0.0

        if model_io_config.fix_lm == True:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        with tf.variable_scope(scope, reuse=model_reuse):
            (loss, per_example_loss, logits,
             transition_params) = multi_position_crf_classifier(
                 model_config, features, model_dict, num_labels, dropout_prob)

        model_io_fn = model_io.ModelIO(model_io_config)

        tvars = model_io_fn.get_params(model_config.scope,
                                       not_storage_params=not_storage_params)

        try:
            params_size = model_io_fn.count_params(model_config.scope)
            print("==total params==", params_size)
        except:
            print("==not count params==")
        print(tvars)
        if load_pretrained == "yes":
            model_io_fn.load_pretrained(tvars,
                                        init_checkpoint,
                                        exclude_scope=exclude_scope)

        if mode == tf.estimator.ModeKeys.TRAIN:

            optimizer_fn = optimizer.Optimizer(opt_config)

            model_io_fn.print_params(tvars, string=", trainable params")
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            print("==update_ops==", update_ops)
            with tf.control_dependencies(update_ops):
                train_op = optimizer_fn.get_train_op(
                    loss, tvars, opt_config.init_lr,
                    opt_config.num_train_steps, **kargs)

            train_op, hooks = model_io_fn.get_ema_hooks(
                train_op,
                tvars,
                kargs.get('params_moving_average_decay', 0.99),
                scope,
                mode,
                first_stage_steps=opt_config.num_warmup_steps,
                two_stage=True)

            model_io_fn.set_saver()

            if kargs.get("task_index", 1) == 0 and kargs.get(
                    "run_config", None):
                training_hooks = []
            elif kargs.get("task_index", 1) == 0:
                model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                      kargs.get("num_storage_steps", 1000))

                training_hooks = model_io_fn.checkpoint_hook
            else:
                training_hooks = []

            if len(optimizer_fn.distributed_hooks) >= 1:
                training_hooks.extend(optimizer_fn.distributed_hooks)
            print(training_hooks, "==training_hooks==", "==task_index==",
                  kargs.get("task_index", 1))

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                train_op=train_op,
                training_hooks=training_hooks)
            print(tf.global_variables(), "==global_variables==")
            if output_type == "sess":
                return {
                    "train": {
                        "loss": loss,
                        "logits": logits,
                        "train_op": train_op
                    },
                    "hooks": training_hooks
                }
            elif output_type == "estimator":
                return estimator_spec

        elif mode == tf.estimator.ModeKeys.PREDICT:
            print(logits.get_shape(), "===logits shape===")

            label_weights = tf.cast(features['label_weights'], tf.int32)
            label_seq_length = tf.reduce_sum(label_weights, axis=-1)

            decode_tags, best_score = tf.contrib.crf.crf_decode(
                logits, transition_params, label_seq_length)

            _, hooks = model_io_fn.get_ema_hooks(
                None, None, kargs.get('params_moving_average_decay', 0.99),
                scope, mode)

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={
                    'decode_tags': decode_tags,
                    "best_score": best_score,
                    "transition_params": transition_params,
                    "logits": logits
                },
                export_outputs={
                    "output":
                    tf.estimator.export.PredictOutput({
                        'decode_tags': decode_tags,
                        "best_score": best_score,
                        "transition_params": transition_params,
                        "logits": logits
                    })
                },
                prediction_hooks=[hooks])
            return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:

            _, hooks = model_io_fn.get_ema_hooks(
                None, None, kargs.get('params_moving_average_decay', 0.99),
                scope, mode)
            eval_hooks = []

            if output_type == "sess":
                return {
                    "eval": {
                        "per_example_loss": per_example_loss,
                        "logits": logits,
                        "loss": tf.reduce_mean(per_example_loss),
                        "feature": model.get_pooled_output()
                    }
                }
            elif output_type == "estimator":

                eval_metric_ops = eval_logtis(logits, features, num_labels,
                                              transition_params)

                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    eval_metric_ops=eval_metric_ops,
                    evaluation_hooks=eval_hooks)
                return estimator_spec
        else:
            raise NotImplementedError()
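Before crf_decode, the PREDICT branch above recovers true sequence lengths from the 0/1 label_weights mask so the CRF ignores padding. The same reduction as a numpy sketch on a toy padded batch:

import numpy as np

# label_weights: 1 at real label positions, 0 at padding (toy batch of 2).
label_weights = np.array([[1, 1, 1, 0, 0],
                          [1, 1, 1, 1, 1]], dtype=np.int32)
label_seq_length = label_weights.sum(axis=-1)  # lengths fed to crf_decode
print(label_seq_length)  # [3 5]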
Exemple #23
0
    def model_fn(features, labels, mode):

        model_api = model_zoo(model_config)

        model = model_api(model_config,
                          features,
                          labels,
                          mode,
                          target,
                          reuse=model_reuse)

        label_ids = features["label_ids"]

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
        else:
            dropout_prob = 0.0

        if model_io_config.fix_lm == True:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        with tf.variable_scope(scope, reuse=model_reuse):
            (loss, per_example_loss,
             logits) = classifier.classifier(model_config,
                                             model.get_pooled_output(),
                                             num_labels, label_ids,
                                             dropout_prob)
            label_loss = tf.reduce_sum(
                per_example_loss * features["label_ratio"]) / (
                    1e-10 + tf.reduce_sum(features["label_ratio"]))

        if mode == tf.estimator.ModeKeys.TRAIN:

            distillation_api = distill.KnowledgeDistillation(
                kargs.get(
                    "disitllation_config",
                    Bunch({
                        "logits_ratio_decay": "constant",
                        "logits_ratio": 0.5,
                        "logits_decay_rate": 0.999,
                        "distillation": ['relation_kd', 'logits'],
                        "feature_ratio": 0.5,
                        "feature_ratio_decay": "constant",
                        "feature_decay_rate": 0.999,
                        "kd_type": "kd",
                        "scope": scope
                    })))
            # get teacher logits
            teacher_logit = tf.log(features["label_probs"] +
                                   1e-10) / kargs.get(
                                       "temperature",
                                       2.0)  # log_softmax logits
            student_logit = tf.nn.log_softmax(
                logits / kargs.get("temperature", 2.0))  # log_softmax logits

            distillation_features = {
                "student_logits_tensor": student_logit,
                "teacher_logits_tensor": teacher_logit,
                "student_feature_tensor": model.get_pooled_output(),
                "teacher_feature_tensor": features["distillation_feature"],
                "student_label": tf.ones_like(label_ids, dtype=tf.int32),
                "teacher_label": tf.zeros_like(label_ids, dtype=tf.int32),
                "logits_ratio": kargs.get("logits_ratio", 0.5),
                "feature_ratio": kargs.get("logits_ratio", 0.5),
                "distillation_ratio": features["distillation_ratio"],
                "src_f_logit": logits,
                "tgt_f_logit": logits,
                "src_tensor": model.get_pooled_output(),
                "tgt_tensor": features["distillation_feature"]
            }

            distillation_loss = distillation_api.distillation(
                distillation_features,
                2,
                dropout_prob,
                model_reuse,
                opt_config.num_train_steps,
                feature_ratio=1.0,
                logits_ratio_decay="constant",
                feature_ratio_decay="constant",
                feature_decay_rate=0.999,
                logits_decay_rate=0.999,
                logits_ratio=0.5,
                scope=scope + "/adv_classifier",
                num_classes=num_labels,
                gamma=kargs.get("gamma", 4))

            loss = label_loss + distillation_loss["distillation_loss"]

        model_io_fn = model_io.ModelIO(model_io_config)

        tvars = model_io_fn.get_params(model_config.scope,
                                       not_storage_params=not_storage_params)
        print(tvars)
        if load_pretrained == "yes":
            model_io_fn.load_pretrained(tvars,
                                        init_checkpoint,
                                        exclude_scope=exclude_scope)

        if mode == tf.estimator.ModeKeys.TRAIN:

            optimizer_fn = optimizer.Optimizer(opt_config)

            model_io_fn.print_params(tvars, string=", trainable params")
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer_fn.get_train_op(
                    loss, tvars, opt_config.init_lr,
                    opt_config.num_train_steps, **kargs)

                model_io_fn.set_saver()

                if kargs.get("task_index", 1) == 0 and kargs.get(
                        "run_config", None):
                    training_hooks = []
                elif kargs.get("task_index", 1) == 0:
                    model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                          kargs.get("num_storage_steps", 1000))

                    training_hooks = model_io_fn.checkpoint_hook
                else:
                    training_hooks = []

                if len(optimizer_fn.distributed_hooks) >= 1:
                    training_hooks.extend(optimizer_fn.distributed_hooks)
                print(training_hooks, "==training_hooks==", "==task_index==",
                      kargs.get("task_index", 1))

                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    training_hooks=training_hooks)
                if output_type == "sess":

                    return {
                        "train": {
                            "loss": loss,
                            "logits": logits,
                            "train_op": train_op,
                            "cross_entropy": label_loss,
                            "distillation_loss": distillation_loss["distillation_loss"],
                            "kd_num": tf.reduce_sum(features["distillation_ratio"]),
                            "ce_num": tf.reduce_sum(features["label_ratio"]),
                            "label_ratio": features["label_ratio"],
                            "distilaltion_logits_loss": distillation_loss["distillation_logits_loss"],
                            "distilaltion_feature_loss": distillation_loss["distillation_feature_loss"],
                            "rkd_loss": distillation_loss["rkd_loss"]
                        },
                        "hooks": training_hooks
                    }
                elif output_type == "estimator":
                    return estimator_spec

        elif mode == tf.estimator.ModeKeys.PREDICT:
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            max_prob = tf.reduce_max(prob, axis=-1)

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={
                    'pred_label': pred_label,
                    "max_prob": max_prob
                },
                export_outputs={
                    "output":
                    tf.estimator.export.PredictOutput({
                        'pred_label': pred_label,
                        "max_prob": max_prob
                    })
                })
            return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, logits, label_ids):
                """Computes the loss and accuracy of the model."""
                sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
                sentence_predictions = tf.argmax(logits,
                                                 axis=-1,
                                                 output_type=tf.int32)
                sentence_labels = tf.reshape(label_ids, [-1])
                sentence_accuracy = tf.metrics.accuracy(
                    labels=label_ids, predictions=sentence_predictions)
                sentence_mean_loss = tf.metrics.mean(values=per_example_loss)
                sentence_f = tf_metrics.f1(label_ids,
                                           sentence_predictions,
                                           num_labels,
                                           label_lst,
                                           average="macro")

                eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy}

                return eval_metric_ops

            eval_metric_ops = metric_fn(per_example_loss, logits, label_ids)

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

            if output_type == "sess":
                return {
                    "eval": {
                        "per_example_loss": per_example_loss,
                        "logits": logits,
                        "loss": tf.reduce_mean(per_example_loss)
                    }
                }
            elif output_type == "estimator":
                return estimator_spec
        else:
            raise NotImplementedError()
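The distillation branch above builds temperature-scaled log-probabilities: the teacher side from its soft labels via log(probs + 1e-10) / T, the student side via log_softmax(logits / T). A numpy sketch of that scaling with T=2.0 as in the example (the toy values are made up):

import numpy as np

def log_softmax(x):
    x = x - x.max(axis=-1, keepdims=True)
    return x - np.log(np.exp(x).sum(axis=-1, keepdims=True))

temperature = 2.0
teacher_probs = np.array([[0.7, 0.2, 0.1]])    # soft labels from the teacher
student_logits = np.array([[2.0, 0.5, -1.0]])

teacher_logit = np.log(teacher_probs + 1e-10) / temperature
student_logit = log_softmax(student_logits / temperature)
print(teacher_logit)
print(student_logit)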
Exemple #24
0
def main(_):

	hvd.init()

	sess_config = tf.ConfigProto()
	sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

	graph = tf.Graph()
	with graph.as_default():
		import json
				
		config = json.load(open(FLAGS.config_file, "r"))
		init_checkpoint = FLAGS.init_checkpoint

		config = Bunch(config)
		config.use_one_hot_embeddings = True
		config.scope = "bert"
		config.dropout_prob = 0.1
		config.label_type = "single_label"
		
		if FLAGS.if_shard == "0":
			train_size = FLAGS.train_size
			epoch = int(FLAGS.epoch / hvd.size())
		elif FLAGS.if_shard == "1":
			train_size = int(FLAGS.train_size/hvd.size())
			epoch = FLAGS.epoch

		tokenizer = tokenization.FullTokenizer(
			vocab_file=FLAGS.vocab_file,
			do_lower_case=FLAGS.lower_case)

		classifier_data_api = classifier_processor.EvaluationProcessor()
		classifier_data_api.get_labels(FLAGS.label_id)

		train_examples = classifier_data_api.get_train_examples(FLAGS.train_file)

		write_to_tfrecords.convert_classifier_examples_to_features(train_examples,
																classifier_data_api.label2id,
																FLAGS.max_length,
																tokenizer,
																FLAGS.eval_data_file)

		init_lr = 2e-5

		num_train_steps = int(
			train_size / FLAGS.batch_size * epoch)
		num_warmup_steps = int(num_train_steps * 0.1)

		num_storage_steps = int(train_size / FLAGS.batch_size)

		print(" model type {}".format(FLAGS.model_type))

		print(num_train_steps, num_warmup_steps, "=============")
		
		opt_config = Bunch({"init_lr":init_lr/hvd.size(), 
							"num_train_steps":num_train_steps,
							"num_warmup_steps":num_warmup_steps})

		sess = tf.Session(config=sess_config)

		model_io_config = Bunch({"fix_lm":False})
		
		model_io_fn = model_io.ModelIO(model_io_config)

		optimizer_fn = optimizer.Optimizer(opt_config)
		
		num_classes = FLAGS.num_classes
		
		model_eval_fn = bert_classifier.classifier_model_fn_builder(config, num_classes, init_checkpoint, 
												reuse=False, 
												load_pretrained=True,
												model_io_fn=model_io_fn,
												optimizer_fn=optimizer_fn,
												model_io_config=model_io_config, 
												opt_config=opt_config)
		
		def metric_fn(features, logits, loss):
			print(logits.get_shape(), "===logits shape===")
			pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
			prob = tf.nn.softmax(logits)
			correct = tf.equal(
				tf.cast(pred_label, tf.int32),
				tf.cast(features["label_ids"], tf.int32)
			)
			accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
			return {"accuracy":accuracy, "loss":loss, "pred_label":pred_label, 
				"label_ids":features["label_ids"],
				"prob":prob}
		
		name_to_features = {
				"input_ids":
						tf.FixedLenFeature([FLAGS.max_length], tf.int64),
				"input_mask":
						tf.FixedLenFeature([FLAGS.max_length], tf.int64),
				"segment_ids":
						tf.FixedLenFeature([FLAGS.max_length], tf.int64),
				"label_ids":
						tf.FixedLenFeature([], tf.int64),
		}

		def _decode_record(record, name_to_features):
			"""Decodes a record to a TensorFlow example.
			"""
			example = tf.parse_single_example(record, name_to_features)

			# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
			# So cast all int64 to int32.
			for name in list(example.keys()):
				t = example[name]
				if t.dtype == tf.int64:
					t = tf.to_int32(t)
				example[name] = t
			return example 

		params = Bunch({})
		params.epoch = FLAGS.epoch
		params.batch_size = FLAGS.batch_size

		eval_features = tf_data_utils.eval_input_fn(FLAGS.eval_data_file,
									_decode_record, name_to_features, params, if_shard=FLAGS.if_shard)
		
		[_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
		result = metric_fn(eval_features, eval_logits, eval_loss)
		
		init_op = tf.group(tf.global_variables_initializer(), 
					tf.local_variables_initializer())
		sess.run(init_op)

		sess.run(hvd.broadcast_global_variables(0))

		print("===horovod rank==={}".format(hvd.rank()))
		
		def eval_fn(result):
			i = 0
			total_accuracy = 0
			label, label_id, prob = [], [], []
			while True:
				try:
					eval_result = sess.run(result)
					total_accuracy += eval_result["accuracy"]
					label_id.extend(eval_result["label_ids"])
					label.extend(eval_result["pred_label"])
					prob.extend(eval_result["prob"])
					i += 1
				except tf.errors.OutOfRangeError:
					print("End of dataset")
					break
			macro_f1 = f1_score(label_id, label, average="macro")
			micro_f1 = f1_score(label_id, label, average="micro")
			macro_precision = precision_score(label_id, label, average="macro")
			micro_precision = precision_score(label_id, label, average="micro")
			macro_recall = recall_score(label_id, label, average="macro")
			micro_recall = recall_score(label_id, label, average="micro")
			accuracy = accuracy_score(label_id, label)
			print("test accuracy {} macro_f1 score {} micro_f1 {} accuracy {}".format(total_accuracy/ i, 
																					macro_f1,  micro_f1, accuracy))
			return total_accuracy/ i, label_id, label, prob
		
		import time
		start = time.time()
		
		acc, true_label, pred_label, prob = eval_fn(result)
		end = time.time()
		print("==total time {} numbers of devices {}".format(end - start, hvd.size()))
		if hvd.rank() == 0:
			import _pickle as pkl
			pkl.dump({"true_label":true_label, 
						"pred_label":pred_label,
						"prob":prob}, 
						open(FLAGS.model_output+"/predict.pkl", "wb"))
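The eval_fn above drains the dataset with repeated sess.run calls until tf.errors.OutOfRangeError, then scores the accumulated predictions with scikit-learn. The scoring step in isolation, as a runnable sketch on toy labels:

from sklearn.metrics import accuracy_score, f1_score

# Toy stand-ins for the label_id / label lists collected by eval_fn.
true_label = [0, 1, 2, 1, 0]
pred_label = [0, 1, 1, 1, 0]

print("acc", accuracy_score(true_label, pred_label))
print("macro_f1", f1_score(true_label, pred_label, average="macro"))
print("micro_f1", f1_score(true_label, pred_label, average="micro"))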
Exemple #25
0
	def model_fn(features, labels, mode):

		model = gpt_encoder(model_config, features, labels, 
			mode, target, reuse=tf.AUTO_REUSE)
		scope = model_config.scope

		if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
			# batch x seq_length
			sequence_mask = tf.to_float(tf.not_equal(features['input_ids'][:, 1:], 
													kargs.get('[PAD]', 0)))

			# batch x seq_length
			seq_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
						labels=features['input_ids'][:, 1:], 
						logits=model.get_sequence_output_logits()[:, :-1])

			per_example_loss = tf.reduce_sum(seq_loss*sequence_mask, axis=-1) / (tf.reduce_sum(sequence_mask, axis=-1)+1e-10)
			loss = tf.reduce_mean(per_example_loss)

		model_io_fn = model_io.ModelIO(model_io_config)

		tvars = model_io_fn.get_params(model_config.scope, 
										not_storage_params=not_storage_params)

		try:
			params_size = model_io_fn.count_params(model_config.scope)
			print("==total params==", params_size)
		except:
			print("==not count params==")
		print(tvars)
		if load_pretrained == "yes":
			model_io_fn.load_pretrained(tvars, 
										init_checkpoint,
										exclude_scope=exclude_scope)

		if mode == tf.estimator.ModeKeys.TRAIN:

			optimizer_fn = optimizer.Optimizer(opt_config)

			model_io_fn.print_params(tvars, string=", trainable params")
			update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
			print("==update_ops==", update_ops)
			with tf.control_dependencies(update_ops):
				train_op = optimizer_fn.get_train_op(loss, tvars, 
								opt_config.init_lr, 
								opt_config.num_train_steps,
								**kargs)

				model_io_fn.set_saver()

				if kargs.get("task_index", 1) == 0 and kargs.get("run_config", None):
					training_hooks = []
				elif kargs.get("task_index", 1) == 0:
					model_io_fn.get_hooks(kargs.get("checkpoint_dir", None), 
														kargs.get("num_storage_steps", 1000))

					training_hooks = model_io_fn.checkpoint_hook
				else:
					training_hooks = []

				if len(optimizer_fn.distributed_hooks) >= 1:
					training_hooks.extend(optimizer_fn.distributed_hooks)
				print(training_hooks, "==training_hooks==", "==task_index==", kargs.get("task_index", 1))

				train_metric_dict = train_metric(features['input_ids'], 
												model.get_sequence_output_logits(), 
												**kargs)

				for key in train_metric_dict:
					tf.summary.scalar(key, train_metric_dict[key])
				tf.summary.scalar('learning_rate', optimizer_fn.learning_rate)
				tf.summary.scalar('seq_length', tf.reduce_mean(tf.reduce_sum(sequence_mask, axis=-1)))

				estimator_spec = tf.estimator.EstimatorSpec(mode=mode, 
								loss=loss, train_op=train_op,
								training_hooks=training_hooks)
				print(tf.global_variables(), "==global_variables==")
				if output_type == "sess":
					return {
						"train":{
							"loss":loss,
							"logits":model.get_sequence_output_logits(),
							"train_op":train_op
						},
						"hooks":training_hooks
					}
				elif output_type == "estimator":
					return estimator_spec

		elif mode == tf.estimator.ModeKeys.PREDICT:
			if kargs.get('predict_type', 'sample_sequence') == 'sample_sequence':
				results = sample.sample_sequence(
							gpt_encoder, hparams=model_config, 
							length=kargs.get('max_length', 64), 
							start_token=None, 
							batch_size=10, 
							context=features['input_ids'],
							temperature=2,
							top_k=10)
				
				sampled_token = results['tokens'][:, 1:]
				sampled_token_logits = results['logits'][:, 1:]

				estimator_spec = tf.estimator.EstimatorSpec(
									mode=mode,
									predictions={
												'token':sampled_token,
												"logits":sampled_token_logits
									},
									export_outputs={
										"output":tf.estimator.export.PredictOutput(
													{
														'token':sampled_token,
														"logits":sampled_token_logits
													}
												)
									}
						)

				return estimator_spec

			elif kargs.get('predict_type', 'sample_sequence') == 'infer_inputs':
				sequence_mask = tf.to_float(tf.not_equal(features['input_ids'][:, 1:], 
													kargs.get('[PAD]', 0)))
				output_logits = model.get_sequence_output_logits()[:, :-1]
				# output_logits = tf.nn.log_softmax(output_logits, axis=-1)

				output_id_logits = tf.nn.sparse_softmax_cross_entropy_with_logits(
										labels=features['input_ids'][:, 1:], 
										logits=output_logits)

				per_example_perplexity = tf.reduce_sum(output_id_logits * sequence_mask, 
												axis=-1) # batch
				per_example_perplexity /= tf.reduce_sum(sequence_mask, axis=-1) # batch

				perplexity = tf.exp(per_example_perplexity)

				estimator_spec = tf.estimator.EstimatorSpec(
									mode=mode,
									predictions={
												'token':features['input_ids'][:, 1:],
												"logits":output_id_logits,
												'perplexity':perplexity
									},
									export_outputs={
										"output":tf.estimator.export.PredictOutput(
													{
														'token':features['input_ids'][:,1:],
														"logits":output_id_logits,
														'perplexity':perplexity
													}
												)
									}
						)

				return estimator_spec

		elif mode == tf.estimator.ModeKeys.EVAL:
			def metric_fn(per_example_loss,
						logits, 
						label_ids):
				"""Computes the loss and accuracy of the model."""
				sentence_predictions = tf.argmax(
					logits, axis=-1, output_type=tf.int32)
				sentence_accuracy = tf.metrics.accuracy(
					labels=label_ids, predictions=sentence_predictions)
				sentence_mean_loss = tf.metrics.mean(
					values=per_example_loss)
				sentence_f = tf_metrics.f1(label_ids, 
										sentence_predictions, 
										num_labels, 
										label_lst, average="macro")

				eval_metric_ops = {
									"f1": sentence_f,
									"acc":sentence_accuracy
								}

				return eval_metric_ops

			if output_type == "sess":
				return {
					"eval":{
							"per_example_loss":per_example_loss,
							"logits":logits,
							"loss":tf.reduce_mean(per_example_loss),
							"feature":model.get_pooled_output()
						}
				}
			elif output_type == "estimator":
				eval_metric_ops = metric_fn( 
							per_example_loss,
							logits, 
							label_ids)
			
				estimator_spec = tf.estimator.EstimatorSpec(mode=mode, 
								loss=loss,
								eval_metric_ops=eval_metric_ops)
				return estimator_spec
		else:
			raise NotImplementedError()

def main(_):

	graph = tf.Graph()
	# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
	with graph.as_default():
		import json

		tokenizer = tokenization.FullTokenizer(
			vocab_file=FLAGS.vocab_file,
			do_lower_case=True)

		classifier_data_api = classifier_processor.PiarInteractionProcessor()

		eval_examples = classifier_data_api.get_test_examples(FLAGS.eval_data_file,
															FLAGS.lang)

		print(eval_examples[0].guid)

		label_tensor = None

		label_id = json.load(open(FLAGS.label_id, "r"))

		num_choice = 3

		write_to_tfrecords.convert_interaction_classifier_examples_to_features_v1(
			eval_examples,
			label_id["label2id"],
			FLAGS.max_length,
			tokenizer,
			FLAGS.output_file)

		config = json.load(open(FLAGS.config_file, "r"))
		init_checkpoint = FLAGS.init_checkpoint

		max_seq_length = FLAGS.max_length * 2 + 3

		print("===init checkoutpoint==={}".format(init_checkpoint))

		config = Bunch(config)
		config.use_one_hot_embeddings = True
		config.scope = "esim/bert"
		config.dropout_prob = 0.2
		config.label_type = "single_label"
		config.lstm_dim = 128
		config.num_heads = 12
		config.num_units = 768
		
		# os.environ["CUDA_VISIBLE_DEVICES"] = "2"
		sess = tf.Session()
		
		opt_config = Bunch({"init_lr":(5e-5), 
							"num_train_steps":0,
							"num_warmup_steps":0,
							"train_op":"adam"})
		model_io_config = Bunch({"fix_lm":False})
		
		model_io_fn = model_io.ModelIO(model_io_config)

		model_function = bert_esim.classifier_attn_model_fn_builder
		model_eval_fn = model_function(
									config, 
									num_choice, 
									init_checkpoint, 
									model_reuse=None, 
									load_pretrained=True,
									model_io_fn=model_io_fn,
									model_io_config=model_io_config, 
									opt_config=opt_config,
									input_name=["a", "b"],
									label_tensor=label_tensor,
									not_storage_params=["adam", "adam_1"],
									exclude_scope_dict={"task":"esim"})
		
		def metric_fn(features, logits):
			print(logits.get_shape(), "===logits shape===")
			pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
			prob = tf.nn.softmax(logits)
			return {"pred_label":pred_label, 
					"qas_id":features["qas_id"],
					"prob":prob}
		
		name_to_features = {
				"input_ids_a":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_mask_a":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"segment_ids_a":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_ids_b":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_mask_b":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"segment_ids_b":
						tf.FixedLenFeature([max_seq_length], tf.int64),
				"label_ids":
						tf.FixedLenFeature([], tf.int64),
				"qas_id":
						tf.FixedLenFeature([], tf.int64),
		}
		
		def _decode_record(record, name_to_features):
			"""Decodes a record to a TensorFlow example.
			"""
			example = tf.parse_single_example(record, name_to_features)

			# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
			# So cast all int64 to int32.
			for name in list(example.keys()):
				t = example[name]
				if t.dtype == tf.int64:
					t = tf.to_int32(t)
				example[name] = t
			
			return example 

		params = Bunch({})
		params.epoch = 2
		params.batch_size = 32

		eval_features = tf_data_utils.eval_input_fn(FLAGS.output_file,
									_decode_record, name_to_features, params)
		
		[_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
		result = metric_fn(eval_features, eval_logits)
		
		model_io_fn.set_saver()
		
		init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
		sess.run(init_op)

		model_io_fn.load_model(sess, init_checkpoint)
		print(" ==succeeded in loading model== ")

		def eval_fn(result):
			i = 0
			pred_label, qas_id, prob = [], [], []
			while True:
				try:
					eval_result = sess.run(result)
					pred_label.extend(eval_result["pred_label"].tolist())
					qas_id.extend(eval_result["qas_id"].tolist())
					prob.extend(eval_result["prob"].tolist())
					i += 1
				except tf.errors.OutOfRangeError:
					print("End of dataset")
					break
			return pred_label, qas_id, prob
		
		print("===========begin to eval============")
		[pred_label, qas_id, prob] = eval_fn(result)
		result = dict(zip(qas_id, pred_label))

		print(FLAGS.result_file.split("."))
		tmp_output = FLAGS.result_file.split(".")[0] + ".json"
		print(tmp_output, "===temp output===")
		json.dump({"id":qas_id,
					"label":pred_label,
					"prob":prob},
					open(tmp_output, "w"))

		print(len(result), "=====valid result======")

		import pandas as pd
		df = pd.read_csv(FLAGS.eval_data_file)

		output = {}
		for index in range(df.shape[0]):
			output[df.loc[index]["id"]] = ""

		final_output = []

		cnt = 0
		for key in output:
			if key in result:
				final_output.append({"Id":key, 
					"Category":label_id["id2label"][str(result[key])]})
				cnt += 1
			else:
				final_output.append({"Id":key, "Category":"unrelated"})
		
		df_out = pd.DataFrame(final_output)
		df_out.to_csv(FLAGS.result_file)

		print(len(output), cnt, len(final_output), "======num of results from model==========")
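
The `tf_data_utils.eval_input_fn` helper used above is repo-specific. As a rough sketch of what such a TF 1.x helper typically does — read the TFRecord file, decode each record, batch, and return a tensor dict whose iterator raises `tf.errors.OutOfRangeError` when exhausted (which is exactly what `eval_fn` relies on to stop) — assuming the `_decode_record` and `params` defined above:

import tensorflow as tf

def simple_eval_input_fn(input_file, decode_fn, name_to_features, params):
    """Hypothetical stand-in for tf_data_utils.eval_input_fn."""
    d = tf.data.TFRecordDataset(input_file)
    # No shuffle or repeat for evaluation: a single deterministic pass.
    d = d.map(lambda record: decode_fn(record, name_to_features))
    d = d.batch(params.batch_size)
    # The one-shot iterator signals exhaustion via OutOfRangeError.
    return d.make_one_shot_iterator().get_next()
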
Exemple #27
0
def main(_):

    hvd.init()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json

        # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))

        config = json.load(open(FLAGS.config_file, "r"))

        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkoutpoint==={}".format(init_checkpoint))

        # init_checkpoint = "/data/xuht/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
        # init_checkpoint = "/data/xuht/concat/model_1/oqmrc.ckpt"
        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "esim/bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        config.lstm_dim = 128
        config.num_heads = 4

        import json
        label_dict = json.load(open(FLAGS.label_id))

        # label_tensor = np.asarray(label_dict["class_ratio"]).astype(np.float32)
        label_tensor = None
        # config.loss = "focal_loss"

        json.dump(config, open(FLAGS.model_output + "/config.json", "w"))

        # os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session(config=sess_config)

        train_size = int(FLAGS.train_size / hvd.size())

        num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.01)

        num_storage_steps = int(train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": (5e-5 / hvd.size()),
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "train_op": "adam"
        })

        model_io_config = Bunch({"fix_lm": True})

        model_io_fn = model_io.ModelIO(model_io_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        if FLAGS.model_type == "original":
            model_function = bert_order_classifier.classifier_model_fn_builder
        elif FLAGS.model_type == "attn":
            model_function = bert_order_classifier.classifier_attn_model_fn_builder
        elif FLAGS.model_type == "orignal_nonlinear":
            model_function = bert_order_classifier.classifier_model_fn_builder_v1
        elif FLAGS.model_type == "esim_bert":
            model_function = esim_bert.classifier_attn_model_fn_builder

        model_eval_fn = model_function(config,
                                       num_choice,
                                       init_checkpoint,
                                       model_reuse=None,
                                       load_pretrained=True,
                                       model_io_fn=model_io_fn,
                                       model_io_config=model_io_config,
                                       opt_config=opt_config,
                                       input_name=["a", "b"],
                                       label_tensor=label_tensor,
                                       not_storage_params=["adam", "adam_1"],
                                       exclude_scope_dict={"task": "esim"})

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {
                "accuracy": accuracy,
                "loss": loss,
                "pred_label": pred_label,
                "label_ids": features["label_ids"]
            }

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
            """
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size
        # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
        #                             _decode_record, name_to_features, params)
        # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
        #                             _decode_record, name_to_features, params)

        # train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
        #                             _decode_record, name_to_features, params)
        eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
                                                    _decode_record,
                                                    name_to_features, params)

        # [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN)
        [_, eval_loss, eval_per_example_loss,
         eval_logits] = model_eval_fn(eval_features, [],
                                      tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)

        model_io_fn.set_saver()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        model_io_fn.load_model(sess, init_checkpoint)
        print(" ==succeeded in loading model== ")

        sess.run(hvd.broadcast_global_variables(0))

        def eval_fn(result):
            i = 0
            total_accuracy = 0
            label, label_id = [], []
            # label_weight = []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    # for item in eval_result["label_ids"]:
                    #     label_weight.append(label_tensor[item])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            # f1 = f1_score(label_id, label, average="macro", sample_weight=label_weight)
            # accuracy = accuracy_score(label_id, label, sample_weight=label_weight)
            f1 = f1_score(label_id, label, average="macro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy accuracy {} {} f1 {}".format(
                total_accuracy / i, accuracy, f1))
            return total_accuracy / i, f1

        # print("===========begin to train============")
        # train_fn(train_op, train_loss)
        print("===========begin to eval============")
        accuracy, f1 = eval_fn(result)
        print("==accuracy {} f1 {}==".format(accuracy, f1))
Exemple #28
0
    def model_fn(features, labels, mode):

        task_type = 'all_neg'
        num_task = kargs.get('num_task', 1)

        model_io_fn = model_io.ModelIO(model_io_config)

        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_prob = model_config.dropout_prob
            is_training = True
        else:
            dropout_prob = 0.0
            is_training = False

        if model_io_config.fix_lm == True:
            scope = model_config.scope + "_finetuning"
        else:
            scope = model_config.scope

        if kargs.get("get_pooled_output", "pooled_output") == "pooled_output":
            pooled_feature = model.get_pooled_output()
        elif kargs.get("get_pooled_output", "task_output") == "task_output":
            pooled_feature_dict = model.get_task_output()
            pooled_feature = pooled_feature_dict['pooled_feature']

        if kargs.get('apply_head_proj', False):
            with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
                feature_a = simclr_utils.projection_head(
                    pooled_feature_dict['feature_a'],
                    is_training,
                    head_proj_dim=128,
                    num_nlh_layers=1,
                    head_proj_mode='nonlinear',
                    name='head_contrastive')
                pooled_feature_dict['feature_a'] = feature_a

            with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
                feature_b = simclr_utils.projection_head(
                    pooled_feature_dict['feature_b'],
                    is_training,
                    head_proj_dim=128,
                    num_nlh_layers=1,
                    head_proj_mode='nonlinear',
                    name='head_contrastive')
                pooled_feature_dict['feature_b'] = feature_b
            tf.logging.info(
                "****** apply contrastive feature projection *******")

        shape_list = bert_utils.get_shape_list(pooled_feature,
                                               expected_rank=[2])
        batch_size = shape_list[0]

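        # In-batch negative sampling: shuffling the batch indices pairs each
        # feature_a with a (most likely) mismatched feature_b; loss_mask below
        # zeroes out the accidental cases where the shuffled input_ids_b is
        # identical to input_ids_a.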
        random_batch = tf.random_shuffle(tf.range(batch_size))

        input_a = features['input_ids_a']
        input_b = tf.gather(features['input_ids_b'], random_batch)

        not_equal = tf.cast(tf.not_equal(input_a, input_b), tf.int32)
        not_equal = tf.reduce_sum(not_equal, axis=-1)
        loss_mask = tf.cast(tf.not_equal(not_equal, tf.zeros_like(not_equal)),
                            tf.float32)

        feat_a = pooled_feature_dict['feature_a']
        feat_b = tf.gather(pooled_feature_dict['feature_b'], random_batch)

        pooled_feature_dict['feature_b'] = feat_b

        label_ids = tf.zeros_like(loss_mask)

        loss = tf.constant(0.0)

        params_size = model_io_fn.count_params(model_config.scope)
        print("==total encoder params==", params_size)

        if kargs.get("feature_distillation", True):
            universal_feature_a = features.get("input_ids_a_features", None)
            universal_feature_b = features.get("input_ids_b_features", None)

            if universal_feature_a is None or universal_feature_b is None:
                tf.logging.info(
                    "****** not apply feature distillation *******")
                feature_loss = tf.constant(0.0)
            else:
                feature_a = pooled_feature_dict['feature_a']
                feature_a_shape = bert_utils.get_shape_list(
                    feature_a, expected_rank=[2, 3])
                pretrain_feature_a_shape = bert_utils.get_shape_list(
                    universal_feature_a, expected_rank=[2, 3])
                if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                    with tf.variable_scope(scope + "/feature_proj",
                                           reuse=tf.AUTO_REUSE):
                        proj_feature_a = tf.layers.dense(
                            feature_a, pretrain_feature_a_shape[-1])
                    # with tf.variable_scope(scope+"/feature_rec", reuse=tf.AUTO_REUSE):
                    # 	proj_feature_a_rec = tf.layers.dense(proj_feature_a, feature_a_shape[-1])
                    # loss += tf.reduce_mean(tf.reduce_sum(tf.square(proj_feature_a_rec-feature_a), axis=-1))/float(num_task)
                    tf.logging.info(
                        "****** apply auto-encoder for feature compression *******"
                    )
                else:
                    proj_feature_a = feature_a
                feature_a_norm = tf.stop_gradient(
                    tf.sqrt(
                        tf.reduce_sum(tf.pow(proj_feature_a, 2),
                                      axis=-1,
                                      keepdims=True)) + 1e-20)
                proj_feature_a /= feature_a_norm

                feature_b = pooled_feature_dict['feature_b']
                if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                    with tf.variable_scope(scope + "/feature_proj",
                                           reuse=tf.AUTO_REUSE):
                        proj_feature_b = tf.layers.dense(
                            feature_b, pretrain_feature_a_shape[-1])
                    # with tf.variable_scope(scope+"/feature_rec", reuse=tf.AUTO_REUSE):
                    # 	proj_feature_b_rec = tf.layers.dense(proj_feature_b, feature_a_shape[-1])
                    # loss += tf.reduce_mean(tf.reduce_sum(tf.square(proj_feature_b_rec-feature_b), axis=-1))/float(num_task)
                    tf.logging.info(
                        "****** apply auto-encoder for feature compression *******"
                    )
                else:
                    proj_feature_b = feature_b

                feature_b_norm = tf.stop_gradient(
                    tf.sqrt(
                        tf.reduce_sum(tf.pow(proj_feature_b, 2),
                                      axis=-1,
                                      keepdims=True)) + 1e-20)
                proj_feature_b /= feature_b_norm

                feature_a_distillation = tf.reduce_mean(
                    tf.square(universal_feature_a - proj_feature_a), axis=-1)
                feature_b_distillation = tf.reduce_mean(
                    tf.square(universal_feature_b - proj_feature_b), axis=-1)

                feature_loss = tf.reduce_mean(
                    (feature_a_distillation + feature_b_distillation) /
                    2.0) / float(num_task)
                loss += feature_loss
                tf.logging.info(
                    "****** apply pretrained feature distillation *******")

        if kargs.get("embedding_distillation", True):
            word_embed = model.emb_mat
            random_embed_shape = bert_utils.get_shape_list(
                word_embed, expected_rank=[2, 3])
            print("==random_embed_shape==", random_embed_shape)
            pretrained_embed = kargs.get('pretrained_embed', None)
            if pretrained_embed is None:
                tf.logging.info(
                    "****** not apply pretrained embedding distillation *******")
                embed_loss = tf.constant(0.0)
            else:
                pretrain_embed_shape = bert_utils.get_shape_list(
                    pretrained_embed, expected_rank=[2, 3])
                print("==pretrain_embed_shape==", pretrain_embed_shape)
                if random_embed_shape[-1] != pretrain_embed_shape[-1]:
                    with tf.variable_scope(scope + "/embedding_proj",
                                           reuse=tf.AUTO_REUSE):
                        proj_embed = tf.layers.dense(word_embed,
                                                     pretrain_embed_shape[-1])
                else:
                    proj_embed = word_embed

                embed_loss = tf.reduce_mean(
                    tf.reduce_mean(tf.square(proj_embed - pretrained_embed),
                                   axis=-1)) / float(num_task)
                loss += embed_loss
                tf.logging.info(
                    "****** apply pretrained embedding distillation *******")

        if kargs.get('loss', 'contrastive_loss') == 'contrastive_loss':
            feature_a = tf.nn.l2_normalize(pooled_feature_dict['feature_a'],
                                           axis=-1)
            feature_b = tf.nn.l2_normalize(pooled_feature_dict['feature_b'],
                                           axis=-1)
            per_example_loss, logits = loss_utils.contrastive_loss(
                label_ids, feature_a, feature_b, kargs.get('margin', 1.0))
            tf.logging.info("****** contrastive_loss *******")
        elif kargs.get(
                'loss',
                'contrastive_loss') == 'exponent_neg_manhattan_distance_mse':
            feature_a = tf.nn.l2_normalize(pooled_feature_dict['feature_a'],
                                           axis=-1)
            feature_b = tf.nn.l2_normalize(pooled_feature_dict['feature_b'],
                                           axis=-1)
            per_example_loss, logits = loss_utils.exponent_neg_manhattan_distance(
                label_ids, feature_a, feature_b, 'mse')
            tf.logging.info(
                "****** exponent_neg_manhattan_distance_mse *******")
        else:
            feature_a = tf.nn.l2_normalize(pooled_feature_dict['feature_a'],
                                           axis=-1)
            feature_b = tf.nn.l2_normalize(pooled_feature_dict['feature_b'],
                                           axis=-1)
            per_example_loss, logits = loss_utils.contrastive_loss(
                label_ids, feature_a, feature_b, kargs.get('margin', 1.0))
            tf.logging.info("****** contrastive_loss *******")

        masked_per_example_loss = per_example_loss * loss_mask
        task_loss = tf.reduce_sum(masked_per_example_loss) / (
            1e-10 + tf.reduce_sum(loss_mask))
        loss += task_loss / float(num_task)

        if mode == tf.estimator.ModeKeys.TRAIN:
            multi_task_config = kargs.get("multi_task_config", {})
            if multi_task_config.get(task_type,
                                     {}).get("lm_augumentation", False):
                print("==apply lm_augumentation==")
                masked_lm_positions = features["masked_lm_positions"]
                masked_lm_ids = features["masked_lm_ids"]
                masked_lm_weights = features["masked_lm_weights"]
                (masked_lm_loss, masked_lm_example_loss,
                 masked_lm_log_probs) = pretrain.get_masked_lm_output(
                     model_config,
                     model.get_sequence_output(),
                     model.get_embedding_table(),
                     masked_lm_positions,
                     masked_lm_ids,
                     masked_lm_weights,
                     reuse=model_reuse)

                masked_lm_loss_mask = tf.expand_dims(loss_mask, -1) * tf.ones(
                    (1,
                     multi_task_config[task_type]["max_predictions_per_seq"]))
                masked_lm_loss_mask = tf.reshape(masked_lm_loss_mask, (-1, ))

                masked_lm_label_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_loss_mask *= tf.cast(masked_lm_label_weights,
                                               tf.float32)

                masked_lm_example_loss *= masked_lm_loss_mask  # multiply task_mask
                masked_lm_loss = tf.reduce_sum(masked_lm_example_loss) / (
                    1e-10 + tf.reduce_sum(masked_lm_loss_mask))
                loss += multi_task_config[task_type][
                    "masked_lm_loss_ratio"] * masked_lm_loss

                masked_lm_label_ids = tf.reshape(masked_lm_ids, [-1])

                print(masked_lm_log_probs.get_shape(),
                      "===masked lm log probs===")
                print(masked_lm_label_ids.get_shape(), "===masked lm ids===")
                print(masked_lm_label_weights.get_shape(),
                      "===masked lm mask===")

                lm_acc = build_accuracy(masked_lm_log_probs,
                                        masked_lm_label_ids,
                                        masked_lm_loss_mask)

        if kargs.get("task_invariant", "no") == "yes":
            print("==apply task adversarial training==")
            with tf.variable_scope(scope + "/dann_task_invariant",
                                   reuse=model_reuse):
                (_, task_example_loss,
                 task_logits) = distillation_utils.feature_distillation(
                     model.get_pooled_output(), 1.0, features["task_id"],
                     kargs.get("num_task", 7), dropout_prob, True)
                masked_task_example_loss = loss_mask * task_example_loss
                masked_task_loss = tf.reduce_sum(masked_task_example_loss) / (
                    1e-10 + tf.reduce_sum(loss_mask))
                loss += kargs.get("task_adversarial", 1e-2) * masked_task_loss

        tvars = model_io_fn.get_params(model_config.scope,
                                       not_storage_params=not_storage_params)

        if mode == tf.estimator.ModeKeys.TRAIN:
            multi_task_config = kargs.get("multi_task_config", {})
            if multi_task_config.get(task_type,
                                     {}).get("lm_augumentation", False):
                print("==apply lm_augumentation==")
                masked_lm_pretrain_tvars = model_io_fn.get_params(
                    "cls/predictions", not_storage_params=not_storage_params)
                tvars.extend(masked_lm_pretrain_tvars)

        try:
            params_size = model_io_fn.count_params(model_config.scope)
            print("==total params==", params_size)
        except:
            print("==not count params==")
        # print(tvars)
        if load_pretrained == "yes":
            model_io_fn.load_pretrained(tvars,
                                        init_checkpoint,
                                        exclude_scope=exclude_scope)

        if mode == tf.estimator.ModeKeys.TRAIN:

            acc = build_accuracy(logits,
                                 label_ids,
                                 loss_mask,
                                 loss_type=kargs.get('loss',
                                                     'contrastive_loss'))

            return_dict = {
                "loss": loss,
                "logits": logits,
                "task_num": tf.reduce_sum(loss_mask),
                "tvars": tvars
            }
            return_dict["{}_acc".format(task_type)] = acc
            if kargs.get("task_invariant", "no") == "yes":
                return_dict["{}_task_loss".format(
                    task_type)] = masked_task_loss
                task_acc = build_accuracy(task_logits, features["task_id"],
                                          loss_mask)
                return_dict["{}_task_acc".format(task_type)] = task_acc
            if multi_task_config.get(task_type,
                                     {}).get("lm_augumentation", False):
                return_dict["{}_masked_lm_loss".format(
                    task_type)] = masked_lm_loss
                return_dict["{}_masked_lm_acc".format(task_type)] = lm_acc
            if kargs.get("embedding_distillation", True):
                return_dict["embed_loss"] = embed_loss * float(num_task)
            else:
                return_dict["embed_loss"] = task_loss
            if kargs.get("feature_distillation", True):
                return_dict["feature_loss"] = feature_loss * float(num_task)
            else:
                return_dict["feature_loss"] = task_loss
            return_dict["task_loss"] = task_loss
            return return_dict
        elif mode == tf.estimator.ModeKeys.EVAL:
            eval_dict = {
                "loss": loss,
                "logits": logits,
                "feature": model.get_pooled_output()
            }
            if kargs.get("adversarial", "no") == "adversarial":
                eval_dict["task_logits"] = task_logits
            return eval_dict
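
`loss_utils.contrastive_loss` is repo-specific. Given that the labels built above are all zeros (every shuffled pair is treated as a negative) and the features are L2-normalized, a plausible reading is the classic margin-based contrastive loss; a sketch under that assumption, returning the distance in place of the repo's logits:

import tensorflow as tf

def contrastive_loss_sketch(label_ids, feature_a, feature_b, margin=1.0):
    """Hypothetical margin-based contrastive loss on L2-normalized features."""
    labels = tf.cast(label_ids, tf.float32)
    # Euclidean distance between the paired embeddings.
    distance = tf.sqrt(
        tf.reduce_sum(tf.square(feature_a - feature_b), axis=-1) + 1e-12)
    # Positives (label 1) are pulled together; negatives (label 0, the only
    # case constructed above) are pushed out past the margin.
    per_example_loss = (labels * tf.square(distance) +
                        (1.0 - labels) *
                        tf.square(tf.maximum(margin - distance, 0.0)))
    return per_example_loss, distance
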
Exemple #29
0
def export_model_v1(config):

	opt_config = Bunch({"init_lr":2e-5, "num_train_steps":1e30, "cycle":False})
	model_io_config = Bunch({"fix_lm":False})

	bert_config = json.load(open(config["config_file"], "r"))
	model_config = Bunch(bert_config)

	model_config.use_one_hot_embeddings = True
	model_config.scope = "bert"
	model_config.dropout_prob = 0.1
	model_config.label_type = "single_label"

	with open(config["label2id"], "r") as frobj:
		label_dict = json.load(frobj)

	num_classes = len(label_dict["id2label"])
	max_seq_length = config["max_length"]

	def serving_input_receiver_fn():
		# receive tensors
		receiver_tensors = {
				"input_ids":
						tf.placeholder(tf.int32, [None, max_seq_length], name='input_ids'),
				"input_mask":
						tf.placeholder(tf.int32, [None, max_seq_length], name='input_mask'),
				"segment_ids":
						tf.placeholder(tf.int32, [None, max_seq_length], name='segment_ids'),
				"label_ids":
						tf.placeholder(tf.int32, [None], name='label_ids'),
		}

		# Convert the given inputs to the features the model expects.
		features = {}
		for key in receiver_tensors:
			features[key] = receiver_tensors[key]
		return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors,
													features=features)

	# def serving_input_receiver_fn():
	# 	receive serialized example
	# 	serialized_tf_example = tf.placeholder(dtype=tf.string,
	# 									shape=None,
	# 									name='input_example_tensor')
	# 	receiver_tensors = {'examples': serialized_tf_example}
	# 	features = tf.parse_example(serialized_tf_example, feature_spec)
	# 	return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)

	model_io_fn = model_io.ModelIO(model_io_config)

	model_fn = bert_classifier_estimator.classifier_model_fn_builder(
									model_config, 
									num_classes, 
									config["init_checkpoint"], 
									reuse=None, 
									load_pretrained=True,
									model_io_fn=model_io_fn,
									model_io_config=model_io_config, 
									opt_config=opt_config)

	estimator = tf.estimator.Estimator(
				model_fn=model_fn,
				model_dir=config["model_dir"])

	export_dir = estimator.export_savedmodel(config["export_path"], 
									serving_input_receiver_fn,
									checkpoint_path=config["init_checkpoint"])

	print("===Succeeded in exporting saved model==={}".format(export_dir))
Exemple #30
0
    def model_fn(features, labels, mode):

        train_ops = []
        train_hooks = []
        logits_dict = {}
        losses_dict = {}
        features_dict = {}
        tvars = []
        task_num_dict = {}

        total_loss = tf.constant(0.0)

        task_num = 0

        encoder = {}
        hook_dict = {}

        print(task_type_dict.keys(), "==task type dict==")
        num_task = len(task_type_dict)

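        # Encoders are keyed by model_type: the first task seen with a given
        # backbone builds it (reuse=None); subsequent tasks reuse the same
        # variables (reuse=True), so every cls_task head shares one encoder
        # per backbone.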
        for index, task_type in enumerate(task_type_dict.keys()):
            if model_config_dict[task_type].model_type in model_type_lst:
                reuse = True
            else:
                reuse = None
                model_type_lst.append(model_config_dict[task_type].model_type)
            if task_type_dict[task_type] == "cls_task":

                if model_config_dict[task_type].model_type not in encoder:
                    model_api = model_zoo(model_config_dict[task_type])

                    model = model_api(model_config_dict[task_type],
                                      features,
                                      labels,
                                      mode,
                                      target_dict[task_type],
                                      reuse=reuse)
                    encoder[model_config_dict[task_type].model_type] = model

                print(encoder, "==encode==")

                task_model_fn = cls_model_fn(
                    encoder[model_config_dict[task_type].model_type],
                    model_config_dict[task_type],
                    num_labels_dict[task_type],
                    init_checkpoint_dict[task_type],
                    reuse,
                    load_pretrained_dict[task_type],
                    model_io_config,
                    opt_config,
                    exclude_scope=exclude_scope_dict[task_type],
                    not_storage_params=not_storage_params_dict[task_type],
                    target=target_dict[task_type],
                    label_lst=None,
                    output_type=output_type,
                    task_layer_reuse=task_layer_reuse,
                    task_type=task_type,
                    num_task=num_task,
                    task_adversarial=1e-2,
                    **kargs)
                print("==SUCCEEDED IN LODING==", task_type)

                result_dict = task_model_fn(features, labels, mode)
                logits_dict[task_type] = result_dict["logits"]
                losses_dict[task_type] = result_dict["loss"]  # task loss
                for key in [
                        "masked_lm_loss", "task_loss", "acc", "task_acc",
                        "masked_lm_acc"
                ]:
                    name = "{}_{}".format(task_type, key)
                    if name in result_dict:
                        hook_dict[name] = result_dict[name]
                hook_dict["{}_loss".format(task_type)] = result_dict["loss"]
                total_loss += result_dict["loss"]

                if mode == tf.estimator.ModeKeys.TRAIN:
                    tvars.extend(result_dict["tvars"])
                    task_num += result_dict["task_num"]
                    task_num_dict[task_type] = result_dict["task_num"]
                elif mode == tf.estimator.ModeKeys.EVAL:
                    features[task_type] = result_dict["feature"]
            else:
                continue

        hook_dict["total_loss"] = total_loss

        if mode == tf.estimator.ModeKeys.TRAIN:
            model_io_fn = model_io.ModelIO(model_io_config)

            optimizer_fn = optimizer.Optimizer(opt_config)

            model_io_fn.print_params(list(set(tvars)),
                                     string=", trainable params")
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            print("==update_ops==", update_ops)

            with tf.control_dependencies(update_ops):
                train_op = optimizer_fn.get_train_op(
                    total_loss, list(set(tvars)), opt_config.init_lr,
                    opt_config.num_train_steps, **kargs)

                model_io_fn.set_saver(optimizer_fn.opt)

                if kargs.get("task_index", 1) == 0 and kargs.get(
                        "run_config", None):
                    model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                          kargs.get("num_storage_steps", 1000))

                    training_hooks = model_io_fn.checkpoint_hook
                elif kargs.get("task_index", 1) == 1:
                    training_hooks = []
                else:
                    training_hooks = []

                if len(optimizer_fn.distributed_hooks) >= 1:
                    training_hooks.extend(optimizer_fn.distributed_hooks)
                print(training_hooks, "==training_hooks==", "==task_index==",
                      kargs.get("task_index", 1))

            if output_type == "sess":
                return {
                    "train": {
                        "total_loss": total_loss,
                        "loss": losses_dict,
                        "logits": logits_dict,
                        "train_op": train_op,
                        "task_num_dict": task_num_dict
                    },
                    "hooks": train_hooks
                }
            elif output_type == "estimator":

                hook_dict['learning_rate'] = optimizer_fn.learning_rate
                logging_hook = tf.train.LoggingTensorHook(hook_dict,
                                                          every_n_iter=100)
                training_hooks.append(logging_hook)

                print("==hook_dict==")

                print(hook_dict)

                for key in hook_dict:
                    tf.summary.scalar(key, hook_dict[key])
                    for index, task_type in enumerate(task_type_dict.keys()):
                        tmp = "{}_loss".format(task_type)
                        if tmp == key:
                            tf.summary.scalar(
                                "loss_gap_{}".format(task_type),
                                hook_dict["total_loss"] - hook_dict[key])
                for key in task_num_dict:
                    tf.summary.scalar(key + "_task_num", task_num_dict[key])

                estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                            loss=total_loss,
                                                            train_op=train_op)
                # training_hooks=training_hooks)
                return estimator_spec

        elif mode == tf.estimator.ModeKeys.EVAL:  # eval execute for each class solo

            def metric_fn(logits, label_ids):
                """Computes the loss and accuracy of the model."""
                sentence_predictions = tf.argmax(logits,
                                                 axis=-1,
                                                 output_type=tf.int32)
                sentence_accuracy = tf.metrics.accuracy(
                    labels=label_ids, predictions=sentence_predictions)
                sentence_f = tf_metrics.f1(label_ids,
                                           sentence_predictions,
                                           num_labels,
                                           label_lst,
                                           average="macro")

                eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy}

                return eval_metric_ops

            if output_type == "sess":
                return {
                    "eval": {
                        "logits": logits_dict,
                        "total_loss": total_loss,
                        "feature": features,
                        "loss": losses_dict
                    }
                }
            elif output_type == "estimator":
                eval_metric_ops = {}
                for key in logits_dict:
                    eval_dict = metric_fn(logits_dict[key],
                                          features_task_dict[key]["label_ids"])
                    for sub_key in eval_dict.keys():
                        eval_key = "{}_{}".format(key, sub_key)
                        eval_metric_ops[eval_key] = eval_dict[sub_key]
                estimator_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss / task_num,
                    eval_metric_ops=eval_metric_ops)
                return estimator_spec
        else:
            raise NotImplementedError()
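
The EVAL branch above namespaces every task's metrics as "<task>_<metric>" before handing them to the estimator. A tiny self-contained sketch of that re-keying pattern, with hypothetical task names:

def namespaced_metrics(task_metrics):
    """Re-key per-task metric dicts as '<task>_<metric>' for a single
    tf.estimator eval_metric_ops dict (the pattern used above)."""
    eval_metric_ops = {}
    for task, metrics in task_metrics.items():
        for sub_key, value in metrics.items():
            eval_metric_ops["{}_{}".format(task, sub_key)] = value
    return eval_metric_ops

# Hypothetical usage:
# namespaced_metrics({"cls_task": {"acc": acc_op, "f1": f1_op}})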