def __init__(self, model_config, num_labels, init_checkpoint,
             load_pretrained=True, model_io_config={}, opt_config={},
             exclude_scope="", not_storage_params=[], target="a",
             label_lst=None, output_type="sess", **kargs):
    self.model_config = model_config
    self.num_labels = num_labels
    self.init_checkpoint = init_checkpoint
    self.load_pretrained = load_pretrained
    self.model_io_config = model_io_config
    self.opt_config = opt_config
    self.exclude_scope = exclude_scope
    self.not_storage_params = not_storage_params
    self.target = target
    self.label_lst = label_lst
    self.output_type = output_type
    self.kargs = kargs
    self.model_io_fn = model_io.ModelIO(self.model_io_config)
    self.optimizer_fn = optimizer.Optimizer(self.opt_config)
def model_fn(features, labels, mode, params):
    model_api = model_zoo(model_config)
    model = model_api(model_config, features, labels, mode, target, reuse=tf.AUTO_REUSE, **kargs)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dropout_prob = model_config.dropout_prob
    else:
        dropout_prob = 0.0

    if model_io_config.fix_lm == True:
        scope = model_config.scope + "_finetuning"
    else:
        scope = model_config.scope

    logits = global_discriminator_logits(model_config, model.get_pooled_output(),
                                         reuse=tf.AUTO_REUSE, **kargs)

    model_io_fn = model_io.ModelIO(model_io_config)
    pretrained_tvars = model_io_fn.get_params(model_config.scope,
                                              not_storage_params=not_storage_params)
    global_prediction_tvars = model_io_fn.get_params("cls/seq_global",
                                                     not_storage_params=not_storage_params)
    pretrained_tvars.extend(global_prediction_tvars)
    tvars = pretrained_tvars
    print('==discriminator parameters==', tvars)

    if load_pretrained == "yes":
        use_tpu = 1 if kargs.get('use_tpu', False) else 0
        scaffold_fn = model_io_fn.load_pretrained(tvars, init_checkpoint,
                                                  exclude_scope=exclude_scope, use_tpu=use_tpu)
    else:
        scaffold_fn = None

    if mode == tf.estimator.ModeKeys.PREDICT:
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"probs": tf.nn.softmax(logits)},
            export_outputs={
                "output": tf.estimator.export.PredictOutput({"probs": tf.nn.softmax(logits)})
            })
        return estimator_spec
def export_model_v2(config):
    opt_config = Bunch({"init_lr": 2e-5, "num_train_steps": 1e30, "cycle": False})
    model_io_config = Bunch({"fix_lm": False})

    bert_config = json.load(open(config["config_file"], "r"))
    model_config = Bunch(bert_config)
    model_config.use_one_hot_embeddings = True
    model_config.scope = "bert"
    model_config.dropout_prob = 0.1
    model_config.label_type = "single_label"

    with open(config["label2id"], "r") as frobj:
        label_dict = json.load(frobj)

    num_classes = len(label_dict["id2label"])
    max_seq_length = config["max_length"]

    def serving_input_receiver_fn():
        label_ids = tf.placeholder(tf.int32, [None], name='label_ids')
        input_ids = tf.placeholder(tf.int32, [None, max_seq_length], name='input_ids')
        input_mask = tf.placeholder(tf.int32, [None, max_seq_length], name='input_mask')
        segment_ids = tf.placeholder(tf.int32, [None, max_seq_length], name='segment_ids')
        input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
            'label_ids': label_ids,
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids
        })()
        return input_fn

    model_io_fn = model_io.ModelIO(model_io_config)

    model_fn = bert_classifier_estimator.classifier_model_fn_builder(
        model_config, num_classes, config["init_checkpoint"],
        reuse=None, load_pretrained=True, model_io_fn=model_io_fn,
        model_io_config=model_io_config, opt_config=opt_config)

    estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=config["model_dir"])
    export_dir = estimator.export_savedmodel(config["export_path"],
                                             serving_input_receiver_fn,
                                             checkpoint_path=config["init_checkpoint"])
    print("===Succeeded in exporting saved model==={}".format(export_dir))
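# Usage sketch for export_model_v2 (not part of the repo): the keys below are exactly
# the ones the function reads above; all paths are placeholders to be replaced.
if __name__ == "__main__":
    export_config = {
        "config_file": "./bert_config.json",       # bert config used to build model_config
        "label2id": "./label2id.json",             # must contain an "id2label" mapping
        "max_length": 128,                         # max_seq_length of the serving placeholders
        "init_checkpoint": "./model.ckpt-100000",  # checkpoint to export
        "model_dir": "./model_dir",
        "export_path": "./export"
    }
    export_model_v2(export_config)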
def model_fn(features, labels, mode):
    model_api = model_zoo(model_config)
    model = model_api(model_config, features, labels, mode, target, reuse=model_reuse)

    label_ids = features["label_ids"]

    if mode == tf.estimator.ModeKeys.TRAIN:
        dropout_prob = model_config.dropout_prob
    else:
        dropout_prob = 0.0

    if model_io_config.fix_lm == True:
        scope = model_config.scope + "_finetuning"
    else:
        scope = model_config.scope

    with tf.variable_scope(scope):
        (loss, per_example_loss, logits) = classifier.classifier(model_config,
                                                                 model.get_pooled_output(),
                                                                 num_labels, label_ids, dropout_prob)

    model_io_fn = model_io.ModelIO(model_io_config)
    tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params)

    try:
        params_size = model_io_fn.count_params(model_config.scope)
        print("==total params==", params_size)
    except:
        print("==not count params==")

    if load_pretrained == "yes":
        model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope='teacher')

    return_dict = {
        "loss": loss,
        "logits": logits,
        "tvars": tvars,
        "model": model,
        "per_example_loss": per_example_loss
    }
    return return_dict
def model_fn(features, labels, mode, params):
    model_api = model_zoo(model_config)
    model = model_api(model_config, features, labels, mode, target, reuse=tf.AUTO_REUSE, **kargs)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dropout_prob = model_config.dropout_prob
    else:
        dropout_prob = 0.0

    if model_io_config.fix_lm == True:
        scope = model_config.scope + "_finetuning"
    else:
        scope = model_config.scope

    logits = global_discriminator_logits(model_config, model.get_pooled_output(),
                                         reuse=tf.AUTO_REUSE, **kargs)

    model_io_fn = model_io.ModelIO(model_io_config)
    pretrained_tvars = model_io_fn.get_params(model_config.scope,
                                              not_storage_params=not_storage_params)
    global_prediction_tvars = model_io_fn.get_params("cls/seq_global",
                                                     not_storage_params=not_storage_params)
    pretrained_tvars.extend(global_prediction_tvars)
    tvars = pretrained_tvars
    print('==discriminator parameters==', tvars)

    if load_pretrained == "yes":
        use_tpu = 1 if kargs.get('use_tpu', False) else 0
        scaffold_fn = model_io_fn.load_pretrained(tvars, init_checkpoint,
                                                  exclude_scope=exclude_scope, use_tpu=use_tpu,
                                                  restore_var_name=model_config.get('restore_var_name', []))
    else:
        scaffold_fn = None

    return_dict = {
        "logits": logits,
        "tvars": tvars,
        "model": model
    }
    return return_dict
def init_model(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        init_checkpoint = self.config["init_checkpoint"]
        bert_config = json.load(open(self.config["bert_config"], "r"))

        self.model_config = Bunch(bert_config)
        self.model_config.use_one_hot_embeddings = True
        self.model_config.scope = "bert"
        self.model_config.dropout_prob = 0.1
        self.model_config.label_type = "single_label"

        self.input_queue = Queue(maxsize=self.config.get("batch_size", 20))
        self.output_queue = Queue(maxsize=self.config.get("batch_size", 20))

        opt_config = Bunch({"init_lr": 2e-5, "num_train_steps": 1e30, "cycle": False})
        model_io_config = Bunch({"fix_lm": False})

        self.num_classes = len(self.label_dict["id2label"])
        self.max_seq_length = self.config["max_length"]

        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.config["bert_vocab"],
                                                    do_lower_case=True)

        self.sess = tf.Session()
        self.model_io_fn = model_io.ModelIO(model_io_config)

        model_fn = bert_classifier_estimator.classifier_model_fn_builder(
            self.model_config, self.num_classes, init_checkpoint,
            reuse=None, load_pretrained=True, model_io_fn=self.model_io_fn,
            model_io_config=model_io_config, opt_config=opt_config)

        self.estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                model_dir=self.config["model_dir"])
def model_fn(features, labels, mode):
    original_loss = tf.constant(0.0)
    distilled_loss = tf.constant(0.0)

    st_model = st_model_fn(model_config_dict['student'],
                           num_labels_dict['student'],
                           init_checkpoint_dict['student'],
                           model_reuse=None,
                           load_pretrained=load_pretrained_dict['student'],
                           model_io_config=model_io_config,
                           opt_config=opt_config,
                           exclude_scope=exclude_scope_dict.get('student', ""),
                           not_storage_params=not_storage_params_dict.get('student', []),
                           target=target_dict['student'],
                           **kargs)
    st_dict = st_model(features, labels, mode)

    ta_model = ta_model_fn(model_config_dict['teacher'],
                           num_labels_dict['teacher'],
                           init_checkpoint_dict['teacher'],
                           model_reuse=None,
                           load_pretrained=load_pretrained_dict['teacher'],
                           model_io_config=model_io_config,
                           opt_config=opt_config,
                           exclude_scope=exclude_scope_dict.get('teacher', ""),
                           not_storage_params=not_storage_params_dict.get('teacher', []),
                           target=target_dict['teacher'],
                           **kargs)
    ta_dict = ta_model(features, labels, mode)

    student_logit = st_dict['logits']
    teacher_logit = ta_dict['logits']

    model_io_fn = model_io.ModelIO(model_io_config)

    feature_flag = False

    original_loss += st_dict['loss'] * (distillation_config.get('ce_loss', 1.0))
    print(distillation_config.get('ce_loss', 1.0), '===ce_loss===')
sess = tf.Session()

num_train_steps = int(FLAGS.train_size / FLAGS.batch_size * FLAGS.epoch)
num_warmup_steps = int(num_train_steps * 0.1)
num_storage_steps = int(FLAGS.train_size / FLAGS.batch_size)

opt_config = Bunch({
    "init_lr": 1e-5,
    "num_train_steps": num_train_steps,
    "num_warmup_steps": num_warmup_steps
})
model_io_config = Bunch({"fix_lm": False})

model_io_fn = model_io.ModelIO(model_io_config)

num_choice = FLAGS.num_classes
max_seq_length = FLAGS.max_length

model_train_fn = bert_classifier.multichoice_model_fn_builder(
    config, num_choice, init_checkpoint,
    reuse=None, load_pretrained=True, model_io_fn=model_io_fn,
    model_io_config=model_io_config, opt_config=opt_config)

model_eval_fn = bert_classifier.multichoice_model_fn_builder(
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target,
                  init_checkpoint, train_file, dev_file, checkpoint_dir,
                  is_debug, **kargs):

    graph = tf.Graph()
    with graph.as_default():
        import json

        config = model_config_parser(FLAGS)

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / worker_count)
        elif FLAGS.if_shard == "1":
            print("==number of gpus==", kargs.get('num_gpus', 1))
            train_size = int(FLAGS.train_size / worker_count / kargs.get('num_gpus', 1))
            epoch = FLAGS.epoch
        else:
            train_size = int(FLAGS.train_size / worker_count)
            epoch = FLAGS.epoch

        init_lr = FLAGS.init_lr

        distillation_dict = json.load(tf.gfile.Open(FLAGS.distillation_config))
        distillation_config = Bunch(json.load(tf.gfile.Open(FLAGS.multi_task_config)))

        warmup_ratio = config.get('warmup', 0.1)
        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        if config.get('ln_type', 'postln') == 'postln':
            num_warmup_steps = int(num_train_steps * warmup_ratio)
        elif config.get('ln_type', 'postln') == 'preln':
            num_warmup_steps = 0
        else:
            num_warmup_steps = int(num_train_steps * warmup_ratio)
        print('==num warmup steps==', num_warmup_steps)

        num_storage_steps = min([int(train_size / FLAGS.batch_size), 10000])
        if num_storage_steps <= 100:
            num_storage_steps = 500

        num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size)

        if is_debug == "0":
            num_storage_steps = 2
            num_eval_steps = 10
            num_train_steps = 10

        print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}".format(
            num_train_steps, num_eval_steps, num_storage_steps))
        print(" model type {}".format(FLAGS.model_type))
        print(num_train_steps, num_warmup_steps, "=============",
              kargs.get('num_gpus', 1), '==number of gpus==')

        if worker_count * kargs.get("num_gpus", 1) >= 2:
            clip_norm_scale = 1.0
            lr_scale = 0.8
        else:
            clip_norm_scale = 1.0
            lr_scale = 1.0
        lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale
        if lr >= 1e-3:
            lr = 1e-3
        print('==init lr==', lr)

        opt_config = Bunch({
            "init_lr": lr,
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "worker_count": worker_count,
            "gpu_count": worker_count * kargs.get("num_gpus", 1),
            "opt_type": FLAGS.opt_type,
            "is_chief": is_chief,
            "train_op": kargs.get("train_op", "adam"),
            "decay": kargs.get("decay", "no"),
            "warmup": kargs.get("warmup", "no"),
            "clip_norm": config.get("clip_norm", 1.0),
            "grad_clip": config.get("grad_clip", "global_norm"),
            "epoch": FLAGS.epoch,
            "strategy": FLAGS.distribution_strategy
        })

        anneal_config = Bunch({
            "initial_value": 1.0,
            "num_train_steps": num_train_steps
        })

        model_io_config = Bunch({"fix_lm": False})
        model_io_fn = model_io.ModelIO(model_io_config)

        num_classes = FLAGS.num_classes

        if FLAGS.opt_type == "hvd" and hvd:
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "all_reduce":
            checkpoint_dir = checkpoint_dir
        elif FLAGS.opt_type == "collective_reduce":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            checkpoint_dir = checkpoint_dir if task_index == 0 else None
        print("==checkpoint_dir==", checkpoint_dir, is_chief)

        model_config_dict = {}
        num_labels_dict = {}
        init_checkpoint_dict = {}
        load_pretrained_dict = {}
        exclude_scope_dict = {}
        not_storage_params_dict = {}
        target_dict = {}

        for task_type in FLAGS.multi_task_type.split(","):
            print("==task type==", task_type)
            model_config_dict[task_type] = model_config_parser(Bunch(distillation_config[task_type]))
            print(task_type, distillation_config[task_type], '=====task model config======')
            num_labels_dict[task_type] = distillation_config[task_type]["num_labels"]
            init_checkpoint_dict[task_type] = os.path.join(FLAGS.buckets,
                                                           distillation_config[task_type]["init_checkpoint"])
            load_pretrained_dict[task_type] = distillation_config[task_type]["load_pretrained"]
            exclude_scope_dict[task_type] = distillation_config[task_type]["exclude_scope"]
            not_storage_params_dict[task_type] = distillation_config[task_type]["not_storage_params"]
            target_dict[task_type] = distillation_config[task_type]["target"]

        model_fn = distillation_model_fn(
            model_config_dict, num_labels_dict, init_checkpoint_dict, load_pretrained_dict,
            model_io_config=model_io_config, opt_config=opt_config,
            exclude_scope_dict=exclude_scope_dict,
            not_storage_params_dict=not_storage_params_dict,
            target_dict=target_dict,
            output_type="estimator",
            distillation_config=distillation_dict,
            **kargs)

        name_to_features = data_interface(FLAGS)

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)
            # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
            # so cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        def _decode_batch_record(record, name_to_features):
            example = tf.parse_example(record, name_to_features)
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        if kargs.get("run_config", None):
            if kargs.get("parse_type", "parse_single") == "parse_single":
                train_features = lambda: tf_data_utils.all_reduce_train_input_fn(
                    train_file, _decode_record, name_to_features, params,
                    if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index)
                eval_features = lambda: tf_data_utils.all_reduce_eval_input_fn(
                    dev_file, _decode_record, name_to_features, params,
                    if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index)
            elif kargs.get("parse_type", "parse_single") == "parse_batch":
                print("==apply parse example==")
                train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn(
                    train_file, _decode_batch_record, name_to_features, params,
                    if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index)
                eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn(
                    dev_file, _decode_batch_record, name_to_features, params,
                    if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index)
        else:
            train_features = lambda: tf_data_utils.train_input_fn(
                train_file, _decode_record, name_to_features, params,
                if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index)
            eval_features = lambda: tf_data_utils.eval_input_fn(
                dev_file, _decode_record, name_to_features, params,
                if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index)

        train_hooks = []
        eval_hooks = []

        sess_config = tf.ConfigProto(allow_soft_placement=False, log_device_placement=False)

        if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync":
            print("==no need for hook==")
        elif FLAGS.opt_type == "pai_soar" and pai:
            print("no need for hook")
        elif FLAGS.opt_type == "hvd" and hvd:
            sess_config.gpu_options.allow_growth = True
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
            print("==no need for hook==")
        else:
            print("==no need for hooks==")

        if kargs.get("run_config", None):
            run_config = kargs.get("run_config", None)
            run_config = run_config.replace(save_checkpoints_steps=num_storage_steps)
            print("==run config==", run_config.save_checkpoints_steps)
        else:
            run_config = tf.estimator.RunConfig(model_dir=checkpoint_dir,
                                                save_checkpoints_steps=num_storage_steps,
                                                session_config=sess_config)

        if kargs.get("profiler", "profiler") == "profiler":
            if checkpoint_dir:
                hooks = tf.train.ProfilerHook(save_steps=100, save_secs=None,
                                              output_dir=os.path.join(checkpoint_dir, "profiler"))
                train_hooks.append(hooks)
                print("==add profiler hooks==")

        model_estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                 model_dir=checkpoint_dir,
                                                 config=run_config)

        train_begin_time = time.time()
        tf.logging.info("==training distribution_strategy=={}".format(
            kargs.get("distribution_strategy", "MirroredStrategy")))

        if kargs.get("distribution_strategy", "MirroredStrategy") == "MirroredStrategy":
            print("==apply single machine multi-card training==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features, max_steps=num_train_steps)
            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features, steps=num_eval_steps)
            model_estimator.train(input_fn=train_features,
                                  max_steps=num_train_steps,
                                  hooks=train_hooks)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_begin_time)
            tf.logging.info("==training time=={}".format(train_end_time - train_begin_time))
            eval_results = model_estimator.evaluate(input_fn=eval_features, steps=num_eval_steps)
            print(eval_results)
        elif kargs.get("distribution_strategy", "MirroredStrategy") in ["ParameterServerStrategy",
                                                                        "CollectiveAllReduceStrategy"]:
            print("==apply multi-machine multi-card training==")
            try:
                print(os.environ['TF_CONFIG'], "==tf_run_config==")
            except:
                print("==not tf config==")
            train_spec = tf.estimator.TrainSpec(input_fn=train_features, max_steps=num_train_steps)
            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features, steps=num_eval_steps)
            # tf 1.12 doesn't need a separate evaluate call
            tf.estimator.train_and_evaluate(model_estimator, train_spec, eval_spec)
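# A small, self-contained sketch of the step bookkeeping used in train_eval_fn above.
# The names and the 10,000-step checkpoint cap mirror that function; warmup_ratio is
# the config's 'warmup' value, passed here as a plain argument.
def compute_schedule(train_size, batch_size, epoch, warmup_ratio=0.1):
    num_train_steps = int(train_size / batch_size * epoch)
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    num_storage_steps = min(int(train_size / batch_size), 10000)
    if num_storage_steps <= 100:
        num_storage_steps = 500
    return num_train_steps, num_warmup_steps, num_storage_steps

# e.g. 1M examples, batch 32, 3 epochs -> (93750, 9375, 10000)
print(compute_schedule(1000000, 32, 3))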
def model_fn(features, labels, mode):
    original_loss = tf.constant(0.0)
    distilled_loss = tf.constant(0.0)

    st_model = st_model_fn(model_config_dict['student'],
                           num_labels_dict['student'],
                           init_checkpoint_dict['student'],
                           model_reuse=None,
                           load_pretrained=load_pretrained_dict['student'],
                           model_io_config=model_io_config,
                           opt_config=opt_config,
                           exclude_scope=exclude_scope_dict.get('student', ""),
                           not_storage_params=not_storage_params_dict.get('student', []),
                           target=target_dict['student'],
                           **kargs)
    st_dict = st_model(features, labels, mode)

    ta_model = ta_model_fn(model_config_dict['teacher'],
                           num_labels_dict['teacher'],
                           init_checkpoint_dict['teacher'],
                           model_reuse=None,
                           load_pretrained=load_pretrained_dict['teacher'],
                           model_io_config=model_io_config,
                           opt_config=opt_config,
                           exclude_scope=exclude_scope_dict.get('teacher', ""),
                           not_storage_params=not_storage_params_dict.get('teacher', []),
                           target=target_dict['teacher'],
                           **kargs)
    ta_dict = ta_model(features, labels, mode)

    student_logit = st_dict['logits']
    teacher_logit = ta_dict['logits']

    model_io_fn = model_io.ModelIO(model_io_config)

    feature_flag = False

    original_loss += st_dict['loss'] * (distillation_config.get('ce_loss', 1.0))
    print(distillation_config.get('ce_loss', 1.0), '===ce_loss===')
    tf.summary.scalar("ce_loss", st_dict['loss'])

    if 'kl_logits' in distillation_config.get('distillation_type', ['kl_logits']):
        temperature = distillation_config.get('kl_temperature', 2.0)

        distilled_teacher_logit = tf.nn.log_softmax((teacher_logit + 1e-10) / temperature)  # log_softmax logits
        distilled_student_logit = tf.nn.log_softmax((student_logit + 1e-10) / temperature)  # log_softmax logits

        kl_distilled_loss = tf.reduce_mean(
            distillation_utils.kd(distilled_teacher_logit, distilled_student_logit))
        tf.summary.scalar("kl_logits_loss", kl_distilled_loss)
        tf.logging.info("***** with knowledge distillation %s temperature *****", str(temperature))
        # kl_distilled_loss *= np.power(temperature, 2)

        distilled_loss += kl_distilled_loss * distillation_config.get('kl_logits_ratio', 0.9)
        print(distillation_config.get('kl_logits_ratio', 0.9), '===kl_logits_ratio===')

    if 'rkd' in distillation_config.get('distillation_type', ['kl_logits']):
        source = ta_dict['model'].get_pooled_output()
        target = st_dict['model'].get_pooled_output()
        print("==apply rkd==")
        with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
            rkd_loss = repo_distillation_utils.RKD(source, target, l=[25, 50])
            tf.summary.scalar("rkd_loss", rkd_loss)
        distilled_loss += rkd_loss * distillation_config.get("rkd_ratio", 0.1)

    if "attention_score_uniform" in distillation_config.get('distillation_type', ['kl_logits']):
        source_attention_score = ta_dict['model'].get_multihead_attention()
        target_attention_score = st_dict['model'].get_multihead_attention()
        print("==apply attention_score_uniform==")
        with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
            attention_loss = uniform_mapping.attention_score_matching(source_attention_score,
                                                                      target_attention_score,
                                                                      features['input_mask'], 0)
            tf.summary.scalar("attention_score_uniform_loss", attention_loss)
        feature_flag = True
        distilled_loss += attention_loss * distillation_config.get("attention_score_uniform", 0.1)
        print(distillation_config.get('attention_score_uniform', 0.1), '===attention_score_uniform===')

    if "hidden_uniform" in distillation_config.get('distillation_type', ['kl_logits']):
        source_hidden = ta_dict['model'].get_all_encoder_layers()
        target_hidden = st_dict['model'].get_all_encoder_layers()
        print("==apply hidden_uniform==")
        with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
            hidden_loss = uniform_mapping.hidden_matching(source_hidden, target_hidden,
                                                          features['input_mask'], 0)
            tf.summary.scalar("hidden_uniform_loss", hidden_loss)
        distilled_loss += hidden_loss * distillation_config.get("hidden_uniform", 0.1)
        feature_flag = True
        print(distillation_config.get('hidden_uniform', 0.1), '===hidden_uniform===')

    if "hidden_cls_uniform" in distillation_config.get('distillation_type', ['kl_logits']):
        source_hidden = ta_dict['model'].get_all_encoder_layers()
        target_hidden = st_dict['model'].get_all_encoder_layers()
        print("==apply hidden_cls_uniform==")
        with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
            hidden_cls_loss = uniform_mapping.hidden_cls_matching(source_hidden, target_hidden, 0)
            tf.summary.scalar("hidden_cls_uniform_loss", hidden_cls_loss)
        distilled_loss += hidden_cls_loss * distillation_config.get("hidden_uniform", 0.1)
        feature_flag = True

    if "mdd" in distillation_config.get('distillation_type', ['mdd']):
        source = ta_dict['model'].get_pooled_output()
        target = st_dict['model'].get_pooled_output()
        print("==apply mdd==")

    if "cpc" in distillation_config.get('distillation_type', ['mdd']):
        source_hidden = ta_dict['model'].get_all_encoder_layers()
        target_hidden = st_dict['model'].get_all_encoder_layers()
        with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
            cpc_loss = cpc_utils.CPC_Hidden(target_hidden, source_hidden, features['input_mask'])
            tf.summary.scalar("hidden_cpc_loss", cpc_loss)
        distilled_loss += cpc_loss * distillation_config.get("cpc_hidden", 0.1)

    if "wpc" in distillation_config.get('distillation_type', ['mdd']):
        source_hidden = ta_dict['model'].get_all_encoder_layers()
        target_hidden = st_dict['model'].get_all_encoder_layers()
        with tf.variable_scope("distillation", reuse=tf.AUTO_REUSE):
            wpc_loss = cpc_utils.WPC_Hidden(target_hidden, source_hidden, features['input_mask'])
            tf.summary.scalar("hidden_wpc_loss", wpc_loss)
        distilled_loss += wpc_loss * distillation_config.get("wpc_hidden", 0.1)

    total_loss = distilled_loss + original_loss

    tvars = []
    tvars.extend(st_dict['tvars'])

    if feature_flag:
        distillation_vars = model_io_fn.get_params('distillation', not_storage_params=[])
        tvars.extend(distillation_vars)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer_fn = optimizer.Optimizer(opt_config)

        model_io_fn.print_params(tvars, string=", trainable params")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        print("==update_ops==", update_ops)
        print('==total trainable vars==', list(tvars))

        with tf.control_dependencies(update_ops):
            train_op = optimizer_fn.get_train_op(total_loss, list(set(tvars)),
                                                 opt_config.init_lr,
                                                 opt_config.num_train_steps,
                                                 **kargs)

        model_io_fn.set_saver()

        if kargs.get("task_index", 1) == 0 and kargs.get("run_config", None):
            training_hooks = []
        elif kargs.get("task_index", 1) == 0:
            model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                  kargs.get("num_storage_steps", 1000))
            training_hooks = model_io_fn.checkpoint_hook
        else:
            training_hooks = []

        if len(optimizer_fn.distributed_hooks) >= 1:
            training_hooks.extend(optimizer_fn.distributed_hooks)
        print(training_hooks, "==training_hooks==", "==task_index==", kargs.get("task_index", 1))

        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=total_loss,
                                                    train_op=train_op,
                                                    training_hooks=training_hooks)

        if output_type == "sess":
            return {
                "train": {
                    "loss": total_loss,
                    "logits": student_logit,
                    "train_op": train_op
                },
                "hooks": training_hooks
            }
        elif output_type == "estimator":
            return estimator_spec

    elif mode == tf.estimator.ModeKeys.EVAL:
        def metric_fn(per_example_loss, logits, label_ids, model_type):
            """Computes the loss and accuracy of the model."""
            sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
            sentence_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            sentence_labels = tf.reshape(label_ids, [-1])
            sentence_accuracy = tf.metrics.accuracy(labels=label_ids,
                                                    predictions=sentence_predictions)
            sentence_mean_loss = tf.metrics.mean(values=per_example_loss)
            sentence_f = tf_metrics.f1(label_ids, sentence_predictions,
                                       num_labels_dict['student'], None, average="macro")
            eval_metric_ops = {
                "{}_f1".format(model_type): sentence_f,
                "{}_acc".format(model_type): sentence_accuracy
            }
            return eval_metric_ops

        if output_type == "sess":
            return {
                "eval": {
                    "per_example_loss": st_dict['per_example_loss'],
                    "logits": student_logit,
                    "loss": tf.reduce_mean(st_dict['per_example_loss']),
                    "feature": st_dict['model'].get_pooled_output()
                }
            }
        elif output_type == "estimator":
            eval_metric_ops = metric_fn(st_dict['per_example_loss'], student_logit,
                                        features['label_ids'], "student")
            teacher_eval_metric_ops = metric_fn(ta_dict['per_example_loss'], teacher_logit,
                                                features['label_ids'], "teacher")
            eval_metric_ops.update(teacher_eval_metric_ops)
            estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                        loss=total_loss,
                                                        eval_metric_ops=eval_metric_ops)
            return estimator_spec
    else:
        raise NotImplementedError()
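# A minimal numpy sketch of the temperature-scaled logit distillation term used above,
# assuming distillation_utils.kd computes a soft-target cross-entropy between the teacher
# and student distributions (the exact definition lives in distillation_utils).
import numpy as np

def soft_target_kd(teacher_logit, student_logit, temperature=2.0):
    # softmax of temperature-scaled teacher logits -> soft targets
    t = np.exp(teacher_logit / temperature)
    t = t / t.sum(axis=-1, keepdims=True)
    # log-softmax of temperature-scaled student logits
    s = student_logit / temperature
    s = s - np.log(np.exp(s).sum(axis=-1, keepdims=True))
    # per-example cross-entropy against the teacher soft targets, averaged over the batch
    return float(np.mean(-np.sum(t * s, axis=-1)))

# toy check: identical logits give the entropy of the temperature-scaled teacher distribution
print(soft_target_kd(np.array([[2.0, 0.5, -1.0]]), np.array([[2.0, 0.5, -1.0]])))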
def model_fn(features, labels, mode):
    model = bert_encoder(model_config, features, labels, mode, target, reuse=model_reuse)

    label_ids = features["label_ids"]

    if mode == tf.estimator.ModeKeys.TRAIN:
        dropout_prob = model_config.dropout_prob
    else:
        dropout_prob = 0.0

    if model_io_config.fix_lm == True:
        scope = model_config.scope + "_finetuning"
    else:
        scope = model_config.scope

    with tf.variable_scope(scope, reuse=model_reuse):
        (loss, per_example_loss, logits) = classifier.classifier(model_config,
                                                                 model.get_pooled_output(),
                                                                 num_labels, label_ids, dropout_prob)

    model_io_fn = model_io.ModelIO(model_io_config)
    tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params)

    if load_pretrained:
        model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope=exclude_scope)

    model_io_fn.set_saver(var_lst=tvars)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer_fn = optimizer.Optimizer(opt_config)

        model_io_fn.print_params(tvars, string=", trainable params")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer_fn.get_train_op(loss, tvars,
                                                 opt_config.init_lr,
                                                 opt_config.num_train_steps)

        estimator_spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        if output_type == "sess":
            return {
                "train": {
                    "loss": loss,
                    "logits": logits,
                    "train_op": train_op
                }
            }
        elif output_type == "estimator":
            return estimator_spec

    elif mode == tf.estimator.ModeKeys.PREDICT:
        print(logits.get_shape(), "===logits shape===")
        pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
        prob = tf.nn.softmax(logits)
        max_prob = tf.reduce_max(prob, axis=-1)

        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'pred_label': pred_label,
                "max_prob": max_prob
            },
            export_outputs={
                "output": tf.estimator.export.PredictOutput({
                    'pred_label': pred_label,
                    "max_prob": max_prob
                })
            })
        return estimator_spec

    elif mode == tf.estimator.ModeKeys.EVAL:
        def metric_fn(per_example_loss, logits, label_ids):
            """Computes the loss and accuracy of the model."""
            sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
            sentence_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            sentence_labels = tf.reshape(label_ids, [-1])
            sentence_accuracy = tf.metrics.accuracy(labels=label_ids,
                                                    predictions=sentence_predictions)
            sentence_mean_loss = tf.metrics.mean(values=per_example_loss)
            sentence_f = tf_metrics.f1(label_ids, sentence_predictions,
                                       num_labels, label_lst, average="macro")
            eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy}
            return eval_metric_ops

        eval_metric_ops = metric_fn(per_example_loss, logits, label_ids)
        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=loss,
                                                    eval_metric_ops=eval_metric_ops)

        if output_type == "sess":
            return {
                "eval": {
                    "per_example_loss": per_example_loss,
                    "logits": logits,
                    "loss": tf.reduce_mean(per_example_loss)
                }
            }
        elif output_type == "estimator":
            return estimator_spec
    else:
        raise NotImplementedError()
def model_fn(features, labels, mode):
    task_type = kargs.get("task_type", "cls")
    num_task = kargs.get('num_task', 1)
    temp = kargs.get('temp', 0.1)
    print("==task_type==", task_type)

    model_io_fn = model_io.ModelIO(model_io_config)
    label_ids = tf.cast(features["{}_label_ids".format(task_type)], dtype=tf.int32)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dropout_prob = model_config.dropout_prob
        is_training = True
    else:
        dropout_prob = 0.0
        is_training = False

    if model_io_config.fix_lm == True:
        scope = model_config.scope + "_finetuning"
    else:
        scope = model_config.scope

    if kargs.get("get_pooled_output", "pooled_output") == "pooled_output":
        pooled_feature = model.get_pooled_output()
    elif kargs.get("get_pooled_output", "task_output") == "task_output":
        pooled_feature_dict = model.get_task_output()
        pooled_feature = pooled_feature_dict['pooled_feature']

    shape_list = bert_utils.get_shape_list(pooled_feature_dict['feature_a'], expected_rank=[2])
    batch_size = shape_list[0]

    if kargs.get('apply_head_proj', False):
        with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
            feature_a = simclr_utils.projection_head(pooled_feature_dict['feature_a'],
                                                     is_training,
                                                     head_proj_dim=128,
                                                     num_nlh_layers=1,
                                                     head_proj_mode='nonlinear',
                                                     name='head_contrastive')
            pooled_feature_dict['feature_a'] = feature_a
        with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
            feature_b = simclr_utils.projection_head(pooled_feature_dict['feature_b'],
                                                     is_training,
                                                     head_proj_dim=128,
                                                     num_nlh_layers=1,
                                                     head_proj_mode='nonlinear',
                                                     name='head_contrastive')
            pooled_feature_dict['feature_b'] = feature_b
        tf.logging.info("****** apply contrastive feature projection *******")
    else:
        feature_a = pooled_feature_dict['feature_a']
        feature_b = pooled_feature_dict['feature_b']
        tf.logging.info("****** not apply projection *******")

    # feature_a = tf.nn.l2_normalize(feature_a, axis=-1)
    # feature_b = tf.nn.l2_normalize(feature_b, axis=-1)

    # [batch_size, batch_size]
    if kargs.get("task_seperate_proj", False):
        if task_type == 'xquad' or task_type == 'wsdm':
            # for passage representation
            with tf.variable_scope(scope + "/{}/feature_output_b".format(task_type),
                                   reuse=tf.AUTO_REUSE):
                feature_b = tf.layers.dense(feature_b, 128, use_bias=True, activation=tf.tanh,
                                            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
            tf.logging.info("****** apply passage projection *******")
        if task_type == 'afqmc':
            # for anchor representation
            with tf.variable_scope(scope + "/{}/feature_output_a".format(task_type),
                                   reuse=tf.AUTO_REUSE):
                feature_a = tf.layers.dense(feature_a, 128, use_bias=True, activation=tf.tanh,
                                            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
            # for successor representation
            with tf.variable_scope(scope + "/{}/feature_output_b".format(task_type),
                                   reuse=tf.AUTO_REUSE):
                feature_b = tf.layers.dense(feature_b, 128, use_bias=True, activation=tf.tanh,
                                            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
            tf.logging.info("****** apply cpc anchor, successor projection *******")

    cosine_score = tf.matmul(feature_a, tf.transpose(feature_b)) / model_config.get('temperature', 0.5)
    print("==cosine_score shape==", cosine_score.get_shape())
    loss_mask = tf.cast(features["{}_loss_multipiler".format(task_type)], tf.float32)

    if task_type == 'xquad':
        neg_true_mask = tf.cast(triplet_loss_utils._get_anchor_negative_triplet_mask(label_ids), tf.float32)
        pos_true_mask = (1.0 - neg_true_mask) * tf.expand_dims(loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
        neg_true_mask = neg_true_mask * tf.expand_dims(loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
    elif task_type == 'wsdm':
        pos_label_mask = tf.cast(features["{}_label_ids".format(task_type)], dtype=tf.float32)
        loss_mask *= pos_label_mask
        pos_label_mask = tf.expand_dims(pos_label_mask, axis=-1)  # batch x batch
        score_shape = bert_utils.get_shape_list(cosine_score, expected_rank=[2, 3])
        pos_true_mask = pos_label_mask * tf.eye(score_shape[0])
        neg_true_mask = tf.ones_like(cosine_score) - pos_true_mask
        pos_true_mask = pos_true_mask * tf.expand_dims(loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
        neg_true_mask = neg_true_mask * tf.expand_dims(loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)
    elif task_type == 'afqmc':
        score_shape = bert_utils.get_shape_list(cosine_score, expected_rank=[2, 3])
        not_equal_mask = get_labels_of_similarity(features['input_ids_a'], features['input_ids_b'])
        pos_true_mask = tf.expand_dims(loss_mask, axis=-1) * tf.eye(score_shape[0])
        neg_true_mask = not_equal_mask * tf.expand_dims(loss_mask, axis=-1) * tf.expand_dims(loss_mask, axis=0)

    cosine_score_neg = neg_true_mask * cosine_score
    cosine_score_pos = -pos_true_mask * cosine_score

    y_pred_neg = cosine_score_neg - (1 - neg_true_mask) * 1e12
    y_pred_pos = cosine_score_pos - (1 - pos_true_mask) * 1e12

    # add circle-loss without margin and scale-factor
    joint_neg_loss = tf.reduce_logsumexp(y_pred_neg, axis=-1)
    joint_pos_loss = tf.reduce_logsumexp(y_pred_pos, axis=-1)
    logits = tf.nn.softplus(joint_neg_loss + joint_pos_loss)

    loss = tf.reduce_sum(logits * loss_mask) / (1e-10 + tf.reduce_sum(loss_mask))
    task_loss = loss

    params_size = model_io_fn.count_params(model_config.scope)
    print("==total encoder params==", params_size)

    if kargs.get("feature_distillation", True):
        universal_feature_a = features.get("input_ids_a_features", None)
        universal_feature_b = features.get("input_ids_b_features", None)

        if universal_feature_a is None or universal_feature_b is None:
            tf.logging.info("****** not apply feature distillation *******")
            feature_loss = tf.constant(0.0)
        else:
            feature_a = pooled_feature_dict['feature_a']
            feature_a_shape = bert_utils.get_shape_list(feature_a, expected_rank=[2, 3])
            pretrain_feature_a_shape = bert_utils.get_shape_list(universal_feature_a, expected_rank=[2, 3])
            if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                with tf.variable_scope(scope + "/feature_proj", reuse=tf.AUTO_REUSE):
                    proj_feature_a = tf.layers.dense(feature_a, pretrain_feature_a_shape[-1])
                tf.logging.info("****** apply auto-encoder for feature compression *******")
            else:
                proj_feature_a = feature_a
            feature_a_norm = tf.stop_gradient(
                tf.sqrt(tf.reduce_sum(tf.pow(proj_feature_a, 2), axis=-1, keepdims=True)) + 1e-20)
            proj_feature_a /= feature_a_norm

            feature_b = pooled_feature_dict['feature_b']
            if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                with tf.variable_scope(scope + "/feature_proj", reuse=tf.AUTO_REUSE):
                    proj_feature_b = tf.layers.dense(feature_b, pretrain_feature_a_shape[-1])
                tf.logging.info("****** apply auto-encoder for feature compression *******")
            else:
                proj_feature_b = feature_b
            feature_b_norm = tf.stop_gradient(
                tf.sqrt(tf.reduce_sum(tf.pow(proj_feature_b, 2), axis=-1, keepdims=True)) + 1e-20)
            proj_feature_b /= feature_b_norm

            feature_a_distillation = tf.reduce_mean(tf.square(universal_feature_a - proj_feature_a), axis=-1)
            feature_b_distillation = tf.reduce_mean(tf.square(universal_feature_b - proj_feature_b), axis=-1)

            feature_loss = tf.reduce_mean((feature_a_distillation + feature_b_distillation) / 2.0) / float(num_task)
            loss += feature_loss
            tf.logging.info("****** apply pretrained feature distillation *******")

    if kargs.get("embedding_distillation", True):
        word_embed = model.emb_mat
        random_embed_shape = bert_utils.get_shape_list(word_embed, expected_rank=[2, 3])
        print("==random_embed_shape==", random_embed_shape)
        pretrained_embed = kargs.get('pretrained_embed', None)
        if pretrained_embed is None:
            tf.logging.info("****** not apply pretrained embedding distillation *******")
            embed_loss = tf.constant(0.0)
        else:
            pretrain_embed_shape = bert_utils.get_shape_list(pretrained_embed, expected_rank=[2, 3])
            print("==pretrain_embed_shape==", pretrain_embed_shape)
            if random_embed_shape[-1] != pretrain_embed_shape[-1]:
                with tf.variable_scope(scope + "/embedding_proj", reuse=tf.AUTO_REUSE):
                    proj_embed = tf.layers.dense(word_embed, pretrain_embed_shape[-1])
            else:
                proj_embed = word_embed

            embed_loss = tf.reduce_mean(
                tf.reduce_mean(tf.square(proj_embed - pretrained_embed), axis=-1)) / float(num_task)
            loss += embed_loss
            tf.logging.info("****** apply pretrained embedding distillation *******")

    if mode == tf.estimator.ModeKeys.TRAIN:
        multi_task_config = kargs.get("multi_task_config", {})
        if multi_task_config.get(task_type, {}).get("lm_augumentation", False):
            print("==apply lm_augumentation==")
            masked_lm_positions = features["masked_lm_positions"]
            masked_lm_ids = features["masked_lm_ids"]
            masked_lm_weights = features["masked_lm_weights"]
            (masked_lm_loss, masked_lm_example_loss,
             masked_lm_log_probs) = pretrain.get_masked_lm_output(model_config,
                                                                  model.get_sequence_output(),
                                                                  model.get_embedding_table(),
                                                                  masked_lm_positions,
                                                                  masked_lm_ids,
                                                                  masked_lm_weights,
                                                                  reuse=model_reuse)

            masked_lm_loss_mask = tf.expand_dims(loss_mask, -1) * tf.ones(
                (1, multi_task_config[task_type]["max_predictions_per_seq"]))
            masked_lm_loss_mask = tf.reshape(masked_lm_loss_mask, (-1, ))

            masked_lm_label_weights = tf.reshape(masked_lm_weights, [-1])
            masked_lm_loss_mask *= tf.cast(masked_lm_label_weights, tf.float32)

            masked_lm_example_loss *= masked_lm_loss_mask  # multiply task_mask
            masked_lm_loss = tf.reduce_sum(masked_lm_example_loss) / (1e-10 + tf.reduce_sum(masked_lm_loss_mask))
            loss += multi_task_config[task_type]["masked_lm_loss_ratio"] * masked_lm_loss

            masked_lm_label_ids = tf.reshape(masked_lm_ids, [-1])

            print(masked_lm_log_probs.get_shape(), "===masked lm log probs===")
            print(masked_lm_label_ids.get_shape(), "===masked lm ids===")
            print(masked_lm_label_weights.get_shape(), "===masked lm mask===")

            lm_acc = build_accuracy(masked_lm_log_probs, masked_lm_label_ids, masked_lm_loss_mask)

    if kargs.get("task_invariant", "no") == "yes":
        print("==apply task adversarial training==")
        with tf.variable_scope(scope + "/dann_task_invariant", reuse=model_reuse):
            (_, task_example_loss,
             task_logits) = distillation_utils.feature_distillation(model.get_pooled_output(),
                                                                    1.0,
                                                                    features["task_id"],
                                                                    kargs.get("num_task", 7),
                                                                    dropout_prob,
                                                                    True)
            masked_task_example_loss = loss_mask * task_example_loss
            masked_task_loss = tf.reduce_sum(masked_task_example_loss) / (1e-10 + tf.reduce_sum(loss_mask))
            loss += kargs.get("task_adversarial", 1e-2) * masked_task_loss

    tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params)

    if mode == tf.estimator.ModeKeys.TRAIN:
        multi_task_config = kargs.get("multi_task_config", {})
        if multi_task_config.get(task_type, {}).get("lm_augumentation", False):
            print("==apply lm_augumentation==")
            masked_lm_pretrain_tvars = model_io_fn.get_params("cls/predictions",
                                                              not_storage_params=not_storage_params)
            tvars.extend(masked_lm_pretrain_tvars)

    try:
        params_size = model_io_fn.count_params(model_config.scope)
        print("==total params==", params_size)
    except:
        print("==not count params==")

    if load_pretrained == "yes":
        model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope=exclude_scope)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # acc = build_accuracy(logits, label_ids, loss_mask,
        #                      loss_type=kargs.get('loss', 'contrastive_loss'))
        return_dict = {
            "loss": loss,
            "logits": logits,
            "task_num": tf.reduce_sum(loss_mask),
            "{}_pos_num".format(task_type): tf.reduce_sum(pos_true_mask),
            "{}_neg_num".format(task_type): tf.reduce_sum(neg_true_mask),
            "tvars": tvars
        }
        # return_dict["{}_acc".format(task_type)] = acc
        if kargs.get("task_invariant", "no") == "yes":
            return_dict["{}_task_loss".format(task_type)] = masked_task_loss
            task_acc = build_accuracy(task_logits, features["task_id"], loss_mask)
            return_dict["{}_task_acc".format(task_type)] = task_acc
        if multi_task_config.get(task_type, {}).get("lm_augumentation", False):
            return_dict["{}_masked_lm_loss".format(task_type)] = masked_lm_loss
            return_dict["{}_masked_lm_acc".format(task_type)] = lm_acc
        if kargs.get("embedding_distillation", True):
            return_dict["embed_loss"] = embed_loss * float(num_task)
        else:
            return_dict["embed_loss"] = task_loss
        if kargs.get("feature_distillation", True):
            return_dict["feature_loss"] = feature_loss * float(num_task)
        else:
            return_dict["feature_loss"] = task_loss
        return_dict["task_loss"] = task_loss
        return return_dict
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_dict = {
            "loss": loss,
            "logits": logits,
            "feature": model.get_pooled_output()
        }
        if kargs.get("adversarial", "no") == "adversarial":
            eval_dict["task_logits"] = task_logits
        return eval_dict
def model_fn(features, labels, mode, params):
    model_api = model_zoo(model_config)

    if kargs.get('random_generator', '1') == '1':
        if mode == tf.estimator.ModeKeys.TRAIN:
            input_ori_ids = features['input_ori_ids']

            [output_ids,
             sampled_binary_mask] = hmm_input_ids_generation(model_config,
                                                             features['input_ori_ids'],
                                                             features['input_mask'],
                                                             [tf.cast(tf.constant(hmm_tran_prob), tf.float32)
                                                              for hmm_tran_prob in hmm_tran_prob_list],
                                                             mask_probability=0.2,
                                                             replace_probability=0.0,
                                                             original_probability=0.0,
                                                             mask_prior=tf.constant(mask_prior, tf.float32),
                                                             **kargs)

            features['input_ids'] = output_ids
            tf.logging.info("****** do random generator *******")
        else:
            sampled_binary_mask = None
    else:
        sampled_binary_mask = None

    model = model_api(model_config, features, labels, mode, target, reuse=tf.AUTO_REUSE, **kargs)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dropout_prob = model_config.dropout_prob
    else:
        dropout_prob = 0.0

    if model_io_config.fix_lm == True:
        scope = model_config.scope + "_finetuning"
    else:
        scope = model_config.scope

    (nsp_loss, nsp_per_example_loss,
     nsp_log_prob) = pretrain.get_next_sentence_output(model_config,
                                                       model.get_pooled_output(),
                                                       features['next_sentence_labels'],
                                                       reuse=tf.AUTO_REUSE,
                                                       scope=generator_scope_prefix)

    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]

    if model_config.model_type == 'bert':
        masked_lm_fn = pretrain.get_masked_lm_output
        seq_masked_lm_fn = pretrain.seq_mask_masked_lm_output
        print("==apply bert masked lm==")
    elif model_config.model_type == 'albert':
        masked_lm_fn = pretrain_albert.get_masked_lm_output
        seq_masked_lm_fn = pretrain_albert.seq_mask_masked_lm_output
        print("==apply albert masked lm==")
    else:
        masked_lm_fn = pretrain.get_masked_lm_output
        seq_masked_lm_fn = pretrain_albert.seq_mask_masked_lm_output
        print("==apply bert masked lm==")

    if sampled_binary_mask is not None:
        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_log_probs, masked_lm_mask) = seq_masked_lm_fn(model_config,
                                                                 model.get_sequence_output(),
                                                                 model.get_embedding_table(),
                                                                 features['input_mask'],
                                                                 features['input_ori_ids'],
                                                                 features['input_ids'],
                                                                 sampled_binary_mask,
                                                                 reuse=tf.AUTO_REUSE,
                                                                 embedding_projection=model.get_embedding_projection_table(),
                                                                 scope=generator_scope_prefix)
        masked_lm_ids = features['input_ori_ids']
    else:
        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_log_probs, masked_lm_mask) = masked_lm_fn(model_config,
                                                             model.get_sequence_output(),
                                                             model.get_embedding_table(),
                                                             masked_lm_positions,
                                                             masked_lm_ids,
                                                             masked_lm_weights,
                                                             reuse=tf.AUTO_REUSE,
                                                             embedding_projection=model.get_embedding_projection_table(),
                                                             scope=generator_scope_prefix)

    print(model_config.lm_ratio, '==mlm lm_ratio==')
    loss = model_config.lm_ratio * masked_lm_loss + 0.0 * nsp_loss

    if kargs.get("resample_discriminator", False):
        input_ori_ids = features['input_ori_ids']

        [output_ids,
         sampled_binary_mask] = random_input_ids_generation(model_config,
                                                            features['input_ori_ids'],
                                                            features['input_mask'],
                                                            mask_probability=0.2,
                                                            replace_probability=0.1,
                                                            original_probability=0.1)

        resample_features = {}
        for key in features:
            resample_features[key] = features[key]

        resample_features['input_ids'] = tf.identity(output_ids)
        model_resample = model_api(model_config, resample_features, labels, mode, target,
                                   reuse=tf.AUTO_REUSE, **kargs)
        tf.logging.info("**** apply discriminator resample **** ")
    else:
        model_resample = model
        resample_features = features
        tf.logging.info("**** not apply discriminator resample **** ")

    sampled_ids = token_generator(model_config,
                                  model_resample.get_sequence_output(),
                                  model_resample.get_embedding_table(),
                                  resample_features['input_ids'],
                                  resample_features['input_ori_ids'],
                                  resample_features['input_mask'],
                                  embedding_projection=model_resample.get_embedding_projection_table(),
                                  scope=generator_scope_prefix,
                                  mask_method='only_mask',
                                  use_tpu=kargs.get('use_tpu', True))

    if model_config.get('gen_sample', 1) == 1:
        input_ids = features['input_ori_ids']
        input_mask = features['input_mask']
        segment_ids = features['segment_ids']
    else:
        input_ids = tf.expand_dims(features['input_ori_ids'], axis=-1)  # batch x seq_length x 1
        input_ids = tf.einsum('abc,cd->abd', input_ids,
                              tf.ones((1, model_config.get('gen_sample', 1))))
        input_ids = tf.cast(input_ids, tf.int32)

        input_shape_list = bert_utils.get_shape_list(input_ids, expected_rank=3)
        batch_size = input_shape_list[0]
        seq_length = input_shape_list[1]
        gen_sample = input_shape_list[2]

        sampled_ids = tf.reshape(sampled_ids, [batch_size * gen_sample, seq_length])
        input_ids = tf.reshape(input_ids, [batch_size * gen_sample, seq_length])

        input_mask = tf.expand_dims(features['input_mask'], axis=-1)
        input_mask = tf.einsum('abc,cd->abd', input_mask,
                               tf.ones((1, model_config.get('gen_sample', 1))))
        input_mask = tf.cast(input_mask, tf.int32)

        segment_ids = tf.expand_dims(features['segment_ids'], axis=-1)
        segment_ids = tf.einsum('abc,cd->abd', segment_ids,
                                tf.ones((1, model_config.get('gen_sample', 1))))
        segment_ids = tf.cast(segment_ids, tf.int32)

        segment_ids = tf.reshape(segment_ids, [batch_size * gen_sample, seq_length])
        input_mask = tf.reshape(input_mask, [batch_size * gen_sample, seq_length])

    model_io_fn = model_io.ModelIO(model_io_config)

    pretrained_tvars = model_io_fn.get_params(model_config.scope,
                                              not_storage_params=not_storage_params)

    if generator_scope_prefix:
        """ "generator/cls/predictions" """
        lm_pretrain_tvars = model_io_fn.get_params(generator_scope_prefix + "/cls/predictions",
                                                   not_storage_params=not_storage_params)
        nsp_pretrain_vars = model_io_fn.get_params(generator_scope_prefix + "/cls/seq_relationship",
                                                   not_storage_params=not_storage_params)
    else:
        lm_pretrain_tvars = model_io_fn.get_params("cls/predictions",
                                                   not_storage_params=not_storage_params)
        nsp_pretrain_vars = model_io_fn.get_params("cls/seq_relationship",
                                                   not_storage_params=not_storage_params)

    if model_config.get('embedding_scope', None) is not None:
        embedding_tvars = model_io_fn.get_params(model_config.get('embedding_scope', 'bert') + "/embeddings",
                                                 not_storage_params=not_storage_params)
        pretrained_tvars.extend(embedding_tvars)

    pretrained_tvars.extend(lm_pretrain_tvars)
    pretrained_tvars.extend(nsp_pretrain_vars)
    tvars = pretrained_tvars

    print('==generator parameters==', tvars)

    if load_pretrained == "yes":
        use_tpu = 1 if kargs.get('use_tpu', False) else 0
        scaffold_fn = model_io_fn.load_pretrained(tvars, init_checkpoint,
                                                  exclude_scope=exclude_scope, use_tpu=use_tpu,
                                                  restore_var_name=model_config.get('restore_var_name', []))
    else:
        scaffold_fn = None

    tf.add_to_collection("generator_loss", loss)

    return_dict = {
        "loss": loss,
        "tvars": tvars,
        "model": model,
        "sampled_ids": sampled_ids,          # batch x gen_sample, seq_length
        "sampled_input_ids": input_ids,      # batch x gen_sample, seq_length
        "sampled_input_mask": input_mask,
        "sampled_segment_ids": segment_ids,
        "masked_lm_ids": masked_lm_ids,
        "masked_lm_weights": masked_lm_mask,
        "masked_lm_log_probs": masked_lm_log_probs,
        "masked_lm_example_loss": masked_lm_example_loss,
        "next_sentence_example_loss": nsp_per_example_loss,
        "next_sentence_log_probs": nsp_log_prob,
        "next_sentence_labels": features['next_sentence_labels'],
        "sampled_binary_mask": sampled_binary_mask
    }
    return return_dict
def model_fn(features, labels, mode, params): train_op_type = kargs.get('train_op_type', 'joint') gen_disc_type = kargs.get('gen_disc_type', 'all_disc') mask_method = kargs.get('mask_method', 'only_mask') use_tpu = 1 if kargs.get('use_tpu', False) else 0 print(train_op_type, "===train op type===", gen_disc_type, "===generator loss type===") if mask_method == 'only_mask': tf.logging.info( "****** generator token generation mask type:%s with only masked token *******", mask_method) elif mask_method == 'all_mask': tf.logging.info( "****** generator token generation mask type:%s with all token *******", mask_method) else: mask_method = 'only_mask' tf.logging.info( "****** generator token generation mask type:%s with only masked token *******", mask_method) if kargs.get('optimization_type', 'grl') == 'grl': if_flip_grad = True train_op_type = 'joint' elif kargs.get('optimization_type', 'grl') == 'minmax': if_flip_grad = False else: if_flip_grad = True train_op_type = 'joint' generator_fn = generator( model_config_dict['generator'], num_labels_dict['generator'], init_checkpoint_dict['generator'], model_reuse=None, load_pretrained=load_pretrained_dict['generator'], model_io_config=model_io_config, opt_config=opt_config, exclude_scope=exclude_scope_dict.get('generator', ""), not_storage_params=not_storage_params_dict.get('generator', []), target=target_dict['generator'], if_flip_grad=if_flip_grad, # mask_method='only_mask', **kargs) tf.logging.info("****** train_op_type:%s *******", train_op_type) tf.logging.info("****** optimization_type:%s *******", kargs.get('optimization_type', 'grl')) generator_dict = generator_fn(features, labels, mode, params) discriminator_fn = discriminator_generator( model_config_dict['discriminator'], num_labels_dict['discriminator'], init_checkpoint_dict['discriminator'], model_reuse=None, load_pretrained=load_pretrained_dict['discriminator'], model_io_config=model_io_config, opt_config=opt_config, exclude_scope=exclude_scope_dict.get('discriminator', ""), not_storage_params=not_storage_params_dict.get( 'discriminator', []), target=target_dict['discriminator'], loss='cross_entropy', **kargs) discriminator_features = {} # minmax_mode in ['masked', 'corrupted'] minmax_mode = kargs.get('minmax_mode', 'corrupted') tf.logging.info("****** minmax mode for discriminator: %s *******", minmax_mode) if minmax_mode == 'corrupted': tf.logging.info("****** gumbel 3-D sampled_ids *******") elif minmax_mode == 'masked': discriminator_features['ori_sampled_ids'] = generator_dict[ 'output_ids'] discriminator_features['sampled_binary_mask'] = generator_dict[ 'sampled_binary_mask'] tf.logging.info("****** conditional sampled_ids *******") discriminator_features['input_ids'] = generator_dict['sampled_ids'] discriminator_features['input_mask'] = generator_dict[ 'sampled_input_mask'] discriminator_features['segment_ids'] = generator_dict[ 'sampled_segment_ids'] discriminator_features['input_ori_ids'] = generator_dict[ 'sampled_input_ids'] discriminator_features['next_sentence_labels'] = features[ 'next_sentence_labels'] discriminator_features['ori_input_ids'] = generator_dict['sampled_ids'] discriminator_dict = discriminator_fn(discriminator_features, labels, mode, params) [disc_loss, disc_logits, disc_per_example_loss ] = optimal_discriminator(model_config_dict['discriminator'], generator_dict, features, discriminator_dict, discriminator_features, use_tpu=use_tpu) [ equal_per_example_loss, equal_loss_all, equal_loss_self, not_equal_per_example_loss, not_equal_loss_all, not_equal_loss_self 
] = modified_loss(disc_per_example_loss, disc_logits, discriminator_features['input_ori_ids'], discriminator_features['ori_input_ids'], discriminator_features['input_mask'], sampled_binary_mask=discriminator_features.get( 'sampled_binary_mask', None), **kargs) output_dict = {} output_dict['logits'] = disc_logits output_dict['per_example_loss'] = disc_per_example_loss output_dict['loss'] = disc_loss + 0.0 * discriminator_dict["loss"] output_dict["equal_per_example_loss"] = equal_per_example_loss, output_dict["equal_loss_all"] = equal_loss_all, output_dict["equal_loss_self"] = equal_loss_self, output_dict["not_equal_per_example_loss"] = not_equal_per_example_loss, output_dict["not_equal_loss_all"] = not_equal_loss_all, output_dict["not_equal_loss_self"] = not_equal_loss_self output_dict['tvars'] = discriminator_dict['tvars'] model_io_fn = model_io.ModelIO(model_io_config) tvars = [] loss = kargs.get('dis_loss', 1.0) * output_dict['loss'] tvars.extend(discriminator_dict['tvars']) if kargs.get('joint_train', '1') == '1': tf.logging.info( "****** joint generator and discriminator training *******") tvars.extend(generator_dict['tvars']) loss += generator_dict['loss'] tvars = list(set(tvars)) var_checkpoint_dict_list = [] for key in init_checkpoint_dict: if load_pretrained_dict[key] == "yes": if key == 'generator': tmp = { "tvars": generator_dict['tvars'], "init_checkpoint": init_checkpoint_dict['generator'], "exclude_scope": exclude_scope_dict[key], "restore_var_name": model_config_dict['generator'].get( 'restore_var_name', []) } if kargs.get("sharing_mode", "none") != "none": tmp['exclude_scope'] = '' var_checkpoint_dict_list.append(tmp) elif key == 'discriminator': tmp = { "tvars": discriminator_dict['tvars'], "init_checkpoint": init_checkpoint_dict['discriminator'], "exclude_scope": exclude_scope_dict[key], "restore_var_name": model_config_dict['discriminator'].get( 'restore_var_name', []) } var_checkpoint_dict_list.append(tmp) use_tpu = 1 if kargs.get('use_tpu', False) else 0 if len(var_checkpoint_dict_list) >= 1: scaffold_fn = model_io_fn.load_multi_pretrained( var_checkpoint_dict_list, use_tpu=use_tpu) else: scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: if not kargs.get('use_tpu', False): metric_dict = discriminator_metric_train( output_dict['per_example_loss'], output_dict['logits'], generator_dict['sampled_input_ids'], generator_dict['sampled_ids'], generator_dict['sampled_input_mask']) for key in metric_dict: tf.summary.scalar(key, metric_dict[key]) tf.summary.scalar("generator_loss", generator_dict['loss']) tf.summary.scalar("discriminator_loss", discriminator_dict['loss']) if kargs.get('use_tpu', False): optimizer_fn = optimizer.Optimizer(opt_config) use_tpu = 1 else: optimizer_fn = distributed_optimizer.Optimizer(opt_config) use_tpu = 0 model_io_fn.print_params(tvars, string=", trainable params") train_op = get_train_op(generator_dict, output_dict, optimizer_fn, opt_config, model_config_dict['generator'], model_config_dict['discriminator'], use_tpu=use_tpu, train_op_type=train_op_type, gen_disc_type=gen_disc_type) # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # with tf.control_dependencies(update_ops): # train_op = optimizer_fn.get_train_op(loss, list(set(tvars)), # opt_config.init_lr, # opt_config.num_train_steps, # use_tpu=use_tpu) if kargs.get('use_tpu', False): estimator_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn # training_hooks=[logging_hook] ) else: estimator_spec = 
tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: if kargs.get('joint_train', '0') == '1': def joint_metric(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels, per_example_loss, logits, input_ori_ids, input_ids, input_mask): generator_metric = generator_metric_fn_eval( masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels) discriminator_metric = discriminator_metric_eval( per_example_loss, logits, input_ori_ids, input_ids, input_mask) generator_metric.update(discriminator_metric) return generator_metric tpu_eval_metrics = (joint_metric, [ generator_dict['masked_lm_example_loss'], generator_dict['masked_lm_log_probs'], generator_dict['masked_lm_ids'], generator_dict['masked_lm_weights'], generator_dict.get('next_sentence_example_loss', None), generator_dict.get('next_sentence_log_probs', None), generator_dict.get('next_sentence_labels', None), discriminator_dict['per_example_loss'], discriminator_dict['logits'], generator_dict['sampled_input_ids'], generator_dict['sampled_ids'], generator_dict['sampled_input_mask'] ]) gpu_eval_metrics = joint_metric( generator_dict['masked_lm_example_loss'], generator_dict['masked_lm_log_probs'], generator_dict['masked_lm_ids'], generator_dict['masked_lm_weights'], generator_dict.get('next_sentence_example_loss', None), generator_dict.get('next_sentence_log_probs', None), generator_dict.get('next_sentence_labels', None), discriminator_dict['per_example_loss'], discriminator_dict['logits'], generator_dict['sampled_input_ids'], generator_dict['sampled_ids'], generator_dict['sampled_input_mask']) else: gpu_eval_metrics = discriminator_metric_eval( discriminator_dict['per_example_loss'], discriminator_dict['logits'], generator_dict['sampled_input_ids'], generator_dict['sampled_ids'], generator_dict['sampled_input_mask']) tpu_eval_metrics = (discriminator_metric_eval, [ discriminator_dict['per_example_loss'], discriminator_dict['logits'], generator_dict['sampled_input_ids'], generator_dict['sampled_ids'], generator_dict['sampled_input_mask'] ]) if kargs.get('use_tpu', False): estimator_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, eval_metrics=tpu_eval_metrics, scaffold_fn=scaffold_fn) else: estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=gpu_eval_metrics) return estimator_spec else: raise NotImplementedError()
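# Note: when optimization_type is 'grl', the model_fn above sets if_flip_grad=True and trains
# generator and discriminator jointly through a single train_op instead of alternating
# min/max steps. Below is a minimal sketch of a gradient-reversal op in TF 1.x;
# flip_gradient and lambda_scale are illustrative names, not part of this repo's generator.
import tensorflow as tf

def flip_gradient(x, lambda_scale=1.0):
    """Identity in the forward pass; scales the gradient by -lambda_scale backward."""
    @tf.custom_gradient
    def _flip(inner_x):
        def grad(dy):
            return -lambda_scale * dy
        return tf.identity(inner_x), grad
    return _flip(x)

# Usage sketch: pass the generator's sampled representation through flip_gradient before the
# discriminator loss, then minimize the joint loss once; the reversed gradient makes the
# generator ascend the discriminator loss while the discriminator descends it.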
def model_fn(features, labels, mode): if model_io_config.fix_lm == True: scope = model_config.scope + "_finetuning" else: scope = model_config.scope if mode == tf.estimator.ModeKeys.TRAIN: dropout_prob = model_config.dropout_prob else: dropout_prob = 0.0 label_ids = features["label_ids"] model_lst = [] for index, name in enumerate(target): if index > 0: reuse = True else: reuse = model_reuse model_lst.append(bert_encoding(model_config, features, labels, mode, name, scope, dropout_rate, reuse=reuse)) [input_mask_a, repres_a] = model_lst[0] [input_mask_b, repres_b] = model_lst[1] output_a, output_b = alignment_aggerate(model_config, repres_a, repres_b, input_mask_a, input_mask_b, scope, reuse=model_reuse) if model_config.pooling == "ave_max_pooling": pooling_fn = ave_max_pooling elif model_config.pooling == "multihead_pooling": pooling_fn = multihead_pooling repres_a = pooling_fn(model_config, output_a, input_mask_a, scope, dropout_prob, reuse=model_reuse) repres_b = pooling_fn(model_config, output_b, input_mask_b, scope, dropout_prob, reuse=True) pair_repres = tf.concat([repres_a, repres_b, tf.abs(repres_a-repres_b), repres_b*repres_a], axis=-1) with tf.variable_scope(scope, reuse=model_reuse): (loss, per_example_loss, logits) = classifier.classifier(model_config, pair_repres, num_labels, label_ids, dropout_prob) model_io_fn = model_io.ModelIO(model_io_config) tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params) if load_pretrained: model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope=exclude_scope) if mode == tf.estimator.ModeKeys.TRAIN: optimizer_fn = optimizer.Optimizer(opt_config) model_io_fn.print_params(tvars, string=", trainable params") update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer_fn.get_train_op(loss, tvars, opt_config.init_lr, opt_config.num_train_steps) model_io_fn.set_saver() if kargs.get("task_index", 1) == 0: model_io_fn.get_hooks(kargs.get("checkpoint_dir", None), kargs.get("num_storage_steps", 1000)) training_hooks = model_io_fn.checkpoint_hook else: training_hooks = [] if len(optimizer_fn.distributed_hooks) >= 1: training_hooks.extend(optimizer_fn.distributed_hooks) print(training_hooks) estimator_spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, training_hooks=training_hooks) if output_type == "sess": return { "train":{ "loss":loss, "logits":logits, "train_op":train_op }, "hooks":training_hooks } elif output_type == "estimator": return estimator_spec elif mode == tf.estimator.ModeKeys.PREDICT: print(logits.get_shape(), "===logits shape===") pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32) prob = tf.nn.softmax(logits) max_prob = tf.reduce_max(prob, axis=-1) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, predictions={ 'pred_label':pred_label, "max_prob":max_prob }, export_outputs={ "output":tf.estimator.export.PredictOutput( { 'pred_label':pred_label, "max_prob":max_prob } ) } ) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, logits, label_ids): """Computes the loss and accuracy of the model.""" sentence_log_probs = tf.reshape( logits, [-1, logits.shape[-1]]) sentence_predictions = tf.argmax( logits, axis=-1, output_type=tf.int32) sentence_labels = tf.reshape(label_ids, [-1]) sentence_accuracy = tf.metrics.accuracy( labels=label_ids, predictions=sentence_predictions) sentence_mean_loss = tf.metrics.mean( values=per_example_loss) sentence_f = 
tf_metrics.f1(label_ids,
                                         sentence_predictions,
                                         num_labels,
                                         label_lst,
                                         average="macro")
            eval_metric_ops = {
                "f1": sentence_f,
                "loss": sentence_mean_loss,
                "acc": sentence_accuracy
            }
            return eval_metric_ops

        eval_metric_ops = metric_fn(per_example_loss, logits, label_ids)
        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=loss,
                                                    eval_metric_ops=eval_metric_ops)
        if output_type == "sess":
            return {
                "eval": {
                    "per_example_loss": per_example_loss,
                    "logits": logits,
                    "loss": tf.reduce_mean(per_example_loss)
                }
            }
        elif output_type == "estimator":
            return estimator_spec
    else:
        raise NotImplementedError()
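# The pair model above concatenates the two pooled sentence vectors with their element-wise
# difference and product before the classifier. A standalone sketch of that interaction
# feature (tensor names are illustrative):
import tensorflow as tf

def pair_features(repres_a, repres_b):
    """Builds [a, b, |a - b|, a * b] matching features for a sentence pair."""
    return tf.concat([repres_a,
                      repres_b,
                      tf.abs(repres_a - repres_b),
                      repres_a * repres_b],
                     axis=-1)  # [batch, 4 * hidden]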
def model_fn(features, labels, mode): train_ops = [] train_hooks = [] logits_dict = {} losses_dict = {} features_dict = {} tvars = [] task_num_dict = {} multi_task_config = kargs.get('multi_task_config', {}) total_loss = tf.constant(0.0) task_num = 0 encoder = {} hook_dict = {} print(task_type_dict.keys(), "==task type dict==") num_task = len(task_type_dict) from data_generator import load_w2v flags = kargs.get('flags', Bunch({})) print(flags.pretrained_w2v_path, "===pretrain vocab path===") w2v_path = os.path.join(flags.buckets, flags.pretrained_w2v_path) vocab_path = os.path.join(flags.buckets, flags.vocab_file) # [w2v_embed, token2id, # id2token, is_extral_symbol, use_pretrained] = load_w2v.load_pretrained_w2v(vocab_path, w2v_path) # pretrained_embed = tf.cast(tf.constant(w2v_embed), tf.float32) pretrained_embed = None for index, task_type in enumerate(task_type_dict.keys()): if model_config_dict[task_type].model_type in model_type_lst: reuse = True else: reuse = None model_type_lst.append(model_config_dict[task_type].model_type) if model_config_dict[task_type].model_type not in encoder: model_api = model_zoo(model_config_dict[task_type]) model = model_api(model_config_dict[task_type], features, labels, mode, target_dict[task_type], reuse=reuse, cnn_type=model_config_dict[task_type].get( 'cnn_type', 'bi_dgcnn')) encoder[model_config_dict[task_type].model_type] = model # vae_kl_model = vae_model_fn(encoder[model_config_dict[task_type].model_type], # model_config_dict[task_type], # num_labels_dict[task_type], # init_checkpoint_dict[task_type], # reuse, # load_pretrained_dict[task_type], # model_io_config, # opt_config, # exclude_scope=exclude_scope_dict[task_type], # not_storage_params=not_storage_params_dict[task_type], # target=target_dict[task_type], # label_lst=None, # output_type=output_type, # task_layer_reuse=task_layer_reuse, # task_type=task_type, # num_task=num_task, # task_adversarial=1e-2, # get_pooled_output='task_output', # feature_distillation=False, # embedding_distillation=False, # pretrained_embed=pretrained_embed, # **kargs) # vae_result_dict = vae_kl_model(features, labels, mode) # tvars.extend(vae_result_dict['tvars']) # total_loss += vae_result_dict["loss"] # for key in vae_result_dict: # if key in ['perplexity', 'token_acc', 'kl_div']: # hook_dict[key] = vae_result_dict[key] print(encoder, "==encode==") if task_type_dict[task_type] == "cls_task": task_model_fn = cls_model_fn( encoder[model_config_dict[task_type].model_type], model_config_dict[task_type], num_labels_dict[task_type], init_checkpoint_dict[task_type], reuse, load_pretrained_dict[task_type], model_io_config, opt_config, exclude_scope=exclude_scope_dict[task_type], not_storage_params=not_storage_params_dict[task_type], target=target_dict[task_type], label_lst=None, output_type=output_type, task_layer_reuse=task_layer_reuse, task_type=task_type, num_task=num_task, task_adversarial=1e-2, get_pooled_output='task_output', feature_distillation=False, embedding_distillation=False, pretrained_embed=pretrained_embed, **kargs) result_dict = task_model_fn(features, labels, mode) tf.logging.info("****** task: *******", task_type_dict[task_type], task_type) elif task_type_dict[task_type] == "embed_task": task_model_fn = embed_model_fn( encoder[model_config_dict[task_type].model_type], model_config_dict[task_type], num_labels_dict[task_type], init_checkpoint_dict[task_type], reuse, load_pretrained_dict[task_type], model_io_config, opt_config, exclude_scope=exclude_scope_dict[task_type], 
not_storage_params=not_storage_params_dict[task_type], target=target_dict[task_type], label_lst=None, output_type=output_type, task_layer_reuse=task_layer_reuse, task_type=task_type, num_task=num_task, task_adversarial=1e-2, get_pooled_output='task_output', feature_distillation=False, embedding_distillation=False, pretrained_embed=pretrained_embed, loss='contrastive_loss', apply_head_proj=False, **kargs) result_dict = task_model_fn(features, labels, mode) tf.logging.info("****** task: *******", task_type_dict[task_type], task_type) # cpc_model_fn = embed_cpc_model_fn(encoder[model_config_dict[task_type].model_type], # model_config_dict[task_type], # num_labels_dict[task_type], # init_checkpoint_dict[task_type], # reuse, # load_pretrained_dict[task_type], # model_io_config, # opt_config, # exclude_scope=exclude_scope_dict[task_type], # not_storage_params=not_storage_params_dict[task_type], # target=target_dict[task_type], # label_lst=None, # output_type=output_type, # task_layer_reuse=task_layer_reuse, # task_type=task_type, # num_task=num_task, # task_adversarial=1e-2, # get_pooled_output='task_output', # feature_distillation=False, # embedding_distillation=False, # pretrained_embed=pretrained_embed, # loss='contrastive_loss', # apply_head_proj=False, # **kargs) # cpc_result_dict = cpc_model_fn(features, labels, mode) # result_dict['loss'] += cpc_result_dict['loss'] # result_dict['tvars'].extend(cpc_result_dict['tvars']) # hook_dict["{}_all_neg_loss".format(task_type)] = cpc_result_dict['loss'] # hook_dict["{}_all_neg_num".format(task_type)] = cpc_result_dict['task_num'] elif task_type_dict[task_type] == "cpc_task": task_model_fn = embed_cpc_v1_model_fn( encoder[model_config_dict[task_type].model_type], model_config_dict[task_type], num_labels_dict[task_type], init_checkpoint_dict[task_type], reuse, load_pretrained_dict[task_type], model_io_config, opt_config, exclude_scope=exclude_scope_dict[task_type], not_storage_params=not_storage_params_dict[task_type], target=target_dict[task_type], label_lst=None, output_type=output_type, task_layer_reuse=task_layer_reuse, task_type=task_type, num_task=num_task, task_adversarial=1e-2, get_pooled_output='task_output', feature_distillation=False, embedding_distillation=False, pretrained_embed=pretrained_embed, loss='contrastive_loss', apply_head_proj=False, task_seperate_proj=True, **kargs) result_dict = task_model_fn(features, labels, mode) tf.logging.info("****** task: *******", task_type_dict[task_type], task_type) elif task_type_dict[task_type] == "regression_task": task_model_fn = regression_model_fn( encoder[model_config_dict[task_type].model_type], model_config_dict[task_type], num_labels_dict[task_type], init_checkpoint_dict[task_type], reuse, load_pretrained_dict[task_type], model_io_config, opt_config, exclude_scope=exclude_scope_dict[task_type], not_storage_params=not_storage_params_dict[task_type], target=target_dict[task_type], label_lst=None, output_type=output_type, task_layer_reuse=task_layer_reuse, task_type=task_type, num_task=num_task, task_adversarial=1e-2, get_pooled_output='task_output', feature_distillation=False, embedding_distillation=False, pretrained_embed=pretrained_embed, loss='contrastive_loss', apply_head_proj=False, **kargs) result_dict = task_model_fn(features, labels, mode) tf.logging.info("****** task: *******", task_type_dict[task_type], task_type) else: continue print("==SUCCEEDED IN LODING==", task_type) # result_dict = task_model_fn(features, labels, mode) logits_dict[task_type] = result_dict["logits"] 
losses_dict[task_type] = result_dict["loss"] # task loss for key in [ "pos_num", "neg_num", "masked_lm_loss", "task_loss", "acc", "task_acc", "masked_lm_acc" ]: name = "{}_{}".format(task_type, key) if name in result_dict: hook_dict[name] = result_dict[name] hook_dict["{}_loss".format(task_type)] = result_dict["loss"] hook_dict["{}_num".format(task_type)] = result_dict["task_num"] print("==loss ratio==", task_type, multi_task_config[task_type].get('loss_ratio', 1.0)) total_loss += result_dict["loss"] * multi_task_config[ task_type].get('loss_ratio', 1.0) hook_dict['embed_loss'] = result_dict["embed_loss"] hook_dict['feature_loss'] = result_dict["feature_loss"] hook_dict["{}_task_loss".format( task_type)] = result_dict["task_loss"] if 'positive_label' in result_dict: hook_dict["{}_task_positive_label".format( task_type)] = result_dict["positive_label"] if mode == tf.estimator.ModeKeys.TRAIN: tvars.extend(result_dict["tvars"]) task_num += result_dict["task_num"] task_num_dict[task_type] = result_dict["task_num"] elif mode == tf.estimator.ModeKeys.EVAL: features[task_type] = result_dict["feature"] hook_dict["total_loss"] = total_loss if mode == tf.estimator.ModeKeys.TRAIN: model_io_fn = model_io.ModelIO(model_io_config) optimizer_fn = optimizer.Optimizer(opt_config) model_io_fn.print_params(list(set(tvars)), string=", trainable params") update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) print("==update_ops==", update_ops) with tf.control_dependencies(update_ops): train_op = optimizer_fn.get_train_op( total_loss, list(set(tvars)), opt_config.init_lr, opt_config.num_train_steps, **kargs) model_io_fn.set_saver(optimizer_fn.opt) if kargs.get("task_index", 1) == 1 and kargs.get( "run_config", None): model_io_fn.get_hooks(kargs.get("checkpoint_dir", None), kargs.get("num_storage_steps", 1000)) training_hooks = model_io_fn.checkpoint_hook elif kargs.get("task_index", 1) == 1: training_hooks = [] else: training_hooks = [] if len(optimizer_fn.distributed_hooks) >= 1: training_hooks.extend(optimizer_fn.distributed_hooks) print(training_hooks, "==training_hooks==", "==task_index==", kargs.get("task_index", 1)) if output_type == "sess": return { "train": { "total_loss": total_loss, "loss": losses_dict, "logits": logits_dict, "train_op": train_op, "task_num_dict": task_num_dict }, "hooks": train_hooks } elif output_type == "estimator": hook_dict['learning_rate'] = optimizer_fn.learning_rate logging_hook = tf.train.LoggingTensorHook(hook_dict, every_n_iter=100) training_hooks.append(logging_hook) print("==hook_dict==") print(hook_dict) for key in hook_dict: tf.summary.scalar(key, hook_dict[key]) for index, task_type in enumerate(task_type_dict.keys()): tmp = "{}_loss".format(task_type) if tmp == key: tf.summary.scalar( "loss_gap_{}".format(task_type), hook_dict["total_loss"] - hook_dict[key]) for key in task_num_dict: tf.summary.scalar(key + "_task_num", task_num_dict[key]) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, training_hooks=training_hooks) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: # eval execute for each class solo def metric_fn(logits, label_ids): """Computes the loss and accuracy of the model.""" sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]]) sentence_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) sentence_labels = tf.reshape(label_ids, [-1]) sentence_accuracy = tf.metrics.accuracy( labels=label_ids, predictions=sentence_predictions) sentence_f = tf_metrics.f1(label_ids, 
sentence_predictions, num_labels, label_lst, average="macro") eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy} return eval_metric_ops if output_type == "sess": return { "eval": { "logits": logits_dict, "total_loss": total_loss, "feature": features, "loss": losses_dict } } elif output_type == "estimator": eval_metric_ops = {} for key in logits_dict: eval_dict = metric_fn(logits_dict[key], features_task_dict[key]["label_ids"]) for sub_key in eval_dict.keys(): eval_key = "{}_{}".format(key, sub_key) eval_metric_ops[eval_key] = eval_dict[sub_key] estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss / task_num, eval_metric_ops=eval_metric_ops) return estimator_spec else: raise NotImplementedError()
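# The multi-task model_fn above accumulates total_loss as each task's loss scaled by its
# configured 'loss_ratio'. A minimal standalone sketch of that weighting (the dict layout
# here is illustrative, not the repo's exact multi_task_config schema):
def aggregate_task_losses(losses_dict, multi_task_config):
    """Sum per-task losses, each multiplied by its loss_ratio (default 1.0)."""
    total_loss = 0.0
    for task_type, task_loss in losses_dict.items():
        ratio = multi_task_config.get(task_type, {}).get('loss_ratio', 1.0)
        total_loss += ratio * task_loss
    return total_loss

# e.g. aggregate_task_losses({'cls_task': 0.9, 'embed_task': 0.4},
#                            {'cls_task': {'loss_ratio': 1.0},
#                             'embed_task': {'loss_ratio': 0.5}})  # -> 1.1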
def train_eval_fn(FLAGS, worker_count, task_index, is_chief, target, init_checkpoint, train_file, dev_file, checkpoint_dir, is_debug, **kargs): graph = tf.Graph() with graph.as_default(): import json config = json.load(open(FLAGS.config_file, "r")) config = Bunch(config) config.use_one_hot_embeddings = True config.scope = "bert" config.dropout_prob = 0.1 config.label_type = "single_label" config.model = FLAGS.model_type config.init_lr = 1e-4 config.ln_type = FLAGS.ln_type config.loss = 'entropy' print('==init learning rate==', config.init_lr) if FLAGS.if_shard == "0": train_size = FLAGS.train_size epoch = int(FLAGS.epoch / worker_count) elif FLAGS.if_shard == "1": train_size = int(FLAGS.train_size / worker_count) epoch = FLAGS.epoch else: train_size = int(FLAGS.train_size / worker_count) epoch = FLAGS.epoch init_lr = config.init_lr label_dict = json.load(tf.gfile.Open(FLAGS.label_id)) num_train_steps = int(train_size / FLAGS.batch_size * epoch) num_warmup_steps = int(num_train_steps * 0.1) num_storage_steps = int(train_size / FLAGS.batch_size) num_eval_steps = int(FLAGS.eval_size / FLAGS.batch_size) if is_debug == "0": num_storage_steps = 2 num_eval_steps = 10 num_train_steps = 10 print("num_train_steps {}, num_eval_steps {}, num_storage_steps {}". format(num_train_steps, num_eval_steps, num_storage_steps)) print(" model type {}".format(FLAGS.model_type)) print(num_train_steps, num_warmup_steps, "=============") if worker_count * kargs.get("num_gpus", 1) >= 2: clip_norm_scale = 1.0 lr_scale = 0.75 else: clip_norm_scale = 1.0 lr_scale = 1.0 lr = init_lr * worker_count * kargs.get("num_gpus", 1) * lr_scale # if lr >= 1e-3: # lr = 1e-3 lr = config.init_lr print('--training learning rate--', lr) opt_config = Bunch({ "init_lr": lr, "num_train_steps": num_train_steps, "num_warmup_steps": num_warmup_steps, "worker_count": worker_count, "opt_type": FLAGS.opt_type, "is_chief": is_chief, "train_op": kargs.get("train_op", "adam"), "decay": kargs.get("decay", "no"), "warmup": kargs.get("warmup", "no"), "clip_norm": 1 }) anneal_config = Bunch({ "initial_value": 1.0, "num_train_steps": num_train_steps }) model_io_config = Bunch({"fix_lm": False}) model_io_fn = model_io.ModelIO(model_io_config) num_classes = FLAGS.num_classes if FLAGS.opt_type == "hvd" and hvd: checkpoint_dir = checkpoint_dir if task_index == 0 else None elif FLAGS.opt_type == "all_reduce": checkpoint_dir = checkpoint_dir elif FLAGS.opt_type == "collective_reduce": checkpoint_dir = checkpoint_dir if task_index == 0 else None elif FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync": checkpoint_dir = checkpoint_dir if task_index == 0 else None print("==checkpoint_dir==", checkpoint_dir, is_chief) model_fn = model_fn_builder(config, num_classes, init_checkpoint, model_reuse=None, load_pretrained=FLAGS.load_pretrained, model_io_config=model_io_config, opt_config=opt_config, model_io_fn=model_io_fn, exclude_scope="", not_storage_params=[], target=kargs.get("input_target", ""), output_type="estimator", checkpoint_dir=checkpoint_dir, num_storage_steps=num_storage_steps, task_index=task_index, anneal_config=anneal_config, **kargs) name_to_features = { "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "label_ids": tf.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example. 
""" example = tf.parse_single_example(record, name_to_features) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. for name in list(example.keys()): t = example[name] if t.dtype == tf.int64: t = tf.to_int32(t) example[name] = t return example def _decode_batch_record(record, name_to_features): example = tf.parse_example(record, name_to_features) # for name in list(example.keys()): # t = example[name] # if t.dtype == tf.int64: # t = tf.to_int32(t) # example[name] = t return example params = Bunch({}) params.epoch = FLAGS.epoch params.batch_size = FLAGS.batch_size train_features = lambda: tf_data_utils.all_reduce_train_batch_input_fn( train_file, _decode_batch_record, name_to_features, params, if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index) eval_features = lambda: tf_data_utils.all_reduce_eval_batch_input_fn( dev_file, _decode_batch_record, name_to_features, params, if_shard=FLAGS.if_shard, worker_count=worker_count, task_index=task_index) sess_config = tf.ConfigProto(allow_soft_placement=False, log_device_placement=False) if FLAGS.opt_type == "ps" or FLAGS.opt_type == "ps_sync": print("==no need for hook==") elif FLAGS.opt_type == "pai_soar" and pai: print("no need for hook") elif FLAGS.opt_type == "hvd" and hvd: sess_config.gpu_options.allow_growth = True sess_config.gpu_options.visible_device_list = str(hvd.local_rank()) print("==no need fo hook==") else: print("==no need for hooks==") if kargs.get("run_config", None): run_config = kargs.get("run_config", None) run_config = run_config.replace( save_checkpoints_steps=num_storage_steps) print("==run config==", run_config.save_checkpoints_steps) else: run_config = tf.estimator.RunConfig( model_dir=checkpoint_dir, save_checkpoints_steps=num_storage_steps, session_config=sess_config) train_hooks = [] if kargs.get("profiler", "profiler") == "profiler": if checkpoint_dir: hooks = tf.train.ProfilerHook( save_steps=100, save_secs=None, output_dir=os.path.join(checkpoint_dir, "profiler"), ) train_hooks.append(hooks) print("==add profiler hooks==") model_estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=checkpoint_dir, config=run_config) train_being_time = time.time() tf.logging.info("==training distribution_strategy=={}".format( kargs.get("distribution_strategy", "MirroredStrategy"))) if kargs.get("distribution_strategy", "MirroredStrategy") == "MirroredStrategy": print("==apply single machine multi-card training==") # model_estimator.train(input_fn=train_features, # max_steps=num_train_steps, # hooks=train_hooks) train_spec = tf.estimator.TrainSpec(input_fn=train_features, max_steps=num_train_steps) eval_spec = tf.estimator.EvalSpec(input_fn=eval_features, steps=num_eval_steps) model_estimator.train(input_fn=train_features, max_steps=num_train_steps, hooks=train_hooks) # tf.estimator.train(model_estimator, train_spec) # tf.estimator.evaluate(model_estimator, eval_spec) train_end_time = time.time() print("==training time==", train_end_time - train_being_time) tf.logging.info("==training time=={}".format(train_end_time - train_being_time)) eval_results = model_estimator.evaluate(input_fn=eval_features, steps=num_eval_steps) # print(eval_results) elif kargs.get("distribution_strategy", "MirroredStrategy") in [ "ParameterServerStrategy", "CollectiveAllReduceStrategy" ]: print("==apply multi-machine machine multi-card training==") try: print(os.environ['TF_CONFIG'], "==tf_run_config==") except: print("==not tf config==") train_spec = 
tf.estimator.TrainSpec(input_fn=train_features,
                                                max_steps=num_train_steps)
            eval_spec = tf.estimator.EvalSpec(input_fn=eval_features,
                                              steps=num_eval_steps)
            tf.estimator.train_and_evaluate(model_estimator, train_spec, eval_spec)
            train_end_time = time.time()
            print("==training time==", train_end_time - train_being_time)
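# train_eval_fn above feeds the estimator with batch-level TFRecord parsing
# (_decode_batch_record / tf.parse_example) rather than per-record parsing. A minimal
# standalone input_fn in the same style (tf_data_utils is the repo's own helper; this
# sketch only illustrates the batch-then-parse pattern):
import tensorflow as tf

def batch_input_fn(file_pattern, name_to_features, batch_size, is_training=True):
    dataset = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    if is_training:
        dataset = dataset.repeat().shuffle(buffer_size=1024)
    dataset = dataset.batch(batch_size, drop_remainder=is_training)
    # parse a whole serialized batch at once, mirroring _decode_batch_record above
    dataset = dataset.map(
        lambda record: tf.parse_example(record, name_to_features))
    return dataset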
def model_fn(features, labels, mode, params): train_op_type = kargs.get('train_op_type', 'joint') ebm_noise_fce = EBM_NOISE_NCE( model_config_dict, num_labels_dict, init_checkpoint_dict, load_pretrained_dict, model_io_config=model_io_config, opt_config=opt_config, exclude_scope_dict=exclude_scope_dict, not_storage_params_dict=not_storage_params_dict, target_dict=target_dict, **kargs) model_io_fn = model_io.ModelIO(model_io_config) use_tpu = 1 if kargs.get('use_tpu', False) else 0 if mode == tf.estimator.ModeKeys.TRAIN: if kargs.get('use_tpu', False): optimizer_fn = optimizer.Optimizer(opt_config) use_tpu = 1 else: optimizer_fn = distributed_optimizer.Optimizer(opt_config) use_tpu = 0 train_op = get_train_op(ebm_noise_fce, optimizer_fn, opt_config, model_config_dict['ebm_dist'], model_config_dict['noise_dist'], model_config_dict['generator'], features, labels, mode, params, use_tpu=use_tpu, train_op_type=train_op_type, alternate_order=['ebm', 'generator']) ebm_noise_fce.load_pretrained_model(**kargs) var_checkpoint_dict_list = ebm_noise_fce.var_checkpoint_dict_list loss = ebm_noise_fce.loss tvars = ebm_noise_fce.tvars if len(var_checkpoint_dict_list) >= 1: scaffold_fn = model_io_fn.load_multi_pretrained( var_checkpoint_dict_list, use_tpu=use_tpu) else: scaffold_fn = None metric_dict = ebm_train_metric( ebm_noise_fce.true_ebm_dist_dict['logits'], ebm_noise_fce.fake_ebm_dist_dict['logits']) if not kargs.get('use_tpu', False): for key in metric_dict: tf.summary.scalar(key, metric_dict[key]) tf.summary.scalar("ebm_loss", ebm_noise_fce.ebm_opt_dict['ebm_loss']) tf.summary.scalar("mlm_loss", ebm_noise_fce.ebm_opt_dict['mlm_loss']) tf.summary.scalar("all_loss", ebm_noise_fce.ebm_opt_dict['all_loss']) model_io_fn.print_params(tvars, string=", trainable params") if kargs.get('use_tpu', False): estimator_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn) else: estimator_spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: ebm_noise_fce.get_loss(features, labels, mode, params, **kargs) ebm_noise_fce.load_pretrained_model(**kargs) var_checkpoint_dict_list = ebm_noise_fce.var_checkpoint_dict_list loss = ebm_noise_fce.loss if len(var_checkpoint_dict_list) >= 1: scaffold_fn = model_io_fn.load_multi_pretrained( var_checkpoint_dict_list, use_tpu=use_tpu) else: scaffold_fn = None tpu_eval_metrics = (ebm_eval_metric, [ ebm_noise_fce.true_ebm_dist_dict['logits'], ebm_noise_fce.fake_ebm_dist_dict['logits'] ]) gpu_eval_metrics = ebm_eval_metric( ebm_noise_fce.true_ebm_dist_dict['logits'], ebm_noise_fce.fake_ebm_dist_dict['logits']) if kargs.get('use_tpu', False): estimator_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, eval_metrics=tpu_eval_metrics, scaffold_fn=scaffold_fn) else: estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=gpu_eval_metrics) return estimator_spec else: raise NotImplementedError()
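# EBM_NOISE_NCE above trains the energy model against a noise/generator distribution. The
# standard noise-contrastive objective for that setup is sketched below; the log-density
# arguments are illustrative placeholders, not this repo's tensors, and the actual
# ebm_loss/mlm_loss terms logged above may differ in detail.
import tensorflow as tf

def nce_loss(ebm_logp_true, noise_logp_true, ebm_logp_fake, noise_logp_fake):
    """Binary NCE: classify real data vs. noise samples from the log-density gap."""
    true_logits = ebm_logp_true - noise_logp_true   # pushed towards "real"
    fake_logits = ebm_logp_fake - noise_logp_fake   # pushed towards "noise"
    true_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(true_logits), logits=true_logits)
    fake_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(fake_logits), logits=fake_logits)
    return tf.reduce_mean(true_loss + fake_loss)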
def model_fn(features, labels, mode, params): model_api = model_zoo(model_config) model = model_api(model_config, features, labels, mode, target, reuse=tf.AUTO_REUSE) if mode == tf.estimator.ModeKeys.TRAIN: dropout_prob = model_config.dropout_prob else: dropout_prob = 0.0 if model_io_config.fix_lm == True: scope = model_config.scope + "_finetuning" else: scope = model_config.scope (nsp_loss, nsp_per_example_loss, nsp_log_prob) = pretrain.get_next_sentence_output(model_config, model.get_pooled_output(), features['next_sentence_labels'], reuse=tf.AUTO_REUSE) masked_lm_positions = features["masked_lm_positions"] masked_lm_ids = features["masked_lm_ids"] masked_lm_weights = features["masked_lm_weights"] if model_config.model_type == 'bert': masked_lm_fn = pretrain.get_masked_lm_output print("==apply bert masked lm==") elif model_config.model_type == 'albert': masked_lm_fn = pretrain_albert.get_masked_lm_output print("==apply albert masked lm==") else: masked_lm_fn = pretrain.get_masked_lm_output print("==apply bert masked lm==") (masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs, masked_lm_mask) = masked_lm_fn( model_config, model.get_sequence_output(), model.get_embedding_table(), masked_lm_positions, masked_lm_ids, masked_lm_weights, reuse=tf.AUTO_REUSE, embedding_projection=model.get_embedding_projection_table()) print(model_config.lm_ratio, '==mlm lm_ratio==') loss = model_config.lm_ratio * masked_lm_loss #+ model_config.nsp_ratio * nsp_loss model_io_fn = model_io.ModelIO(model_io_config) pretrained_tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params) lm_pretrain_tvars = model_io_fn.get_params("cls/predictions", not_storage_params=not_storage_params) pretrained_tvars.extend(lm_pretrain_tvars) if load_pretrained == "yes": scaffold_fn = model_io_fn.load_pretrained(pretrained_tvars, init_checkpoint, exclude_scope=exclude_scope, use_tpu=1) else: scaffold_fn = None print("******* scaffold fn *******", scaffold_fn) if mode == tf.estimator.ModeKeys.TRAIN: optimizer_fn = optimizer.Optimizer(opt_config) tvars = pretrained_tvars model_io_fn.print_params(tvars, string=", trainable params") # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # with tf.control_dependencies(update_ops): print('==gpu count==', opt_config.get('gpu_count', 1)) train_op = optimizer_fn.get_train_op(loss, tvars, opt_config.init_lr, opt_config.num_train_steps, use_tpu=opt_config.use_tpu) train_metric_dict = train_metric_fn( masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, nsp_per_example_loss, nsp_log_prob, features['next_sentence_labels'], masked_lm_mask=masked_lm_mask ) # for key in train_metric_dict: # tf.summary.scalar(key, train_metric_dict[key]) # tf.summary.scalar('learning_rate', optimizer_fn.learning_rate) estimator_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, next_sentence_example_loss, next_sentence_log_probs, next_sentence_labels): """Computes the loss and accuracy of the model.""" masked_lm_log_probs = tf.reshape(masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]) masked_lm_predictions = tf.argmax( masked_lm_log_probs, axis=-1, output_type=tf.int32) masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) masked_lm_weights = tf.reshape(masked_lm_weights, 
[-1]) masked_lm_accuracy = tf.metrics.accuracy( labels=masked_lm_ids, predictions=masked_lm_predictions, weights=masked_lm_weights) masked_lm_mean_loss = tf.metrics.mean( values=masked_lm_example_loss, weights=masked_lm_weights) next_sentence_log_probs = tf.reshape( next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) next_sentence_predictions = tf.argmax( next_sentence_log_probs, axis=-1, output_type=tf.int32) next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) next_sentence_accuracy = tf.metrics.accuracy( labels=next_sentence_labels, predictions=next_sentence_predictions) next_sentence_mean_loss = tf.metrics.mean( values=next_sentence_example_loss) return { "masked_lm_accuracy": masked_lm_accuracy, "masked_lm_loss": masked_lm_mean_loss, "next_sentence_accuracy": next_sentence_accuracy, "next_sentence_loss": next_sentence_mean_loss } eval_metrics = (metric_fn, [ masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights, nsp_per_example_loss, nsp_log_prob, features['next_sentence_labels'] ]) estimator_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) return estimator_spec else: raise NotImplementedError()
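# pretrain.get_masked_lm_output above scores only the masked positions of the sequence
# output. The usual way to pull those rows out (as in the reference BERT code) is a
# flatten-and-gather; a minimal sketch, independent of this repo's helpers:
import tensorflow as tf

def gather_indexes(sequence_tensor, positions):
    """sequence_tensor: [batch, seq_len, width]; positions: [batch, num_masked].
    Returns the [batch * num_masked, width] rows at the masked positions."""
    shape = tf.shape(sequence_tensor)
    batch_size, seq_length, width = shape[0], shape[1], shape[2]
    flat_offsets = tf.reshape(tf.range(batch_size) * seq_length, [-1, 1])
    flat_positions = tf.reshape(tf.cast(positions, tf.int32) + flat_offsets, [-1])
    flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(flat_sequence, flat_positions)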
def model_fn(features, labels, mode): # model = bert_encoder(model_config, features, labels, # mode, target, reuse=model_reuse) model = albert_encoder(model_config, features, labels, mode, target, reuse=model_reuse) label_ids = features["label_ids"] if mode == tf.estimator.ModeKeys.TRAIN: dropout_prob = model_config.dropout_prob else: dropout_prob = 0.0 if model_io_config.fix_lm == True: scope = model_config.scope + "_finetuning" else: scope = model_config.scope with tf.variable_scope(scope, reuse=model_reuse): (loss, per_example_loss, logits) = classifier.classifier(model_config, model.get_pooled_output(), num_labels, label_ids, dropout_prob) model_io_fn = model_io.ModelIO(model_io_config) tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params) if load_pretrained == "yes": model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope=exclude_scope) if mode == tf.estimator.ModeKeys.TRAIN: optimizer_fn = optimizer.Optimizer(opt_config) model_io_fn.print_params(tvars, string=", trainable params") update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) print("==update_ops==", update_ops) with tf.control_dependencies(update_ops): train_op = optimizer_fn.get_train_op(loss, tvars, opt_config.init_lr, opt_config.num_train_steps, **kargs) # train_op, hooks = model_io_fn.get_ema_hooks(train_op, # tvars, # kargs.get('params_moving_average_decay', 0.99), # scope, mode, # first_stage_steps=opt_config.num_warmup_steps, # two_stage=True) model_io_fn.set_saver() estimator_spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: # _, hooks = model_io_fn.get_ema_hooks(None, # None, # kargs.get('params_moving_average_decay', 0.99), # scope, mode) hooks = None def metric_fn(per_example_loss, logits, label_ids): """Computes the loss and accuracy of the model.""" sentence_log_probs = tf.reshape( logits, [-1, logits.shape[-1]]) sentence_predictions = tf.argmax( logits, axis=-1, output_type=tf.int32) sentence_labels = tf.reshape(label_ids, [-1]) sentence_accuracy = tf.metrics.accuracy( labels=label_ids, predictions=sentence_predictions) sentence_mean_loss = tf.metrics.mean( values=per_example_loss) sentence_f = tf_metrics.f1(label_ids, sentence_predictions, num_labels, label_lst, average="macro") eval_metric_ops = { "f1": sentence_f, "acc":sentence_accuracy } return eval_metric_ops eval_metric_ops = metric_fn( per_example_loss, logits, label_ids) eval_hooks = [hooks] if hooks else [] estimator_spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops, evaluation_hooks=eval_hooks ) if output_type == "sess": return { "eval":{ "per_example_loss":per_example_loss, "logits":logits, "loss":tf.reduce_mean(per_example_loss) } } elif output_type == "estimator": return estimator_spec else: raise NotImplementedError()
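# The commented-out get_ema_hooks calls above refer to keeping a moving average of the
# parameters. A minimal sketch of how an exponential moving average is typically wired into
# a TF 1.x train_op (train_op_with_ema is an illustrative name, not the repo's helper API):
import tensorflow as tf

def train_op_with_ema(train_op, tvars, decay=0.99):
    """Update EMA shadow variables right after every optimizer step."""
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    with tf.control_dependencies([train_op]):
        return ema.apply(tvars)

# At eval/predict time a saver built from ema.variables_to_restore() would load the shadow
# weights instead of the raw trained weights.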
def model_fn(features, labels, mode): model_api = model_zoo(model_config) model = model_api(model_config, features, labels, mode, target, reuse=model_reuse, **kargs) label_ids = features["label_ids"] if mode == tf.estimator.ModeKeys.TRAIN: dropout_prob = model_config.dropout_prob else: dropout_prob = 0.0 if model_io_config.fix_lm == True: scope = model_config.scope + "_finetuning" else: scope = model_config.scope with tf.variable_scope(scope, reuse=model_reuse): (loss, per_example_loss, logits) = classifier.classifier(model_config, model.get_pooled_output(), num_labels, label_ids, dropout_prob) model_io_fn = model_io.ModelIO(model_io_config) tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params) try: params_size = model_io_fn.count_params(model_config.scope) print("==total params==", params_size) except: print("==not count params==") print(tvars) if load_pretrained == "yes": model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope=exclude_scope) if mode == tf.estimator.ModeKeys.TRAIN: optimizer_fn = optimizer.Optimizer(opt_config) model_io_fn.print_params(tvars, string=", trainable params") update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) print("==update_ops==", update_ops) with tf.control_dependencies(update_ops): train_op = optimizer_fn.get_train_op( loss, tvars, opt_config.init_lr, opt_config.num_train_steps, **kargs) model_io_fn.set_saver() if kargs.get("task_index", 1) == 0 and kargs.get( "run_config", None): training_hooks = [] elif kargs.get("task_index", 1) == 0: model_io_fn.get_hooks(kargs.get("checkpoint_dir", None), kargs.get("num_storage_steps", 1000)) training_hooks = model_io_fn.checkpoint_hook else: training_hooks = [] if len(optimizer_fn.distributed_hooks) >= 1: training_hooks.extend(optimizer_fn.distributed_hooks) print(training_hooks, "==training_hooks==", "==task_index==", kargs.get("task_index", 1)) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=loss, train_op=train_op, training_hooks=training_hooks) print(tf.global_variables(), "==global_variables==") if output_type == "sess": return { "train": { "loss": loss, "logits": logits, "train_op": train_op }, "hooks": training_hooks } elif output_type == "estimator": return estimator_spec elif mode == tf.estimator.ModeKeys.PREDICT: # if model_config.get('label_type', 'single_label') == 'single_label': # print(logits.get_shape(), "===logits shape===") # pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32) # prob = tf.nn.softmax(logits) # max_prob = tf.reduce_max(prob, axis=-1) # estimator_spec = tf.estimator.EstimatorSpec( # mode=mode, # predictions={ # 'pred_label':pred_label, # "max_prob":max_prob # }, # export_outputs={ # "output":tf.estimator.export.PredictOutput( # { # 'pred_label':pred_label, # "max_prob":max_prob # } # ) # } # ) if model_config.get('label_type', 'single_label') == 'multi_label': prob = tf.nn.sigmoid(logits) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, predictions={ 'pred_label': prob, "max_prob": prob }, export_outputs={ "output": tf.estimator.export.PredictOutput({ 'pred_label': prob, "max_prob": prob }) }) elif model_config.get('label_type', 'single_label') == "single_label": prob = tf.nn.softmax(logits) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, predictions={ 'pred_label': prob, "max_prob": prob }, export_outputs={ "output": tf.estimator.export.PredictOutput({ 'pred_label': prob, "max_prob": prob }) }) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, logits, 
label_ids): """Computes the loss and accuracy of the model.""" sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]]) sentence_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) sentence_labels = tf.reshape(label_ids, [-1]) sentence_accuracy = tf.metrics.accuracy( labels=label_ids, predictions=sentence_predictions) sentence_mean_loss = tf.metrics.mean(values=per_example_loss) sentence_f = tf_metrics.f1(label_ids, sentence_predictions, num_labels, label_lst, average="macro") eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy} return eval_metric_ops if output_type == "sess": return { "eval": { "per_example_loss": per_example_loss, "logits": logits, "loss": tf.reduce_mean(per_example_loss), "feature": model.get_pooled_output() } } elif output_type == "estimator": eval_metric_ops = metric_fn(per_example_loss, logits, label_ids) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) return estimator_spec else: raise NotImplementedError()
def model_fn(features, labels, mode): shape_lst_a = bert_utils.get_shape_list(features['input_ids_a']) batch_size_a = shape_lst_a[0] total_length_a = shape_lst_a[1] shape_lst_b = bert_utils.get_shape_list(features['input_ids_b']) batch_size_b = shape_lst_b[0] total_length_b = shape_lst_b[1] features['input_ids_a'] = tf.reshape(features['input_ids_a'], [-1, model_config.max_length]) features['segment_ids_a'] = tf.reshape(features['segment_ids_a'], [-1, model_config.max_length]) features['input_mask_a'] = tf.cast( tf.not_equal(features['input_ids_a'], kargs.get('[PAD]', 0)), tf.int64) features['input_ids_b'] = tf.reshape( features['input_ids_b'], [-1, model_config.max_predictions_per_seq]) features['segment_ids_b'] = tf.reshape( features['segment_ids_b'], [-1, model_config.max_predictions_per_seq]) features['input_mask_b'] = tf.cast( tf.not_equal(features['input_ids_b'], kargs.get('[PAD]', 0)), tf.int64) features['batch_size'] = batch_size_a features['total_length_a'] = total_length_a features['total_length_b'] = total_length_b model_dict = {} for target in ["a", "b"]: model = bert_encoder(model_config, features, labels, mode, target, reuse=tf.AUTO_REUSE) model_dict[target] = model if mode == tf.estimator.ModeKeys.TRAIN: dropout_prob = model_config.dropout_prob else: dropout_prob = 0.0 if model_io_config.fix_lm == True: scope = model_config.scope + "_finetuning" else: scope = model_config.scope with tf.variable_scope(scope, reuse=model_reuse): (loss, per_example_loss, logits, transition_params) = multi_position_crf_classifier( model_config, features, model_dict, num_labels, dropout_prob) model_io_fn = model_io.ModelIO(model_io_config) tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params) try: params_size = model_io_fn.count_params(model_config.scope) print("==total params==", params_size) except: print("==not count params==") print(tvars) if load_pretrained == "yes": model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope=exclude_scope) if mode == tf.estimator.ModeKeys.TRAIN: optimizer_fn = optimizer.Optimizer(opt_config) model_io_fn.print_params(tvars, string=", trainable params") update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) print("==update_ops==", update_ops) with tf.control_dependencies(update_ops): train_op = optimizer_fn.get_train_op( loss, tvars, opt_config.init_lr, opt_config.num_train_steps, **kargs) train_op, hooks = model_io_fn.get_ema_hooks( train_op, tvars, kargs.get('params_moving_average_decay', 0.99), scope, mode, first_stage_steps=opt_config.num_warmup_steps, two_stage=True) model_io_fn.set_saver() if kargs.get("task_index", 1) == 0 and kargs.get( "run_config", None): training_hooks = [] elif kargs.get("task_index", 1) == 0: model_io_fn.get_hooks(kargs.get("checkpoint_dir", None), kargs.get("num_storage_steps", 1000)) training_hooks = model_io_fn.checkpoint_hook else: training_hooks = [] if len(optimizer_fn.distributed_hooks) >= 1: training_hooks.extend(optimizer_fn.distributed_hooks) print(training_hooks, "==training_hooks==", "==task_index==", kargs.get("task_index", 1)) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=loss, train_op=train_op, training_hooks=training_hooks) print(tf.global_variables(), "==global_variables==") if output_type == "sess": return { "train": { "loss": loss, "logits": logits, "train_op": train_op }, "hooks": training_hooks } elif output_type == "estimator": return estimator_spec elif mode == tf.estimator.ModeKeys.PREDICT: print(logits.get_shape(), "===logits shape===") 
label_weights = tf.cast(features['label_weights'], tf.int32) label_seq_length = tf.reduce_sum(label_weights, axis=-1) decode_tags, best_score = tf.contrib.crf.crf_decode( logits, transition_params, label_seq_length) _, hooks = model_io_fn.get_ema_hooks( None, None, kargs.get('params_moving_average_decay', 0.99), scope, mode) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, predictions={ 'decode_tags': decode_tags, "best_score": best_score, "transition_params": transition_params, "logits": logits }, export_outputs={ "output": tf.estimator.export.PredictOutput({ 'decode_tags': decode_tags, "best_score": best_score, "transition_params": transition_params, "logits": logits }) }, prediction_hooks=[hooks]) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: _, hooks = model_io_fn.get_ema_hooks( None, None, kargs.get('params_moving_average_decay', 0.99), scope, mode) eval_hooks = [] if output_type == "sess": return { "eval": { "per_example_loss": per_example_loss, "logits": logits, "loss": tf.reduce_mean(per_example_loss), "feature": model.get_pooled_output() } } elif output_type == "estimator": eval_metric_ops = eval_logtis(logits, features, num_labels, transition_params) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=eval_metric_ops, evaluation_hooks=eval_hooks) return estimator_spec else: raise NotImplementedError()
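# multi_position_crf_classifier above returns unary logits plus CRF transition_params, and
# the PREDICT branch runs tf.contrib.crf.crf_decode on them with the true label lengths.
# A minimal standalone sketch of the matching train/decode pair (tensor names are
# illustrative):
import tensorflow as tf

def crf_loss_and_decode(unary_logits, tag_ids, seq_lengths):
    """unary_logits: [batch, max_len, num_tags]; tag_ids: [batch, max_len];
    seq_lengths: [batch] true lengths (e.g. reduce_sum over label_weights)."""
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
        unary_logits, tag_ids, seq_lengths)
    loss = tf.reduce_mean(-log_likelihood)
    decode_tags, best_score = tf.contrib.crf.crf_decode(
        unary_logits, transition_params, seq_lengths)
    return loss, decode_tags, best_score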
def model_fn(features, labels, mode): model_api = model_zoo(model_config) model = model_api(model_config, features, labels, mode, target, reuse=model_reuse) label_ids = features["label_ids"] if mode == tf.estimator.ModeKeys.TRAIN: dropout_prob = model_config.dropout_prob else: dropout_prob = 0.0 if model_io_config.fix_lm == True: scope = model_config.scope + "_finetuning" else: scope = model_config.scope with tf.variable_scope(scope, reuse=model_reuse): (loss, per_example_loss, logits) = classifier.classifier(model_config, model.get_pooled_output(), num_labels, label_ids, dropout_prob) label_loss = tf.reduce_sum( per_example_loss * features["label_ratio"]) / ( 1e-10 + tf.reduce_sum(features["label_ratio"])) if mode == tf.estimator.ModeKeys.TRAIN: distillation_api = distill.KnowledgeDistillation( kargs.get( "disitllation_config", Bunch({ "logits_ratio_decay": "constant", "logits_ratio": 0.5, "logits_decay_rate": 0.999, "distillation": ['relation_kd', 'logits'], "feature_ratio": 0.5, "feature_ratio_decay": "constant", "feature_decay_rate": 0.999, "kd_type": "kd", "scope": scope }))) # get teacher logits teacher_logit = tf.log(features["label_probs"] + 1e-10) / kargs.get( "temperature", 2.0) # log_softmax logits student_logit = tf.nn.log_softmax( logits / kargs.get("temperature", 2.0)) # log_softmax logits distillation_features = { "student_logits_tensor": student_logit, "teacher_logits_tensor": teacher_logit, "student_feature_tensor": model.get_pooled_output(), "teacher_feature_tensor": features["distillation_feature"], "student_label": tf.ones_like(label_ids, dtype=tf.int32), "teacher_label": tf.zeros_like(label_ids, dtype=tf.int32), "logits_ratio": kargs.get("logits_ratio", 0.5), "feature_ratio": kargs.get("logits_ratio", 0.5), "distillation_ratio": features["distillation_ratio"], "src_f_logit": logits, "tgt_f_logit": logits, "src_tensor": model.get_pooled_output(), "tgt_tensor": features["distillation_feature"] } distillation_loss = distillation_api.distillation( distillation_features, 2, dropout_prob, model_reuse, opt_config.num_train_steps, feature_ratio=1.0, logits_ratio_decay="constant", feature_ratio_decay="constant", feature_decay_rate=0.999, logits_decay_rate=0.999, logits_ratio=0.5, scope=scope + "/adv_classifier", num_classes=num_labels, gamma=kargs.get("gamma", 4)) loss = label_loss + distillation_loss["distillation_loss"] model_io_fn = model_io.ModelIO(model_io_config) tvars = model_io_fn.get_params(model_config.scope, not_storage_params=not_storage_params) print(tvars) if load_pretrained == "yes": model_io_fn.load_pretrained(tvars, init_checkpoint, exclude_scope=exclude_scope) if mode == tf.estimator.ModeKeys.TRAIN: optimizer_fn = optimizer.Optimizer(opt_config) model_io_fn.print_params(tvars, string=", trainable params") update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer_fn.get_train_op( loss, tvars, opt_config.init_lr, opt_config.num_train_steps, **kargs) model_io_fn.set_saver() if kargs.get("task_index", 1) == 0 and kargs.get( "run_config", None): training_hooks = [] elif kargs.get("task_index", 1) == 0: model_io_fn.get_hooks(kargs.get("checkpoint_dir", None), kargs.get("num_storage_steps", 1000)) training_hooks = model_io_fn.checkpoint_hook else: training_hooks = [] if len(optimizer_fn.distributed_hooks) >= 1: training_hooks.extend(optimizer_fn.distributed_hooks) print(training_hooks, "==training_hooks==", "==task_index==", kargs.get("task_index", 1)) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, 
loss=loss, train_op=train_op, training_hooks=training_hooks) if output_type == "sess": return { "train": { "loss": loss, "logits": logits, "train_op": train_op, "cross_entropy": label_loss, "distillation_loss": distillation_loss["distillation_loss"], "kd_num": tf.reduce_sum(features["distillation_ratio"]), "ce_num": tf.reduce_sum(features["label_ratio"]), "label_ratio": features["label_ratio"], "distilaltion_logits_loss": distillation_loss["distillation_logits_loss"], "distilaltion_feature_loss": distillation_loss["distillation_feature_loss"], "rkd_loss": distillation_loss["rkd_loss"] }, "hooks": training_hooks } elif output_type == "estimator": return estimator_spec elif mode == tf.estimator.ModeKeys.PREDICT: print(logits.get_shape(), "===logits shape===") pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32) prob = tf.nn.softmax(logits) max_prob = tf.reduce_max(prob, axis=-1) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, predictions={ 'pred_label': pred_label, "max_prob": max_prob }, export_outputs={ "output": tf.estimator.export.PredictOutput({ 'pred_label': pred_label, "max_prob": max_prob }) }) return estimator_spec elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, logits, label_ids): """Computes the loss and accuracy of the model.""" sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]]) sentence_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) sentence_labels = tf.reshape(label_ids, [-1]) sentence_accuracy = tf.metrics.accuracy( labels=label_ids, predictions=sentence_predictions) sentence_mean_loss = tf.metrics.mean(values=per_example_loss) sentence_f = tf_metrics.f1(label_ids, sentence_predictions, num_labels, label_lst, average="macro") eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy} return eval_metric_ops eval_metric_ops = metric_fn(per_example_loss, logits, label_ids) estimator_spec = tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) if output_type == "sess": return { "eval": { "per_example_loss": per_example_loss, "logits": logits, "loss": tf.reduce_mean(per_example_loss) } } elif output_type == "estimator": return estimator_spec else: raise NotImplementedError()
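# The distillation branch above softens both teacher and student logits with a temperature
# before comparing them. A minimal sketch of the classic soft-target term; the repo's
# distill.KnowledgeDistillation combines this with feature and relation losses, so this is
# only the logits part under stated assumptions:
import tensorflow as tf

def soft_target_loss(student_logits, teacher_probs, temperature=2.0):
    """Cross-entropy between temperature-softened teacher and student distributions."""
    teacher_soft = tf.nn.softmax(tf.log(teacher_probs + 1e-10) / temperature)
    student_log_soft = tf.nn.log_softmax(student_logits / temperature)
    per_example = -tf.reduce_sum(teacher_soft * student_log_soft, axis=-1)
    # the T^2 factor keeps soft-target gradients on the same scale as the hard-label loss
    return tf.reduce_mean(per_example) * temperature ** 2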
def main(_): hvd.init() sess_config = tf.ConfigProto() sess_config.gpu_options.visible_device_list = str(hvd.local_rank()) graph = tf.Graph() with graph.as_default(): import json config = json.load(open(FLAGS.config_file, "r")) init_checkpoint = FLAGS.init_checkpoint config = Bunch(config) config.use_one_hot_embeddings = True config.scope = "bert" config.dropout_prob = 0.1 config.label_type = "single_label" if FLAGS.if_shard == "0": train_size = FLAGS.train_size epoch = int(FLAGS.epoch / hvd.size()) elif FLAGS.if_shard == "1": train_size = int(FLAGS.train_size/hvd.size()) epoch = FLAGS.epoch tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.lower_case) classifier_data_api = classifier_processor.EvaluationProcessor() classifier_data_api.get_labels(FLAGS.label_id) train_examples = classifier_data_api.get_train_examples(FLAGS.train_file) write_to_tfrecords.convert_classifier_examples_to_features(train_examples, classifier_data_api.label2id, FLAGS.max_length, tokenizer, FLAGS.eval_data_file) init_lr = 2e-5 num_train_steps = int( train_size / FLAGS.batch_size * epoch) num_warmup_steps = int(num_train_steps * 0.1) num_storage_steps = int(train_size / FLAGS.batch_size) print(" model type {}".format(FLAGS.model_type)) print(num_train_steps, num_warmup_steps, "=============") opt_config = Bunch({"init_lr":init_lr/hvd.size(), "num_train_steps":num_train_steps, "num_warmup_steps":num_warmup_steps}) sess = tf.Session(config=sess_config) model_io_config = Bunch({"fix_lm":False}) model_io_fn = model_io.ModelIO(model_io_config) optimizer_fn = optimizer.Optimizer(opt_config) num_classes = FLAGS.num_classes model_eval_fn = bert_classifier.classifier_model_fn_builder(config, num_classes, init_checkpoint, reuse=False, load_pretrained=True, model_io_fn=model_io_fn, optimizer_fn=optimizer_fn, model_io_config=model_io_config, opt_config=opt_config) def metric_fn(features, logits, loss): print(logits.get_shape(), "===logits shape===") pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32) prob = tf.nn.softmax(logits) accuracy = correct = tf.equal( tf.cast(pred_label, tf.int32), tf.cast(features["label_ids"], tf.int32) ) accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) return {"accuracy":accuracy, "loss":loss, "pred_label":pred_label, "label_ids":features["label_ids"], "prob":prob} name_to_features = { "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64), "label_ids": tf.FixedLenFeature([], tf.int64), } def _decode_record(record, name_to_features): """Decodes a record to a TensorFlow example. """ example = tf.parse_single_example(record, name_to_features) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. 
for name in list(example.keys()): t = example[name] if t.dtype == tf.int64: t = tf.to_int32(t) example[name] = t return example params = Bunch({}) params.epoch = FLAGS.epoch params.batch_size = FLAGS.batch_size eval_features = tf_data_utils.eval_input_fn(FLAGS.eval_data_file, _decode_record, name_to_features, params, if_shard=FLAGS.if_shard) [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL) result = metric_fn(eval_features, eval_logits, eval_loss) init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) sess.run(hvd.broadcast_global_variables(0)) print("===horovod rank==={}".format(hvd.rank())) def eval_fn(result): i = 0 total_accuracy = 0 label, label_id, prob = [], [], [] while True: try: eval_result = sess.run(result) total_accuracy += eval_result["accuracy"] label_id.extend(eval_result["label_ids"]) label.extend(eval_result["pred_label"]) prob.extend(eval_result["prob"]) i += 1 except tf.errors.OutOfRangeError: print("End of dataset") break macro_f1 = f1_score(label_id, label, average="macro") micro_f1 = f1_score(label_id, label, average="micro") macro_precision = precision_score(label_id, label, average="macro") micro_precision = precision_score(label_id, label, average="micro") macro_recall = recall_score(label_id, label, average="macro") micro_recall = recall_score(label_id, label, average="micro") accuracy = accuracy_score(label_id, label) print("test accuracy {} macro_f1 score {} micro_f1 {} accuracy {}".format(total_accuracy/ i, macro_f1, micro_f1, accuracy)) return total_accuracy/ i, label_id, label, prob import time import time start = time.time() acc, true_label, pred_label, prob = eval_fn(result) end = time.time() print("==total time {} numbers of devices {}".format(end - start, hvd.size())) if hvd.rank() == 0: import _pickle as pkl pkl.dump({"true_label":true_label, "pred_label":pred_label, "prob":prob}, open(FLAGS.model_output+"/predict.pkl", "wb"))
def model_fn(features, labels, mode):
    model = gpt_encoder(model_config, features, labels,
                        mode, target, reuse=tf.AUTO_REUSE)
    scope = model_config.scope

    if mode == tf.estimator.ModeKeys.TRAIN:
        # batch x seq_length
        sequence_mask = tf.to_float(tf.not_equal(features['input_ids'][:, 1:],
                                                 kargs.get('[PAD]', 0)))
        # batch x seq_length
        seq_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=features['input_ids'][:, 1:],
            logits=model.get_sequence_output_logits()[:, :-1])
        per_example_loss = tf.reduce_sum(seq_loss * sequence_mask, axis=-1) / (
            tf.reduce_sum(sequence_mask, axis=-1) + 1e-10)
        loss = tf.reduce_mean(per_example_loss)

    model_io_fn = model_io.ModelIO(model_io_config)
    tvars = model_io_fn.get_params(model_config.scope,
                                   not_storage_params=not_storage_params)
    try:
        params_size = model_io_fn.count_params(model_config.scope)
        print("==total params==", params_size)
    except:
        print("==not count params==")
    print(tvars)

    if load_pretrained == "yes":
        model_io_fn.load_pretrained(tvars, init_checkpoint,
                                    exclude_scope=exclude_scope)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer_fn = optimizer.Optimizer(opt_config)
        model_io_fn.print_params(tvars, string=", trainable params")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        print("==update_ops==", update_ops)
        with tf.control_dependencies(update_ops):
            train_op = optimizer_fn.get_train_op(loss, tvars,
                                                 opt_config.init_lr,
                                                 opt_config.num_train_steps,
                                                 **kargs)

        model_io_fn.set_saver()

        if kargs.get("task_index", 1) == 0 and kargs.get("run_config", None):
            training_hooks = []
        elif kargs.get("task_index", 1) == 0:
            model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                  kargs.get("num_storage_steps", 1000))
            training_hooks = model_io_fn.checkpoint_hook
        else:
            training_hooks = []

        if len(optimizer_fn.distributed_hooks) >= 1:
            training_hooks.extend(optimizer_fn.distributed_hooks)
        print(training_hooks, "==training_hooks==", "==task_index==",
              kargs.get("task_index", 1))

        train_metric_dict = train_metric(features['input_ids'],
                                         model.get_sequence_output_logits(),
                                         **kargs)
        for key in train_metric_dict:
            tf.summary.scalar(key, train_metric_dict[key])
        tf.summary.scalar('learning_rate', optimizer_fn.learning_rate)
        tf.summary.scalar('seq_length',
                          tf.reduce_mean(tf.reduce_sum(sequence_mask, axis=-1)))

        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=loss,
                                                    train_op=train_op,
                                                    training_hooks=training_hooks)
        print(tf.global_variables(), "==global_variables==")
        if output_type == "sess":
            return {
                "train": {
                    "loss": loss,
                    # the flattened source referenced an undefined `logits` here;
                    # the sequence logits are the only logits available in this branch
                    "logits": model.get_sequence_output_logits(),
                    "train_op": train_op
                },
                "hooks": training_hooks
            }
        elif output_type == "estimator":
            return estimator_spec

    elif mode == tf.estimator.ModeKeys.PREDICT:
        if kargs.get('predict_type', 'sample_sequence') == 'sample_sequence':
            results = sample.sample_sequence(
                gpt_encoder, hparams=model_config,
                length=kargs.get('max_length', 64),
                start_token=None,
                batch_size=10,
                context=features['input_ids'],
                temperature=2,
                top_k=10)
            sampled_token = results['tokens'][:, 1:]
            sampled_token_logits = results['logits'][:, 1:]

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={
                    'token': sampled_token,
                    "logits": sampled_token_logits
                },
                export_outputs={
                    "output": tf.estimator.export.PredictOutput(
                        {
                            'token': sampled_token,
                            "logits": sampled_token_logits
                        }
                    )
                }
            )
            return estimator_spec
        elif kargs.get('predict_type', 'sample_sequence') == 'infer_inputs':
            sequence_mask = tf.to_float(tf.not_equal(features['input_ids'][:, 1:],
                                                     kargs.get('[PAD]', 0)))
            output_logits = model.get_sequence_output_logits()[:, :-1]
            # output_logits = tf.nn.log_softmax(output_logits, axis=-1)
            output_id_logits = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=features['input_ids'][:, 1:],
                logits=output_logits)
            per_example_perplexity = tf.reduce_sum(output_id_logits * sequence_mask,
                                                   axis=-1)  # batch
            per_example_perplexity /= tf.reduce_sum(sequence_mask, axis=-1)  # batch
            perplexity = tf.exp(per_example_perplexity)

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={
                    'token': features['input_ids'][:, 1:],
                    "logits": output_id_logits,
                    'perplexity': perplexity
                },
                export_outputs={
                    "output": tf.estimator.export.PredictOutput(
                        {
                            'token': features['input_ids'][:, 1:],
                            "logits": output_id_logits,
                            'perplexity': perplexity
                        }
                    )
                }
            )
            return estimator_spec

    elif mode == tf.estimator.ModeKeys.EVAL:
        def metric_fn(per_example_loss, logits, label_ids):
            """Computes the loss and accuracy of the model."""
            sentence_log_probs = tf.reshape(
                logits, [-1, logits.shape[-1]])
            sentence_predictions = tf.argmax(
                logits, axis=-1, output_type=tf.int32)
            sentence_labels = tf.reshape(label_ids, [-1])
            sentence_accuracy = tf.metrics.accuracy(
                labels=label_ids, predictions=sentence_predictions)
            sentence_mean_loss = tf.metrics.mean(
                values=per_example_loss)
            sentence_f = tf_metrics.f1(label_ids,
                                       sentence_predictions,
                                       num_labels,
                                       label_lst,
                                       average="macro")
            eval_metric_ops = {
                "f1": sentence_f,
                "acc": sentence_accuracy
            }
            return eval_metric_ops

        # NOTE: the flattened source used per_example_loss / logits / label_ids here
        # without defining them in the EVAL branch; the computation below mirrors
        # the TRAIN-branch sequence loss so this branch is self-contained.
        sequence_mask = tf.to_float(tf.not_equal(features['input_ids'][:, 1:],
                                                 kargs.get('[PAD]', 0)))
        logits = model.get_sequence_output_logits()[:, :-1]
        label_ids = features['input_ids'][:, 1:]
        seq_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=label_ids, logits=logits)
        per_example_loss = tf.reduce_sum(seq_loss * sequence_mask, axis=-1) / (
            tf.reduce_sum(sequence_mask, axis=-1) + 1e-10)
        loss = tf.reduce_mean(per_example_loss)

        if output_type == "sess":
            return {
                "eval": {
                    "per_example_loss": per_example_loss,
                    "logits": logits,
                    "loss": tf.reduce_mean(per_example_loss),
                    "feature": model.get_pooled_output()
                }
            }
        elif output_type == "estimator":
            eval_metric_ops = metric_fn(
                per_example_loss, logits, label_ids)
            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                eval_metric_ops=eval_metric_ops)
            return estimator_spec
    else:
        raise NotImplementedError()
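# --- Illustrative sketch (not part of the original source) ---
# The 'infer_inputs' branch above scores the observed tokens with the standard
# shift-by-one convention: logits at position t predict the token at position t+1,
# PAD positions are masked out, and perplexity is exp(mean token NLL). The helper
# below reproduces that arithmetic in plain numpy on toy inputs; the function name
# and its arguments are illustrative assumptions, not repo APIs.
import numpy as np

def _masked_perplexity_sketch(input_ids, logits, pad_id=0):
    """input_ids: [batch, seq] int array, logits: [batch, seq, vocab] float array."""
    labels = input_ids[:, 1:]                        # tokens to be predicted
    shifted_logits = logits[:, :-1]                  # logits that predict them
    # numerically stable log-softmax over the vocab axis
    m = shifted_logits.max(axis=-1, keepdims=True)
    log_probs = shifted_logits - m - np.log(
        np.sum(np.exp(shifted_logits - m), axis=-1, keepdims=True))
    # negative log-likelihood of each observed token: [batch, seq-1]
    token_nll = -np.take_along_axis(log_probs, labels[..., None], axis=-1)[..., 0]
    mask = (labels != pad_id).astype(np.float32)     # ignore PAD positions
    per_example_nll = (token_nll * mask).sum(-1) / (mask.sum(-1) + 1e-10)
    return np.exp(per_example_nll)                   # per-example perplexity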
def main(_):
    graph = tf.Graph()
    # from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json

        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=True)

        classifier_data_api = classifier_processor.PiarInteractionProcessor()

        eval_examples = classifier_data_api.get_test_examples(FLAGS.eval_data_file,
                                                              FLAGS.lang)
        print(eval_examples[0].guid)

        label_tensor = None
        label_id = json.load(open(FLAGS.label_id, "r"))

        num_choice = 3

        write_to_tfrecords.convert_interaction_classifier_examples_to_features_v1(
            eval_examples, label_id["label2id"], FLAGS.max_length,
            tokenizer, FLAGS.output_file)

        config = json.load(open(FLAGS.config_file, "r"))
        init_checkpoint = FLAGS.init_checkpoint

        max_seq_length = FLAGS.max_length * 2 + 3

        print("===init checkpoint==={}".format(init_checkpoint))

        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "esim/bert"
        config.dropout_prob = 0.2
        config.label_type = "single_label"
        config.lstm_dim = 128
        config.num_heads = 12
        config.num_units = 768

        # os.environ["CUDA_VISIBLE_DEVICES"] = "2"
        sess = tf.Session()

        opt_config = Bunch({"init_lr": 5e-5,
                            "num_train_steps": 0,
                            "num_warmup_steps": 0,
                            "train_op": "adam"})
        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        model_function = bert_esim.classifier_attn_model_fn_builder

        model_eval_fn = model_function(
            config, num_choice, init_checkpoint,
            model_reuse=None,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            input_name=["a", "b"],
            label_tensor=label_tensor,
            not_storage_params=["adam", "adam_1"],
            exclude_scope_dict={"task": "esim"})

        # def metric_fn(features, logits):
        #     print(logits.get_shape(), "===logits shape===")
        #     pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
        #     return {"pred_label": pred_label, "qas_id": features["qas_id"]}

        def metric_fn(features, logits):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.exp(tf.nn.log_softmax(logits))
            return {"pred_label": pred_label,
                    "qas_id": features["qas_id"],
                    "prob": prob}

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
            "qas_id": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
            # so cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = 2
        params.batch_size = 32

        eval_features = tf_data_utils.eval_input_fn(FLAGS.output_file,
                                                    _decode_record,
                                                    name_to_features,
                                                    params)

        [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(
            eval_features, [], tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits)

        model_io_fn.set_saver()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        model_io_fn.load_model(sess, init_checkpoint)
        print(" ==succeeded in loading model== ")

        def eval_fn(result):
            i = 0
            pred_label, qas_id, prob = [], [], []
            while True:
                try:
                    eval_result = sess.run(result)
                    pred_label.extend(eval_result["pred_label"].tolist())
                    qas_id.extend(eval_result["qas_id"].tolist())
                    prob.extend(eval_result["prob"].tolist())
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            return pred_label, qas_id, prob

        print("===========begin to eval============")
        [pred_label, qas_id, prob] = eval_fn(result)
        result = dict(zip(qas_id, pred_label))

        print(FLAGS.result_file.split("."))
        tmp_output = FLAGS.result_file.split(".")[0] + ".json"
        print(tmp_output, "===temp output===")
        json.dump({"id": qas_id,
                   "label": pred_label,
                   "prob": prob},
                  open(tmp_output, "w"))

        print(len(result), "=====valid result======")

        import pandas as pd
        df = pd.read_csv(FLAGS.eval_data_file)

        output = {}
        for index in range(df.shape[0]):
            output[df.loc[index]["id"]] = ""

        final_output = []
        cnt = 0
        for key in output:
            if key in result:
                final_output.append({"Id": key,
                                     "Category": label_id["id2label"][str(result[key])]})
                cnt += 1
            else:
                final_output.append({"Id": key, "Category": "unrelated"})

        df_out = pd.DataFrame(final_output)
        df_out.to_csv(FLAGS.result_file)
        print(len(output), cnt, len(final_output),
              "======num of results from model==========")
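# --- Illustrative sketch (not part of the original source) ---
# tf_data_utils.eval_input_fn is not shown in this file. The eval loop above only
# relies on it returning a batched tensor dict from a one-shot iterator that raises
# tf.errors.OutOfRangeError when the TFRecord file is exhausted, so a minimal
# TF1-style implementation is assumed to look roughly like this (the function name
# and defaults are assumptions):
def _eval_input_fn_sketch(input_file, _decode_record, name_to_features, params):
    dataset = tf.data.TFRecordDataset(input_file)
    # decode each serialized tf.Example with the same feature spec used above
    dataset = dataset.map(lambda record: _decode_record(record, name_to_features))
    dataset = dataset.batch(params.batch_size)
    # a one-shot iterator ends with tf.errors.OutOfRangeError, which the
    # while-True loop in eval_fn() catches to terminate evaluation
    return dataset.make_one_shot_iterator().get_next()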
def main(_):
    hvd.init()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    graph = tf.Graph()
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    with graph.as_default():
        import json

        # config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))
        config = json.load(open(FLAGS.config_file, "r"))
        init_checkpoint = FLAGS.init_checkpoint
        print("===init checkpoint==={}".format(init_checkpoint))
        # init_checkpoint = "/data/xuht/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
        # init_checkpoint = "/data/xuht/concat/model_1/oqmrc.ckpt"

        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "esim/bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"
        config.lstm_dim = 128
        config.num_heads = 4

        import json
        label_dict = json.load(open(FLAGS.label_id))

        # label_tensor = np.asarray(label_dict["class_ratio"]).astype(np.float32)
        label_tensor = None
        # config.loss = "focal_loss"

        json.dump(config, open(FLAGS.model_output + "/config.json", "w"))

        # os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_id
        sess = tf.Session(config=sess_config)

        train_size = int(FLAGS.train_size / hvd.size())

        num_train_steps = int(train_size / FLAGS.batch_size * FLAGS.epoch)
        num_warmup_steps = int(num_train_steps * 0.01)
        num_storage_steps = int(train_size / FLAGS.batch_size)

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": 5e-5 / hvd.size(),
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps,
            "train_op": "adam"
        })

        model_io_config = Bunch({"fix_lm": True})

        model_io_fn = model_io.ModelIO(model_io_config)

        num_choice = FLAGS.num_classes
        max_seq_length = FLAGS.max_length

        if FLAGS.model_type == "original":
            model_function = bert_order_classifier.classifier_model_fn_builder
        elif FLAGS.model_type == "attn":
            model_function = bert_order_classifier.classifier_attn_model_fn_builder
        elif FLAGS.model_type == "orignal_nonlinear":
            model_function = bert_order_classifier.classifier_model_fn_builder_v1
        elif FLAGS.model_type == "esim_bert":
            model_function = esim_bert.classifier_attn_model_fn_builder

        model_eval_fn = model_function(
            config, num_choice, init_checkpoint,
            model_reuse=None,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            model_io_config=model_io_config,
            opt_config=opt_config,
            input_name=["a", "b"],
            label_tensor=label_tensor,
            not_storage_params=["adam", "adam_1"],
            exclude_scope_dict={"task": "esim"})

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {
                "accuracy": accuracy,
                "loss": loss,
                "pred_label": pred_label,
                "label_ids": features["label_ids"]
            }

        name_to_features = {
            "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example."""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
            # so cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        # train_features = tf_data_utils.train_input_fn("/data/xuht/wsdm19/data/train.tfrecords",
        #                                               _decode_record, name_to_features, params)
        # eval_features = tf_data_utils.eval_input_fn("/data/xuht/wsdm19/data/dev.tfrecords",
        #                                             _decode_record, name_to_features, params)
        # train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
        #                                               _decode_record, name_to_features, params)
        eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
                                                    _decode_record,
                                                    name_to_features,
                                                    params)

        # [train_op, train_loss, train_per_example_loss, train_logits] = model_train_fn(
        #     train_features, [], tf.estimator.ModeKeys.TRAIN)
        [_, eval_loss, eval_per_example_loss, eval_logits] = model_eval_fn(
            eval_features, [], tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)

        model_io_fn.set_saver()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        model_io_fn.load_model(sess, init_checkpoint)
        print(" ==succeeded in loading model== ")

        sess.run(hvd.broadcast_global_variables(0))

        def eval_fn(result):
            i = 0
            total_accuracy = 0
            label, label_id = [], []
            # label_weight = []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    # for item in eval_result["label_ids"]:
                    #     label_weight.append(label_tensor[item])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            # f1 = f1_score(label_id, label, average="macro", sample_weight=label_weight)
            # accuracy = accuracy_score(label_id, label, sample_weight=label_weight)
            f1 = f1_score(label_id, label, average="macro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy {} {} f1 {}".format(
                total_accuracy / i, accuracy, f1))
            return total_accuracy / i, f1

        # print("===========begin to train============")
        # train_fn(train_op, train_loss)
        print("===========begin to eval============")
        accuracy, f1 = eval_fn(result)
        print("==accuracy {} f1 {}==".format(accuracy, f1))
def model_fn(features, labels, mode):
    task_type = 'all_neg'
    num_task = kargs.get('num_task', 1)

    model_io_fn = model_io.ModelIO(model_io_config)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dropout_prob = model_config.dropout_prob
        is_training = True
    else:
        dropout_prob = 0.0
        is_training = False

    if model_io_config.fix_lm == True:
        scope = model_config.scope + "_finetuning"
    else:
        scope = model_config.scope

    # NOTE: `model` is used below, but its construction did not survive in the
    # flattened source; it is assumed to be the pair encoder built by the
    # enclosing builder (cf. the model_zoo(...) pattern used elsewhere in this file).
    if kargs.get("get_pooled_output", "pooled_output") == "pooled_output":
        pooled_feature = model.get_pooled_output()
    elif kargs.get("get_pooled_output", "task_output") == "task_output":
        pooled_feature_dict = model.get_task_output()
        pooled_feature = pooled_feature_dict['pooled_feature']

    if kargs.get('apply_head_proj', False):
        with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
            feature_a = simclr_utils.projection_head(
                pooled_feature_dict['feature_a'],
                is_training,
                head_proj_dim=128,
                num_nlh_layers=1,
                head_proj_mode='nonlinear',
                name='head_contrastive')
            pooled_feature_dict['feature_a'] = feature_a
        with tf.variable_scope(scope + "/head_proj", reuse=tf.AUTO_REUSE):
            feature_b = simclr_utils.projection_head(
                pooled_feature_dict['feature_b'],
                is_training,
                head_proj_dim=128,
                num_nlh_layers=1,
                head_proj_mode='nonlinear',
                name='head_contrastive')
            pooled_feature_dict['feature_b'] = feature_b
        tf.logging.info("****** apply contrastive feature projection *******")

    # build all-negative pairs by shuffling the b-side within the batch
    shape_list = bert_utils.get_shape_list(pooled_feature, expected_rank=[2])
    batch_size = shape_list[0]
    random_batch = tf.random_shuffle(tf.range(batch_size))

    input_a = features['input_ids_a']
    input_b = tf.gather(features['input_ids_b'], random_batch)
    not_equal = tf.cast(tf.not_equal(input_a, input_b), tf.int32)
    not_equal = tf.reduce_sum(not_equal, axis=-1)
    # mask out pairs whose shuffled b-side is identical to the a-side
    loss_mask = tf.cast(tf.not_equal(not_equal, tf.zeros_like(not_equal)), tf.float32)

    feat_a = pooled_feature_dict['feature_a']
    feat_b = tf.gather(pooled_feature_dict['feature_b'], random_batch)
    pooled_feature_dict['feature_b'] = feat_b

    label_ids = tf.zeros_like(loss_mask)

    loss = tf.constant(0.0)

    params_size = model_io_fn.count_params(model_config.scope)
    print("==total encoder params==", params_size)

    if kargs.get("feature_distillation", True):
        universal_feature_a = features.get("input_ids_a_features", None)
        universal_feature_b = features.get("input_ids_b_features", None)

        if universal_feature_a is None or universal_feature_b is None:
            tf.logging.info("****** not apply feature distillation *******")
            feature_loss = tf.constant(0.0)
        else:
            feature_a = pooled_feature_dict['feature_a']
            feature_a_shape = bert_utils.get_shape_list(
                feature_a, expected_rank=[2, 3])
            pretrain_feature_a_shape = bert_utils.get_shape_list(
                universal_feature_a, expected_rank=[2, 3])
            if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                with tf.variable_scope(scope + "/feature_proj", reuse=tf.AUTO_REUSE):
                    proj_feature_a = tf.layers.dense(
                        feature_a, pretrain_feature_a_shape[-1])
                # with tf.variable_scope(scope+"/feature_rec", reuse=tf.AUTO_REUSE):
                #     proj_feature_a_rec = tf.layers.dense(proj_feature_a, feature_a_shape[-1])
                # loss += tf.reduce_mean(tf.reduce_sum(tf.square(proj_feature_a_rec-feature_a), axis=-1))/float(num_task)
                tf.logging.info("****** apply auto-encoder for feature compression *******")
            else:
                proj_feature_a = feature_a
            feature_a_norm = tf.stop_gradient(
                tf.sqrt(tf.reduce_sum(tf.pow(proj_feature_a, 2),
                                      axis=-1, keepdims=True)) + 1e-20)
            proj_feature_a /= feature_a_norm

            feature_b = pooled_feature_dict['feature_b']
            if feature_a_shape[-1] != pretrain_feature_a_shape[-1]:
                with tf.variable_scope(scope + "/feature_proj", reuse=tf.AUTO_REUSE):
                    proj_feature_b = tf.layers.dense(
                        feature_b, pretrain_feature_a_shape[-1])
                # with tf.variable_scope(scope+"/feature_rec", reuse=tf.AUTO_REUSE):
                #     proj_feature_b_rec = tf.layers.dense(proj_feature_b, feature_a_shape[-1])
                # loss += tf.reduce_mean(tf.reduce_sum(tf.square(proj_feature_b_rec-feature_b), axis=-1))/float(num_task)
                tf.logging.info("****** apply auto-encoder for feature compression *******")
            else:
                proj_feature_b = feature_b
            feature_b_norm = tf.stop_gradient(
                tf.sqrt(tf.reduce_sum(tf.pow(proj_feature_b, 2),
                                      axis=-1, keepdims=True)) + 1e-20)
            proj_feature_b /= feature_b_norm

            feature_a_distillation = tf.reduce_mean(
                tf.square(universal_feature_a - proj_feature_a), axis=-1)
            feature_b_distillation = tf.reduce_mean(
                tf.square(universal_feature_b - proj_feature_b), axis=-1)

            feature_loss = tf.reduce_mean(
                (feature_a_distillation + feature_b_distillation) / 2.0) / float(num_task)
            loss += feature_loss
            tf.logging.info("****** apply pretrained feature distillation *******")

    if kargs.get("embedding_distillation", True):
        word_embed = model.emb_mat
        random_embed_shape = bert_utils.get_shape_list(
            word_embed, expected_rank=[2, 3])
        print("==random_embed_shape==", random_embed_shape)
        pretrained_embed = kargs.get('pretrained_embed', None)
        if pretrained_embed is None:
            tf.logging.info("****** not apply pretrained embedding distillation *******")
            embed_loss = tf.constant(0.0)
        else:
            pretrain_embed_shape = bert_utils.get_shape_list(
                pretrained_embed, expected_rank=[2, 3])
            print("==pretrain_embed_shape==", pretrain_embed_shape)
            if random_embed_shape[-1] != pretrain_embed_shape[-1]:
                with tf.variable_scope(scope + "/embedding_proj", reuse=tf.AUTO_REUSE):
                    proj_embed = tf.layers.dense(word_embed, pretrain_embed_shape[-1])
            else:
                proj_embed = word_embed

            embed_loss = tf.reduce_mean(
                tf.reduce_mean(tf.square(proj_embed - pretrained_embed),
                               axis=-1)) / float(num_task)
            loss += embed_loss
            tf.logging.info("****** apply pretrained embedding distillation *******")

    if kargs.get('loss', 'contrastive_loss') == 'contrastive_loss':
        feature_a = tf.nn.l2_normalize(pooled_feature_dict['feature_a'], axis=-1)
        feature_b = tf.nn.l2_normalize(pooled_feature_dict['feature_b'], axis=-1)
        per_example_loss, logits = loss_utils.contrastive_loss(
            label_ids, feature_a, feature_b, kargs.get('margin', 1.0))
        tf.logging.info("****** contrastive_loss *******")
    elif kargs.get('loss', 'contrastive_loss') == 'exponent_neg_manhattan_distance_mse':
        feature_a = tf.nn.l2_normalize(pooled_feature_dict['feature_a'], axis=-1)
        feature_b = tf.nn.l2_normalize(pooled_feature_dict['feature_b'], axis=-1)
        per_example_loss, logits = loss_utils.exponent_neg_manhattan_distance(
            label_ids, feature_a, feature_b, 'mse')
        tf.logging.info("****** exponent_neg_manhattan_distance_mse *******")
    else:
        feature_a = tf.nn.l2_normalize(pooled_feature_dict['feature_a'], axis=-1)
        feature_b = tf.nn.l2_normalize(pooled_feature_dict['feature_b'], axis=-1)
        per_example_loss, logits = loss_utils.contrastive_loss(
            label_ids, feature_a, feature_b, kargs.get('margin', 1.0))
        tf.logging.info("****** contrastive_loss *******")

    masked_per_example_loss = per_example_loss * loss_mask
    task_loss = tf.reduce_sum(masked_per_example_loss) / (
        1e-10 + tf.reduce_sum(loss_mask))
    loss += task_loss / float(num_task)

    if mode == tf.estimator.ModeKeys.TRAIN:
        multi_task_config = kargs.get("multi_task_config", {})
        if multi_task_config.get(task_type, {}).get("lm_augumentation", False):
            print("==apply lm_augumentation==")
            masked_lm_positions = features["masked_lm_positions"]
            masked_lm_ids = features["masked_lm_ids"]
            masked_lm_weights = features["masked_lm_weights"]
            (masked_lm_loss,
             masked_lm_example_loss,
             masked_lm_log_probs) = pretrain.get_masked_lm_output(
                model_config,
                model.get_sequence_output(),
                model.get_embedding_table(),
                masked_lm_positions,
                masked_lm_ids,
                masked_lm_weights,
                reuse=model_reuse)

            masked_lm_loss_mask = tf.expand_dims(loss_mask, -1) * tf.ones(
                (1, multi_task_config[task_type]["max_predictions_per_seq"]))
            masked_lm_loss_mask = tf.reshape(masked_lm_loss_mask, (-1, ))

            masked_lm_label_weights = tf.reshape(masked_lm_weights, [-1])
            masked_lm_loss_mask *= tf.cast(masked_lm_label_weights, tf.float32)

            masked_lm_example_loss *= masked_lm_loss_mask  # multiply task_mask
            masked_lm_loss = tf.reduce_sum(masked_lm_example_loss) / (
                1e-10 + tf.reduce_sum(masked_lm_loss_mask))
            loss += multi_task_config[task_type]["masked_lm_loss_ratio"] * masked_lm_loss

            masked_lm_label_ids = tf.reshape(masked_lm_ids, [-1])

            print(masked_lm_log_probs.get_shape(), "===masked lm log probs===")
            print(masked_lm_label_ids.get_shape(), "===masked lm ids===")
            print(masked_lm_label_weights.get_shape(), "===masked lm mask===")

            lm_acc = build_accuracy(masked_lm_log_probs,
                                    masked_lm_label_ids,
                                    masked_lm_loss_mask)

        if kargs.get("task_invariant", "no") == "yes":
            print("==apply task adversarial training==")
            with tf.variable_scope(scope + "/dann_task_invariant", reuse=model_reuse):
                (_, task_example_loss,
                 task_logits) = distillation_utils.feature_distillation(
                    model.get_pooled_output(),
                    1.0,
                    features["task_id"],
                    kargs.get("num_task", 7),
                    dropout_prob,
                    True)
                masked_task_example_loss = loss_mask * task_example_loss
                masked_task_loss = tf.reduce_sum(masked_task_example_loss) / (
                    1e-10 + tf.reduce_sum(loss_mask))
                loss += kargs.get("task_adversarial", 1e-2) * masked_task_loss

    tvars = model_io_fn.get_params(model_config.scope,
                                   not_storage_params=not_storage_params)

    if mode == tf.estimator.ModeKeys.TRAIN:
        multi_task_config = kargs.get("multi_task_config", {})
        if multi_task_config.get(task_type, {}).get("lm_augumentation", False):
            print("==apply lm_augumentation==")
            masked_lm_pretrain_tvars = model_io_fn.get_params(
                "cls/predictions", not_storage_params=not_storage_params)
            tvars.extend(masked_lm_pretrain_tvars)

    try:
        params_size = model_io_fn.count_params(model_config.scope)
        print("==total params==", params_size)
    except:
        print("==not count params==")
    # print(tvars)
    if load_pretrained == "yes":
        model_io_fn.load_pretrained(tvars, init_checkpoint,
                                    exclude_scope=exclude_scope)

    if mode == tf.estimator.ModeKeys.TRAIN:
        acc = build_accuracy(logits, label_ids, loss_mask,
                             loss_type=kargs.get('loss', 'contrastive_loss'))

        return_dict = {
            "loss": loss,
            "logits": logits,
            "task_num": tf.reduce_sum(loss_mask),
            "tvars": tvars
        }
        return_dict["{}_acc".format(task_type)] = acc
        if kargs.get("task_invariant", "no") == "yes":
            return_dict["{}_task_loss".format(task_type)] = masked_task_loss
            task_acc = build_accuracy(task_logits, features["task_id"], loss_mask)
            return_dict["{}_task_acc".format(task_type)] = task_acc
        if multi_task_config.get(task_type, {}).get("lm_augumentation", False):
            return_dict["{}_masked_lm_loss".format(task_type)] = masked_lm_loss
            return_dict["{}_masked_lm_acc".format(task_type)] = lm_acc
        if kargs.get("embedding_distillation", True):
            return_dict["embed_loss"] = embed_loss * float(num_task)
        else:
            return_dict["embed_loss"] = task_loss
        if kargs.get("feature_distillation", True):
            return_dict["feature_loss"] = feature_loss * float(num_task)
        else:
            return_dict["feature_loss"] = task_loss
        return_dict["task_loss"] = task_loss
        return return_dict
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_dict = {
            "loss": loss,
            "logits": logits,
            "feature": model.get_pooled_output()
        }
        if kargs.get("adversarial", "no") == "adversarial":
            eval_dict["task_logits"] = task_logits
        return eval_dict
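# --- Illustrative sketch (not part of the original source) ---
# loss_utils.contrastive_loss is not shown in this file. Given that the branch
# above L2-normalizes both features, builds all-negative pairs by shuffling
# input_ids_b, and sets label_ids to zeros, it is assumed to be the standard
# margin-based contrastive loss; a minimal sketch of that formulation follows
# (the label sign convention and the returned "logit" are assumptions):
def _contrastive_loss_sketch(label_ids, feature_a, feature_b, margin=1.0):
    # Euclidean distance between the (already normalized) paired features
    distance = tf.sqrt(
        tf.reduce_sum(tf.square(feature_a - feature_b), axis=-1) + 1e-12)
    labels = tf.cast(label_ids, tf.float32)
    # label == 1: pull the pair together; label == 0 (the all-negative case
    # constructed above): push the pair apart up to the margin
    per_example_loss = labels * tf.square(distance) + \
        (1.0 - labels) * tf.square(tf.maximum(margin - distance, 0.0))
    # return the distance as the score consumed downstream
    return per_example_loss, distance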
def export_model_v1(config):

    opt_config = Bunch({"init_lr": 2e-5, "num_train_steps": 1e30, "cycle": False})
    model_io_config = Bunch({"fix_lm": False})

    bert_config = json.load(open(config["config_file"], "r"))
    model_config = Bunch(bert_config)

    model_config.use_one_hot_embeddings = True
    model_config.scope = "bert"
    model_config.dropout_prob = 0.1
    model_config.label_type = "single_label"

    with open(config["label2id"], "r") as frobj:
        label_dict = json.load(frobj)

    num_classes = len(label_dict["id2label"])
    max_seq_length = config["max_length"]

    def serving_input_receiver_fn():
        # receive raw tensors
        receiver_tensors = {
            "input_ids": tf.placeholder(tf.int32, [None, max_seq_length], name='input_ids'),
            "input_mask": tf.placeholder(tf.int32, [None, max_seq_length], name='input_mask'),
            "segment_ids": tf.placeholder(tf.int32, [None, max_seq_length], name='segment_ids'),
            "label_ids": tf.placeholder(tf.int32, [None], name='label_ids'),
        }

        # pass the received tensors through as the features expected by the model
        features = {}
        for key in receiver_tensors:
            features[key] = receiver_tensors[key]

        return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors,
                                                        features=features)

    # def serving_input_receiver_fn():
    #     # receive a serialized tf.Example instead of raw tensors
    #     serialized_tf_example = tf.placeholder(dtype=tf.string,
    #                                            shape=None,
    #                                            name='input_example_tensor')
    #     receiver_tensors = {'examples': serialized_tf_example}
    #     features = tf.parse_example(serialized_tf_example, feature_spec)
    #     return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)

    model_io_fn = model_io.ModelIO(model_io_config)

    model_fn = bert_classifier_estimator.classifier_model_fn_builder(
        model_config,
        num_classes,
        config["init_checkpoint"],
        reuse=None,
        load_pretrained=True,
        model_io_fn=model_io_fn,
        model_io_config=model_io_config,
        opt_config=opt_config)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=config["model_dir"])

    export_dir = estimator.export_savedmodel(config["export_path"],
                                             serving_input_receiver_fn,
                                             checkpoint_path=config["init_checkpoint"])

    print("===Succeeded in exporting saved model==={}".format(export_dir))
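# --- Illustrative sketch (not part of the original source) ---
# Once export_model_v1 has written the SavedModel, it can be queried offline with
# TF1's contrib predictor. The feed keys match the placeholders declared in
# serving_input_receiver_fn above; export_dir and the input arrays are assumptions
# supplied by the caller.
def _load_and_predict_sketch(export_dir, input_ids, input_mask, segment_ids, label_ids):
    from tensorflow.contrib import predictor
    # loads the default serving signature produced by estimator.export_savedmodel
    predict_fn = predictor.from_saved_model(export_dir)
    return predict_fn({
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "label_ids": label_ids
    })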
def model_fn(features, labels, mode):
    train_ops = []
    train_hooks = []
    logits_dict = {}
    losses_dict = {}
    features_dict = {}
    tvars = []
    task_num_dict = {}

    total_loss = tf.constant(0.0)

    task_num = 0

    encoder = {}
    hook_dict = {}

    print(task_type_dict.keys(), "==task type dict==")
    num_task = len(task_type_dict)

    for index, task_type in enumerate(task_type_dict.keys()):
        if model_config_dict[task_type].model_type in model_type_lst:
            reuse = True
        else:
            reuse = None
            model_type_lst.append(model_config_dict[task_type].model_type)
        if task_type_dict[task_type] == "cls_task":

            if model_config_dict[task_type].model_type not in encoder:
                model_api = model_zoo(model_config_dict[task_type])

                model = model_api(model_config_dict[task_type],
                                  features,
                                  labels,
                                  mode,
                                  target_dict[task_type],
                                  reuse=reuse)
                encoder[model_config_dict[task_type].model_type] = model

            print(encoder, "==encode==")

            task_model_fn = cls_model_fn(
                encoder[model_config_dict[task_type].model_type],
                model_config_dict[task_type],
                num_labels_dict[task_type],
                init_checkpoint_dict[task_type],
                reuse,
                load_pretrained_dict[task_type],
                model_io_config,
                opt_config,
                exclude_scope=exclude_scope_dict[task_type],
                not_storage_params=not_storage_params_dict[task_type],
                target=target_dict[task_type],
                label_lst=None,
                output_type=output_type,
                task_layer_reuse=task_layer_reuse,
                task_type=task_type,
                num_task=num_task,
                task_adversarial=1e-2,
                **kargs)
            print("==SUCCEEDED IN LOADING==", task_type)

            result_dict = task_model_fn(features, labels, mode)
            logits_dict[task_type] = result_dict["logits"]
            losses_dict[task_type] = result_dict["loss"]  # task loss
            for key in ["masked_lm_loss", "task_loss", "acc",
                        "task_acc", "masked_lm_acc"]:
                name = "{}_{}".format(task_type, key)
                if name in result_dict:
                    hook_dict[name] = result_dict[name]
            hook_dict["{}_loss".format(task_type)] = result_dict["loss"]
            total_loss += result_dict["loss"]

            if mode == tf.estimator.ModeKeys.TRAIN:
                tvars.extend(result_dict["tvars"])
                task_num += result_dict["task_num"]
                task_num_dict[task_type] = result_dict["task_num"]
            elif mode == tf.estimator.ModeKeys.EVAL:
                features[task_type] = result_dict["feature"]
        else:
            continue

    hook_dict["total_loss"] = total_loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        model_io_fn = model_io.ModelIO(model_io_config)

        optimizer_fn = optimizer.Optimizer(opt_config)

        model_io_fn.print_params(list(set(tvars)), string=", trainable params")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        print("==update_ops==", update_ops)

        with tf.control_dependencies(update_ops):
            train_op = optimizer_fn.get_train_op(
                total_loss, list(set(tvars)),
                opt_config.init_lr,
                opt_config.num_train_steps,
                **kargs)

        model_io_fn.set_saver(optimizer_fn.opt)

        if kargs.get("task_index", 1) == 0 and kargs.get("run_config", None):
            model_io_fn.get_hooks(kargs.get("checkpoint_dir", None),
                                  kargs.get("num_storage_steps", 1000))
            training_hooks = model_io_fn.checkpoint_hook
        elif kargs.get("task_index", 1) == 1:
            training_hooks = []
        else:
            training_hooks = []

        if len(optimizer_fn.distributed_hooks) >= 1:
            training_hooks.extend(optimizer_fn.distributed_hooks)
        print(training_hooks, "==training_hooks==", "==task_index==",
              kargs.get("task_index", 1))

        if output_type == "sess":
            return {
                "train": {
                    "total_loss": total_loss,
                    "loss": losses_dict,
                    "logits": logits_dict,
                    "train_op": train_op,
                    "task_num_dict": task_num_dict
                },
                "hooks": train_hooks
            }
        elif output_type == "estimator":
            hook_dict['learning_rate'] = optimizer_fn.learning_rate
            logging_hook = tf.train.LoggingTensorHook(hook_dict, every_n_iter=100)
            training_hooks.append(logging_hook)

            print("==hook_dict==")
            print(hook_dict)

            for key in hook_dict:
                tf.summary.scalar(key, hook_dict[key])
                for index, task_type in enumerate(task_type_dict.keys()):
                    tmp = "{}_loss".format(task_type)
                    if tmp == key:
                        tf.summary.scalar(
                            "loss_gap_{}".format(task_type),
                            hook_dict["total_loss"] - hook_dict[key])
            for key in task_num_dict:
                tf.summary.scalar(key + "_task_num", task_num_dict[key])

            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op)
                # training_hooks=training_hooks)
            return estimator_spec
    elif mode == tf.estimator.ModeKeys.EVAL:
        # eval is executed separately for each task
        def metric_fn(logits, label_ids):
            """Computes the loss and accuracy of the model."""
            sentence_log_probs = tf.reshape(logits, [-1, logits.shape[-1]])
            sentence_predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            sentence_labels = tf.reshape(label_ids, [-1])
            sentence_accuracy = tf.metrics.accuracy(
                labels=label_ids, predictions=sentence_predictions)
            sentence_f = tf_metrics.f1(label_ids,
                                       sentence_predictions,
                                       num_labels,
                                       label_lst,
                                       average="macro")
            eval_metric_ops = {"f1": sentence_f, "acc": sentence_accuracy}
            return eval_metric_ops

        if output_type == "sess":
            return {
                "eval": {
                    "logits": logits_dict,
                    "total_loss": total_loss,
                    "feature": features,
                    "loss": losses_dict
                }
            }
        elif output_type == "estimator":
            eval_metric_ops = {}
            # the flattened source iterated over an undefined `logits`; the
            # per-task logits collected above are the intended dict
            for key in logits_dict:
                eval_dict = metric_fn(logits_dict[key],
                                      features_task_dict[key]["label_ids"])
                for sub_key in eval_dict.keys():
                    eval_key = "{}_{}".format(key, sub_key)
                    eval_metric_ops[eval_key] = eval_dict[sub_key]
            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss / task_num,
                eval_metric_ops=eval_metric_ops)
            return estimator_spec
    else:
        raise NotImplementedError()