def test_adam(self):
    """Check that AdamWeightDecayOptimizer moves `w` onto the target `x`.

    Minimizes mean((x - w)^2) for 100 optimizer steps and asserts that the
    trained `w` matches `x` within a 1e-2 tolerance.
    """
    with self.test_session() as session:
        weights = tf.get_variable(
            "w",
            shape=[3],
            initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
        target = tf.constant([0.4, 0.2, -0.5])
        objective = tf.reduce_mean(tf.square(target - weights))

        train_vars = tf.trainable_variables()
        gradients = tf.gradients(objective, train_vars)
        step = tf.train.get_or_create_global_step()
        adam = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
        update_op = adam.apply_gradients(zip(gradients, train_vars), step)

        session.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        remaining = 100
        while remaining > 0:
            session.run(update_op)
            remaining -= 1

        trained = session.run(weights)
        self.assertAllClose(
            trained.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
def __init__(self, bert_config, num_labels, seq_length, init_checkpoint):
    """Build the multi-GPU sentence-pair classification graph on top of BERT.

    The fed batch is cut into `gpu_nums` contiguous slices of
    `batch_size // gpu_nums` rows, one slice per GPU tower. Inside a slice,
    rows are consumed as consecutive pairs: the pooled BERT outputs of rows
    2k and 2k+1 are concatenated and classified, and the label of row 2k is
    used as the label of the pair. Per-tower gradients are clipped, combined
    with `self.average_gradients` and applied by one
    `AdamWeightDecayOptimizer`.

    Reads the module-level names `gpu_nums`, `init_lr`, `num_train_steps`
    and `num_warmup_steps`.

    Args:
      bert_config: configuration object handed to `modeling.BertModel`.
      num_labels: number of output classes; the weighted loss below indexes
        classes 0..4 explicitly.
      seq_length: second dimension of the id/mask/segment placeholders.
      init_checkpoint: checkpoint path used to initialize matching
        variables; skipped when falsy.
    """
    self.bert_config = bert_config
    self.num_labels = num_labels
    self.seq_length = seq_length
    self.tower_grads = []
    self.losses = []

    self.input_ids = tf.placeholder(
        tf.int32, [None, self.seq_length], name='input_ids')
    self.input_mask = tf.placeholder(
        tf.int32, [None, self.seq_length], name='input_mask')
    self.segment_ids = tf.placeholder(
        tf.int32, [None, self.seq_length], name='segment_ids')
    self.labels = tf.placeholder(tf.int32, [None], name='labels')
    self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
    self.is_training = tf.placeholder(tf.bool, shape=[], name='is_training')
    print(self.batch_size)

    # Number of rows of the fed batch handled by each GPU tower.
    self.gpu_step = self.batch_size // gpu_nums

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Linear warm-up: while global_step < num_warmup_steps the learning rate
    # is `global_step / num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done
        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    optimizer = optimization.AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
        pred = []
        label = []
        for d in range(gpu_nums):
            with tf.device("/gpu:%s" % d), \
                    tf.name_scope("%s_%s" % ("tower", d)):
                # Rows of the fed batch that belong to this tower.
                lo = d * self.gpu_step
                hi = (d + 1) * self.gpu_step

                self.model = modeling.BertModel(
                    config=self.bert_config,
                    is_training=self.is_training,
                    input_ids=self.input_ids[lo:hi],
                    input_mask=self.input_mask[lo:hi],
                    token_type_ids=self.segment_ids[lo:hi])
                print("GPU:", d)

                tvars = tf.trainable_variables()
                initialized_variable_names = {}
                if init_checkpoint:
                    (assignment_map, initialized_variable_names
                     ) = modeling.get_assignment_map_from_checkpoint(
                         tvars, init_checkpoint)
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)

                logging.info("**** Trainable Variables ****")
                for var in tvars:
                    init_string = ""
                    if var.name in initialized_variable_names:
                        init_string = ", *INIT_FROM_CKPT*"
                    logging.info(" name = %s, shape = %s%s", var.name,
                                 var.shape, init_string)

                output_layer = self.model.get_pooled_output()
                logging.info(output_layer)

                # `self.is_training` is a placeholder, so it cannot be tested
                # with a Python `if` while the graph is being built (the old
                # `== True` check never followed the fed value). Let the
                # dropout layer switch on the fed boolean instead.
                # rate=0.1 is the former keep_prob=0.9.
                output_layer = tf.layers.dropout(
                    output_layer, rate=0.1, training=self.is_training)

                # Even rows and odd rows of the slice form the two halves of
                # each pair; concatenate them feature-wise.
                match_1 = tf.strided_slice(
                    output_layer, [0], [self.gpu_step], [2])
                match_2 = tf.strided_slice(
                    output_layer, [1], [self.gpu_step], [2])
                match = tf.concat([match_1, match_2], 1)

                self.logits = tf.layers.dense(
                    match, self.num_labels, name='fc', reuse=tf.AUTO_REUSE)

                # Predicted labels.
                self.y_pred_cls = tf.argmax(
                    tf.nn.softmax(self.logits), 1, name="pred")
                logging.info(self.y_pred_cls)

                # True labels: the label of the first row of every pair.
                self.r_labels = tf.strided_slice(
                    self.labels[lo:hi], [0], [self.gpu_step], [2])
                logging.info(self.r_labels)

                one_hot_labels = tf.one_hot(
                    self.r_labels, depth=self.num_labels, dtype=tf.float32)
                log_probs = tf.nn.log_softmax(self.logits, axis=-1)
                # Class-weighted cross entropy with fixed weights
                # 30/9/2/2/9 for classes 0..4.
                per_example_loss = (
                    - (30 * one_hot_labels[:, 0] * log_probs[:, 0])
                    - (9 * one_hot_labels[:, 1] * log_probs[:, 1])
                    - (2 * one_hot_labels[:, 2] * log_probs[:, 2])
                    - (2 * one_hot_labels[:, 3] * log_probs[:, 3])
                    - (9 * one_hot_labels[:, 4] * log_probs[:, 4])
                    + 1e-10)
                self.loss = tf.reduce_mean(per_example_loss)

                tvars = tf.trainable_variables()
                grads = tf.gradients(self.loss, tvars)
                (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
                self.tower_grads.append(list(zip(grads, tvars)))
                self.losses.append(self.loss)
                label.append(self.r_labels)
                pred.append(self.y_pred_cls)

                # Later towers share the variables created by this one.
                outer_scope.reuse_variables()

    with tf.name_scope("apply_gradients"), tf.device("/cpu:0"):
        gradients = self.average_gradients(self.tower_grads)
        train_op = optimizer.apply_gradients(
            gradients, global_step=global_step)
        # The global step is advanced explicitly, alongside the update op.
        new_global_step = global_step + 1
        self.train_op = tf.group(
            train_op, [global_step.assign(new_global_step)])
        self.losses = tf.reduce_mean(self.losses)
        self.pred = tf.concat(pred, 0)
        self.label = tf.concat(label, 0)
        logging.info(self.pred)
        logging.info(self.label)
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
    """Build a training op that updates only the non-"bert" variables.

    The learning rate decays linearly from `init_lr` to 0 over
    `num_train_steps`, optionally preceded by a linear warm-up of
    `num_warmup_steps` steps. Trainable variables whose name contains
    "bert" are left out of the gradient computation.

    Args:
      loss: scalar loss tensor to minimize.
      init_lr: peak learning rate.
      num_train_steps: number of steps over which the rate decays to 0.
      num_warmup_steps: warm-up length; falsy disables warm-up.
      use_tpu: wrap the optimizer in `CrossShardOptimizer` when true.

    Returns:
      An op that applies the clipped gradients and increments the global
      step.
    """
    step = tf.train.get_or_create_global_step()

    # Linear decay of the learning rate down to zero.
    base_lr = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
    lr = tf.train.polynomial_decay(
        base_lr,
        step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Linear warm-up: for step < num_warmup_steps the rate is
    # `step / num_warmup_steps * init_lr`.
    if num_warmup_steps:
        step_i32 = tf.cast(step, tf.int32)
        warmup_i32 = tf.constant(num_warmup_steps, dtype=tf.int32)
        step_f32 = tf.cast(step_i32, tf.float32)
        warmup_f32 = tf.cast(warmup_i32, tf.float32)
        warmup_lr = init_lr * (step_f32 / warmup_f32)
        in_warmup = tf.cast(step_i32 < warmup_i32, tf.float32)
        lr = (1.0 - in_warmup) * lr + in_warmup * warmup_lr

    # Same optimizer settings as BERT pre-training; the Adam m/v slots are
    # not restored from the initial checkpoint.
    optimizer = optimization.AdamWeightDecayOptimizer(
        learning_rate=lr,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    all_vars = tf.trainable_variables()
    print(all_vars)
    head_vars = []
    for variable in all_vars:
        if "bert" in variable.name:
            continue
        head_vars.append(variable)
    print("no bert")
    print(head_vars)

    # Clip to a global norm of 1.0, as in pre-training.
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(loss, head_vars), clip_norm=1.0)
    apply_op = optimizer.apply_gradients(
        zip(clipped, head_vars), global_step=step)

    # `AdamWeightDecayOptimizer` does not advance the global step inside
    # `apply_gradients`, so it is done here. Drop this when switching to an
    # optimizer that increments the step itself.
    bump_step = step.assign(step + 1)
    return tf.group(apply_op, [bump_step])
def compile(self):
    """Define loss, measures and the training op, then start a session.

    Three optimization set-ups are supported:
      * BERT embeddings with `config.use_bert_optimization`: delegates to
        `optimization.create_optimizer`.
      * BERT embeddings otherwise: `AdamWeightDecayOptimizer` on an
        exponentially decayed learning rate with linear warm-up.
      * Anything else: `tf.train.AdamOptimizer` on the same schedule.

    Sets `self.loss`, the measure tensors, `self.global_step`,
    `self.learning_rate`, `self.train_op` and `self.sess`. Must be called
    after the rest of the graph is defined, because it initializes all
    variables.
    """
    config = self.config

    # define operations for loss, measures, optimization
    self.loss = self.__compute_loss()
    self.accuracy, self.precision, self.recall, self.f1 = \
        self.__compute_measures()

    def _decayed_lr_with_warmup():
        """Exponential (staircase) decay preceded by a linear warm-up."""
        decayed = tf.train.exponential_decay(
            config.starter_learning_rate,
            self.global_step,
            config.decay_steps,
            config.decay_rate,
            staircase=True)
        # Without this guard a warm-up length of 0 divides by zero below and
        # the 0 * inf / 0 * nan term turns the whole learning rate into NaN.
        if not config.num_warmup_steps:
            return decayed
        # linear warmup, if global_step < num_warmup_steps, then
        # learning rate = (global_step / num_warmup_steps) * starter_learning_rate
        global_steps_int = tf.cast(self.global_step, tf.int32)
        warmup_steps_int = tf.constant(config.num_warmup_steps,
                                       dtype=tf.int32)
        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = (
            config.starter_learning_rate * warmup_percent_done)
        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        return ((1.0 - is_warmup) * decayed +
                is_warmup * warmup_learning_rate)

    def _clipped_grads_and_vars():
        """Gradients of the loss, clipped to `config.clip_norm`."""
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, tvars), config.clip_norm)
        return grads, tvars

    with tf.variable_scope('optimization'):
        self.global_step = tf.train.get_or_create_global_step()
        if 'bert' in config.emb_class:
            from bert import optimization
            if config.use_bert_optimization:
                # The schedule itself lives inside create_optimizer; this
                # attribute only exposes the starting rate.
                self.learning_rate = tf.constant(
                    value=config.starter_learning_rate,
                    shape=[],
                    dtype=tf.float32)
                self.train_op = optimization.create_optimizer(
                    self.loss, config.starter_learning_rate,
                    config.num_train_steps, config.num_warmup_steps, False)
            else:
                self.learning_rate = _decayed_lr_with_warmup()
                # Adam optimizer with correct L2 weight decay
                optimizer = optimization.AdamWeightDecayOptimizer(
                    learning_rate=self.learning_rate,
                    weight_decay_rate=0.01,
                    beta_1=0.9,
                    beta_2=0.999,
                    epsilon=1e-6,
                    exclude_from_weight_decay=[
                        "LayerNorm", "layer_norm", "bias"
                    ])
                grads, tvars = _clipped_grads_and_vars()
                train_op = optimizer.apply_gradients(
                    zip(grads, tvars), global_step=self.global_step)
                # The global step is advanced explicitly for this optimizer.
                new_global_step = self.global_step + 1
                self.train_op = tf.group(
                    train_op, [self.global_step.assign(new_global_step)])
        else:
            self.learning_rate = _decayed_lr_with_warmup()
            # Adam optimizer; apply_gradients advances the global step.
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            grads, tvars = _clipped_grads_and_vars()
            self.train_op = optimizer.apply_gradients(
                zip(grads, tvars), global_step=self.global_step)

    # create session, initialize variables. this should be placed at the end
    # of graph definitions.
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(config=session_conf)
    feed_dict = {self.wrd_embeddings_init: config.embvec.wrd_embeddings}
    # feed large embedding data
    sess.run(tf.global_variables_initializer(), feed_dict=feed_dict)
    sess.run(tf.local_variables_initializer())  # for tf_metrics
    self.sess = sess
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps,
                     use_tpu, exclude_bert):
    """Build the training op, optionally freezing the "bert" variables.

    Args:
      loss: scalar loss tensor to minimize.
      init_lr: peak learning rate.
      num_train_steps: number of steps over which the rate decays to 0.
      num_warmup_steps: linear warm-up length; falsy disables warm-up.
      use_tpu: wrap the optimizer in `CrossShardOptimizer` when true.
      exclude_bert: when true, trainable variables collected under the
        "bert" scope are not updated.

    Returns:
      An op that applies the clipped gradients and increments the global
      step.
    """
    step = tf.train.get_or_create_global_step()

    # Linear decay from init_lr to zero.
    lr = tf.train.polynomial_decay(
        tf.constant(value=init_lr, shape=[], dtype=tf.float32),
        step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Linear warm-up: for step < num_warmup_steps the rate is
    # `step / num_warmup_steps * init_lr`.
    if num_warmup_steps:
        step_i32 = tf.cast(step, tf.int32)
        warmup_i32 = tf.constant(num_warmup_steps, dtype=tf.int32)
        fraction_done = (tf.cast(step_i32, tf.float32) /
                         tf.cast(warmup_i32, tf.float32))
        in_warmup = tf.cast(step_i32 < warmup_i32, tf.float32)
        lr = (1.0 - in_warmup) * lr + in_warmup * (init_lr * fraction_done)

    # Same optimizer settings as BERT pre-training; the Adam m/v slots are
    # not restored from the initial checkpoint.
    optimizer = optimization.AdamWeightDecayOptimizer(
        learning_rate=lr,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if use_tpu:
        optimizer = tf.estimator.tpu.CrossShardOptimizer(optimizer)

    train_vars = tf.trainable_variables()
    if exclude_bert:
        frozen = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "bert")
        kept = []
        for candidate in train_vars:
            if candidate not in frozen:
                kept.append(candidate)
        train_vars = kept

    tf.logging.info("Training the following variables:")
    for variable in train_vars:
        tf.logging.info(variable.name)

    raw_grads = tf.gradients(
        loss, train_vars, colocate_gradients_with_ops=True)
    # Clip to a global norm of 1.0, as in pre-training.
    clipped, _ = tf.clip_by_global_norm(raw_grads, clip_norm=1.0)
    apply_op = optimizer.apply_gradients(
        zip(clipped, train_vars), global_step=step)

    # The global step is advanced explicitly, grouped with the update.
    return tf.group(apply_op, [step.assign(step + 1)])
def main(_):
    """Train mBERT with MultiDDS-style language sampling and/or predict.

    Training (`FLAGS.do_train`) keeps a learnable sampling distribution
    `phi` over `NUM_LANGS` languages. Every outer step it
      1. samples `FLAGS.M` single-example batches according to
         softmax(phi) and takes one model update per batch,
      2. scores each language by the cosine similarity between its training
         gradient and the summed gradient over all dev shards/languages,
      3. updates `phi` with a REINFORCE step using those scores as rewards.

    The training loop relies on eager execution: it calls `.numpy()`, uses
    `tf.GradientTape` and branches on tensor values in Python.

    Prediction (`FLAGS.do_predict`) runs an Estimator over each shard and
    writes one JSON prediction per line to `FLAGS.output_prediction_file`.
    """
    logging.set_verbosity(logging.INFO)
    bert_config = bert_modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    validate_flags_or_throw(bert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf_contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf_contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf_contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf_contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=is_per_host))

    NUM_LANGS = 11
    # Fixed seed so that every iterator created over the shuffled data sees
    # the same order; otherwise the take()/skip() train/dev split below is
    # re-drawn per iterator and the two sides overlap.
    SPLIT_SHUFFLE_SEED = 0

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        with tf.gfile.Open(FLAGS.record_count_file, "r") as f:
            num_train_features = int(f.read().strip())
        num_train_steps = int(num_train_features / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        logging.info("record_count_file: %s", FLAGS.record_count_file)
        logging.info("num_records (features): %d", num_train_features)
        logging.info("num_train_epochs: %d", FLAGS.num_train_epochs)
        logging.info("train_batch_size: %d", FLAGS.train_batch_size)
        logging.info("num_train_steps: %d", num_train_steps)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        logging.info("Running training on precomputed features")
        logging.info(" Num split examples = %d", num_train_features)
        logging.info(" Batch size = %d", FLAGS.train_batch_size)
        logging.info(" Num steps = %d", num_train_steps)

    model_fn = tydi_modeling.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    def input_fn_builder(input_file, seq_length, is_training, drop_remainder):
        """Return a shuffled, decoded `tf.data` dataset of TyDi features.

        `drop_remainder` is accepted for signature compatibility but unused.
        """
        # This needs to be kept in sync with `FeatureWriter`.
        name_to_features = {
            "language_id": tf.FixedLenFeature([], tf.int64),
            "unique_ids": tf.FixedLenFeature([], tf.int64),
            "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
            "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        }
        if is_training:
            name_to_features["start_positions"] = tf.FixedLenFeature(
                [], tf.int64)
            name_to_features["end_positions"] = tf.FixedLenFeature(
                [], tf.int64)
            name_to_features["answer_types"] = tf.FixedLenFeature(
                [], tf.int64)

        def _decode_record(record, name_to_features):
            example = tf.parse_single_example(record, name_to_features)
            # tf.Example only supports tf.int64, but the TPU only supports
            # tf.int32. So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        d = tf.data.TFRecordDataset(input_file)
        d = d.shuffle(
            buffer_size=100,
            seed=SPLIT_SHUFFLE_SEED,
            reshuffle_each_iteration=False)
        d = d.map(lambda record: _decode_record(record, name_to_features))
        return d

    def split_train_dev(dataset, train_port, num_dev):
        """Split into a train set and `num_dev` dev shards."""
        train_size = int(DATASET_SIZE * train_port)
        full_dataset = dataset.shuffle(
            100, seed=SPLIT_SHUFFLE_SEED, reshuffle_each_iteration=False)
        train_dataset = full_dataset.take(train_size)
        test_dataset = full_dataset.skip(train_size)
        shard_devs = [
            test_dataset.shard(num_shards=num_dev, index=i)
            for i in range(num_dev)
        ]
        return train_dataset, shard_devs

    def count_dataset(dataset):
        """Count the elements of `dataset` by iterating over it once."""
        cnt = 0
        for _unused in dataset.repeat(1).make_one_shot_iterator():
            cnt += 1
        return cnt

    def split_langs(dataset):
        """Return one filtered dataset per language id 0..NUM_LANGS-1."""
        return [
            dataset.filter(
                lambda x, lang=lang: tf.equal(x['language_id'], lang))
            for lang in range(NUM_LANGS)
        ]

    def sample_lang_id(lang_freq):
        """Draw one language id with probabilities proportional to weights."""
        return choices(range(NUM_LANGS), lang_freq)[0]

    def flatten_grads(grads, variables):
        """Concatenate per-variable gradients into one 1-D vector.

        Variables without a gradient contribute zeros, so vectors built from
        different batches always line up element for element.
        """
        pieces = []
        for grad, var in zip(grads, variables):
            if grad is None:
                pieces.append(tf.zeros([tf.size(var)], dtype=var.dtype))
            else:
                pieces.append(tf.reshape(tf.convert_to_tensor(grad), [-1]))
        return tf.concat(pieces, axis=0)

    if FLAGS.do_train:
        DATASET_SIZE = num_train_features
        # Expand the flag as a glob; TFRecordDataset does not expand patterns.
        train_filenames = tf.gfile.Glob(FLAGS.train_records_file)
        tf_dataset = input_fn_builder(train_filenames, 512, True, False)
        train_set, dev_shards = split_train_dev(
            tf_dataset, FLAGS.train_size, FLAGS.num_dev_sets)
        total_num_train_samples = FLAGS.train_size * DATASET_SIZE
        total_num_dev_samples = DATASET_SIZE - total_num_train_samples
        logging.info("Nums of examples in train dataset = %d",
                     total_num_train_samples)
        logging.info("Total numbers of examples in dev set = %d",
                     total_num_dev_samples)

        # train_set_langs is a 1-D list indexed by language id;
        # dev_set_langs is 2-D, indexed by [dev shard][language id].
        train_set_langs = split_langs(train_set)
        dev_set_langs = [split_langs(shard) for shard in dev_shards]

        # Initial sampling distribution: share of each language in the
        # training data.
        lang_sample_dist = []
        for lang in train_set_langs:
            lang_cnt = count_dataset(lang)
            print(lang_cnt)
            lang_sample_dist.append(lang_cnt / total_num_train_samples)

        train_samplers = [
            ds.repeat().batch(1).make_one_shot_iterator()
            for ds in train_set_langs
        ]
        dev_samplers = [[
            iter(ds.repeat().batch(1).make_one_shot_iterator())
            for ds in shard_langs
        ] for shard_langs in dev_set_langs]

        global_step = tf.train.get_or_create_global_step()
        learning_rate = tf.constant(
            value=FLAGS.learning_rate, shape=[], dtype=tf.float32)
        # Linear decay only. Warm-up is not applied in this script;
        # num_warmup_steps is only forwarded to model_fn_builder above.
        learning_rate = tf.train.polynomial_decay(
            learning_rate,
            global_step,
            num_train_steps,
            end_learning_rate=0.0,
            power=1.0,
            cycle=False)

        optimizer = opt.AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # MultiDDS training: phi holds the logits of the sampling
        # distribution over languages.
        phi = tf.get_variable(
            "phi", [NUM_LANGS],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        opt_scorer = tf.train.AdamOptimizer(
            learning_rate=0.001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
            use_locking=False,
            name='Adam')

        train_mode = tf.estimator.ModeKeys.TRAIN

        while global_step < num_train_steps:
            # The first step samples from the empirical language shares;
            # later steps sample from the learned distribution.
            if not tf.equal(global_step, 0):
                lang_sample_dist = list(tf.nn.softmax(phi).numpy())

            logging.info('We are sampling from train data')
            data_lst = []
            while len(data_lst) < FLAGS.M:
                cur_lang = sample_lang_id(lang_sample_dist)
                try:
                    data_lst.append(train_samplers[cur_lang].get_next())
                except tf.errors.OutOfRangeError:
                    # Language without training examples: draw again.
                    continue

            logging.info('Train mBert for multiple steps')
            for data in data_lst:
                with tf.GradientTape() as tape:
                    tvars, loss = model_fn(data, None, train_mode, None,
                                           global_step)
                grads = tape.gradient(loss, tvars)
                (grads, _unused_norm) = tf.clip_by_global_norm(
                    grads, clip_norm=1.0)
                optimizer.apply_gradients(
                    zip(grads, tvars), global_step=global_step)

            logging.info('Estimate the effect of each language')
            rewards = []
            for i in range(NUM_LANGS):
                # Some languages might not have samples.
                try:
                    train_test = train_samplers[i].get_next()
                except tf.errors.OutOfRangeError:
                    logging.info("No data in this train language!!! (%d)", i)
                    rewards.append(tf.constant(0.0))
                    continue

                with tf.GradientTape() as tape:
                    tvars, loss = model_fn(train_test, None, train_mode,
                                           None, global_step)
                grads = tape.gradient(loss, tvars)
                gradient_train = flatten_grads(grads, tvars)

                # NOTE(review): the original author was unsure whether the
                # model should really be updated on this probe batch; the
                # update is kept as written.
                (grads, _unused_norm) = tf.clip_by_global_norm(
                    grads, clip_norm=1.0)
                optimizer.apply_gradients(
                    zip(grads, tvars), global_step=global_step)

                logging.info("Testing effect on other languages")
                gradient_dev = None
                for k, shard_samplers in enumerate(dev_samplers):
                    for j in range(NUM_LANGS):
                        try:
                            dev_data = shard_samplers[j].get_next()
                        except tf.errors.OutOfRangeError:
                            print(j, 'language not exist in dataset', k)
                            continue
                        with tf.GradientTape() as tape:
                            tvars, loss = model_fn(dev_data, None,
                                                   train_mode, None,
                                                   global_step)
                        dev_vec = flatten_grads(
                            tape.gradient(loss, tvars), tvars)
                        gradient_dev = (dev_vec if gradient_dev is None else
                                        gradient_dev + dev_vec)

                if gradient_dev is None:
                    # No dev example in any shard: nothing to align with.
                    rewards.append(tf.constant(0.0))
                    continue

                # Reward = cosine similarity of train and dev gradients.
                normalize_a = tf.nn.l2_normalize(gradient_dev, 0)
                normalize_b = tf.nn.l2_normalize(gradient_train, 0)
                cos_similarity = tf.reduce_sum(
                    tf.multiply(normalize_a, normalize_b))
                logging.info("Reward for language %d: %s", i, cos_similarity)
                rewards.append(cos_similarity)

            logging.info("Optimize phi!")
            grad_phi = tf.zeros_like(phi)
            for i in range(NUM_LANGS):
                # The log-probability must be computed inside the tape,
                # otherwise its gradient with respect to phi is None.
                with tf.GradientTape() as tape:
                    log_i = tf.nn.log_softmax(phi)[i]
                grad_phi += tape.gradient(log_i, phi) * rewards[i]
            # Optimizers descend, REINFORCE ascends on the reward, hence the
            # sign flip. global_step is not passed here: AdamOptimizer would
            # increment it and the step is already advanced below.
            opt_scorer.apply_gradients([(-grad_phi, phi)])

            new_global_step = global_step + 1
            global_step.assign(new_global_step)

    if FLAGS.do_predict:
        # NOTE(review): the estimator was only constructed inside disabled
        # code, so this branch raised NameError. It is rebuilt here exactly
        # as that code did; confirm that `model_fn` still has the plain
        # Estimator signature (features, labels, mode, params).
        # If TPU is not available, this falls back to normal Estimator on
        # CPU or GPU.
        estimator = tf_contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

        if not FLAGS.precomputed_predict_file:
            predict_examples_iter = preproc.read_tydi_examples(
                input_file=FLAGS.predict_file,
                is_training=False,
                max_passages=FLAGS.max_passages,
                max_position=FLAGS.max_position,
                fail_on_invalid=FLAGS.fail_on_invalid,
                open_fn=tf_io.gopen)
            shards_iter = write_tf_feature_files(predict_examples_iter)
        else:
            # Uses zeros for example and feature counts since they're
            # unknown, and we only use them for logging anyway.
            shards_iter = enumerate(
                ((f, 0, 0)
                 for f in tf.gfile.Glob(FLAGS.precomputed_predict_file)), 1)

        # Accumulates all of the prediction results to be written to the
        # output.
        full_tydi_pred_dict = {}
        total_num_examples = 0
        for shard_num, (shard_filename, shard_num_examples,
                        shard_num_features) in shards_iter:
            total_num_examples += shard_num_examples
            logging.info(
                "Shard %d: Running prediction for %s; %d examples, %d features.",
                shard_num, shard_filename, shard_num_examples,
                shard_num_features)

            # Runs the model on the shard and store the individual results.
            # If running predict on TPU, you will need to specify the number
            # of steps.
            predict_input_fn = tf_io.input_fn_builder(
                input_file=[shard_filename],
                seq_length=FLAGS.max_seq_length,
                is_training=False,
                drop_remainder=False)
            all_results = []
            for result in estimator.predict(
                    predict_input_fn, yield_single_examples=True):
                if len(all_results) % 10000 == 0:
                    logging.info("Shard %d: Predicting for feature %d/%s",
                                 shard_num, len(all_results),
                                 shard_num_features)
                unique_id = int(result["unique_ids"])
                start_logits = [float(x) for x in result["start_logits"].flat]
                end_logits = [float(x) for x in result["end_logits"].flat]
                answer_type_logits = [
                    float(x) for x in result["answer_type_logits"].flat
                ]
                all_results.append(
                    tydi_modeling.RawResult(
                        unique_id=unique_id,
                        start_logits=start_logits,
                        end_logits=end_logits,
                        answer_type_logits=answer_type_logits))

            # Reads the prediction candidates from the (entire) prediction
            # input file.
            candidates_dict = read_candidates(FLAGS.predict_file)
            predict_features = [
                tf.train.Example.FromString(r)
                for r in tf.python_io.tf_record_iterator(shard_filename)
            ]
            logging.info("Shard %d: Post-processing predictions.", shard_num)
            logging.info(
                " Num candidate examples loaded (includes all shards): %d",
                len(candidates_dict))
            logging.info(" Num candidate features loaded: %d",
                         len(predict_features))
            logging.info(" Num prediction result features: %d",
                         len(all_results))
            logging.info(" Num shard features: %d", shard_num_features)

            tydi_pred_dict = postproc.compute_pred_dict(
                candidates_dict,
                predict_features, [r._asdict() for r in all_results],
                candidate_beam=FLAGS.candidate_beam)

            logging.info("Shard %d: Post-processed predictions.", shard_num)
            logging.info(" Num shard examples: %d", shard_num_examples)
            logging.info(" Num post-processed results: %d",
                         len(tydi_pred_dict))
            if shard_num_examples != len(tydi_pred_dict):
                logging.warning(" Num missing predictions: %d",
                                shard_num_examples - len(tydi_pred_dict))
            for key, value in tydi_pred_dict.items():
                if key in full_tydi_pred_dict:
                    logging.warning(
                        "ERROR: '%s' already in full_tydi_pred_dict!", key)
                full_tydi_pred_dict[key] = value

        logging.info("Prediction finished for all shards.")
        logging.info(" Total input examples: %d", total_num_examples)
        logging.info(" Total output predictions: %d",
                     len(full_tydi_pred_dict))

        with tf.gfile.Open(FLAGS.output_prediction_file, "w") as output_file:
            for prediction in full_tydi_pred_dict.values():
                output_file.write((json.dumps(prediction) + "\n").encode())
def model_fn(features, labels, mode, params):
    """Estimator `model_fn` for the BERT token-labelling model.

    Reads `bert_config`, `num_labels`, `init_checkpoint`,
    `starter_learning_rate` and `num_warmup_steps` from the enclosing scope.

    Args:
      features: dict of input tensors; "input_mask" and "label_ids" are read
        here in EVAL mode, the rest is consumed by `create_model`.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused.

    Returns:
      A `tf.estimator.EstimatorSpec` for the requested mode.
    """
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s" %
                        (name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    (loss, per_example_loss, logits, predicted_labels) = create_model(
        bert_config, is_training, features, num_labels)

    # Used for initializing BERT model from the checkpoint
    tvars = tf.trainable_variables()
    if init_checkpoint:
        (assignment_map, _
         ) = modeling.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        tf.logging.info("**** Loading from checkpoint ****")

    if mode == tf.estimator.ModeKeys.TRAIN:
        # learning rate: linear warmup with exponential decay
        global_step = tf.train.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(
            starter_learning_rate, global_step, 2000, 0.9, staircase=True)
        # Skip warm-up when its length is 0: dividing by zero below would
        # otherwise turn the whole learning rate into NaN.
        if num_warmup_steps:
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = starter_learning_rate * warmup_percent_done
            is_warmup = tf.cast(global_steps_int < warmup_steps_int,
                                tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate +
                             is_warmup * warmup_learning_rate)

        # Adam optimizer with correct L2 weight decay
        optimizer = optimization.AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 2.0)
        train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=global_step)
        # The global step is advanced explicitly, grouped with the update.
        new_global_step = global_step + 1
        train_op = tf.group(train_op, [global_step.assign(new_global_step)])

        tf.summary.scalar('learning_rate', learning_rate)
        logging_hook = tf.train.LoggingTensorHook({"batch_loss": loss},
                                                  every_n_iter=10)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
            training_hooks=[logging_hook])

    if mode == tf.estimator.ModeKeys.EVAL:
        # metric_fn that uses tf_metrics for calculating f1, precision and
        # recall for the punctuation classes
        def metric_fn(label_ids, predicted_labels, input_mask, num_labels):
            # Score only the positions selected by the input mask.
            label_ids = tf.boolean_mask(label_ids, input_mask)
            predicted_labels = tf.boolean_mask(predicted_labels, input_mask)
            precision = tf_metrics.precision(
                label_ids, predicted_labels, num_labels, [1, 2, 3],
                average="macro")
            recall = tf_metrics.recall(
                label_ids, predicted_labels, num_labels, [1, 2, 3],
                average="macro")
            f1 = tf_metrics.f1(
                label_ids, predicted_labels, num_labels, [1, 2, 3],
                average="macro")
            return {
                "eval_precision": precision,
                "eval_recall": recall,
                "eval_f": f1
            }

        # Metrics (and the label lookup they need) are built only for
        # evaluation, not for prediction.
        eval_metrics = metric_fn(features["label_ids"], predicted_labels,
                                 features["input_mask"], num_labels)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops=eval_metrics)

    predictions = {'labels': predicted_labels}
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
global_steps_int = tf.cast(global_step, tf.int32) warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) global_steps_float = tf.cast(global_steps_int, tf.float32) warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) warmup_percent_done = global_steps_float / warmup_steps_float warmup_learning_rate = lr * warmup_percent_done is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) optimizer = optimization.AdamWeightDecayOptimizer( learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) tvars = tf.trainable_variables() grads = tf.gradients(loss, tvars) # This is how the model was pre-trained. (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) train_op = optimizer.apply_gradients( # 在这个方法内部 可以调整是否需要训练 BERT 参数 zip(grads, tvars), global_step=global_step) new_global_step = global_step + 1 train_op = tf.group(train_op, [global_step.assign(new_global_step)]) # train_op = tf.train.AdamOptimizer(lr).minimize(loss) # 这个是传统的optimization #bert模型参数初始化的地方