def run_bert_pretrain(strategy, input_meta_data):
    """Runs BERT pre-training."""
    albert_config = AlbertConfig.from_json_file(FLAGS.albert_config_file)
    if not strategy:
        raise ValueError('Distribution strategy is not specified.')

    # Runs customized training loop.
    logging.info('Training using customized training loop TF 2.0 with '
                 'distributed strategy.')

    num_train_steps = None
    num_warmup_steps = None
    steps_per_epoch = None
    if FLAGS.do_train:
        len_train_examples = input_meta_data['train_data_size']
        steps_per_epoch = int(len_train_examples / FLAGS.train_batch_size)
        num_train_steps = int(len_train_examples / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    return run_customized_training(
        strategy, albert_config, input_meta_data['max_seq_length'],
        input_meta_data['max_predictions_per_seq'], FLAGS.output_dir,
        steps_per_epoch,
        steps_per_epoch,  # second copy presumably fills the steps-per-loop arg.
        FLAGS.num_train_epochs, FLAGS.learning_rate, num_warmup_steps,
        FLAGS.input_files, FLAGS.train_batch_size)
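# Hedged usage sketch (not part of this module): run_bert_pretrain is normally
# driven from an absl main that builds a distribution strategy and loads the
# metadata JSON produced during preprocessing. The metadata file name below is
# an illustrative assumption.
#
#     import json
#     import tensorflow as tf
#
#     strategy = tf.distribute.MirroredStrategy()
#     with tf.io.gfile.GFile('input_meta_data', 'rb') as reader:
#         input_meta_data = json.loads(reader.read().decode('utf-8'))
#     run_bert_pretrain(strategy, input_meta_data)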
def get_model_v2_bidaf(albert_config_dict, max_seq_length, init_checkpoint,
                       learning_rate, start_n_top, end_n_top, dropout,
                       num_train_steps, num_warmup_steps):
    """Returns a Keras model."""
    if isinstance(albert_config_dict, dict):
        albert_config = AlbertConfig.from_dict(albert_config_dict)
    else:
        albert_config = albert_config_dict
    print('new model ALBertQAModel_v2 ...')
    squad_model = ALBertQAModel_v2(albert_config, max_seq_length,
                                   init_checkpoint, start_n_top, end_n_top,
                                   dropout)

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=learning_rate,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)

    if FLAGS.optimizer == "LAMB":
        optimizer_fn = LAMB
    else:
        optimizer_fn = AdamWeightDecay

    optimizer = optimizer_fn(learning_rate=learning_rate_fn,
                             weight_decay_rate=FLAGS.weight_decay,
                             beta_1=0.9,
                             beta_2=0.999,
                             epsilon=FLAGS.adam_epsilon,
                             exclude_from_weight_decay=['layer_norm', 'bias'])

    squad_model.optimizer = optimizer
    return squad_model
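# Hedged usage sketch: building the SQuAD v2 model from a config dict. The
# hyperparameter values and checkpoint path below are illustrative
# assumptions, not repo defaults.
#
#     import json
#
#     with open('albert_config.json') as f:
#         albert_config_dict = json.load(f)
#     model = get_model_v2_bidaf(albert_config_dict,
#                                max_seq_length=384,
#                                init_checkpoint='ckpt/tf2_model.h5',
#                                learning_rate=3e-5,
#                                start_n_top=5,
#                                end_n_top=5,
#                                dropout=0.1,
#                                num_train_steps=10000,
#                                num_warmup_steps=1000)
#     # The optimizer is attached as an attribute; a custom training loop is
#     # expected to read model.optimizer rather than calling model.compile().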
def main(_):
    assert tf.version.VERSION.startswith('2.')
    output_path = FLAGS.converted_checkpoint_path
    v1_checkpoint = FLAGS.checkpoint_to_convert
    albert_config = AlbertConfig.from_json_file(FLAGS.albert_config_file)
    convert_checkpoint(albert_config, output_path, v1_checkpoint)
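# A minimal sketch of how this converter entry point is usually wired up with
# absl (assumed; the actual flag registration lives elsewhere in the file):
#
#     if __name__ == '__main__':
#         flags.mark_flag_as_required('albert_config_file')
#         flags.mark_flag_as_required('checkpoint_to_convert')
#         flags.mark_flag_as_required('converted_checkpoint_path')
#         app.run(main)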
def main(_):
    logging.set_verbosity(logging.INFO)
    if FLAGS.enable_xla:
        set_config_v2(FLAGS.enable_xla)

    strategy = None
    if FLAGS.strategy_type == "one":
        strategy = tf.distribute.OneDeviceStrategy("GPU:0")
    elif FLAGS.strategy_type == "mirror":
        strategy = tf.distribute.MirroredStrategy()
    else:
        raise ValueError('The distribution strategy type is not supported: %s'
                         % FLAGS.strategy_type)

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
        input_meta_data = json.loads(reader.read().decode('utf-8'))

    num_labels = input_meta_data["num_labels"]
    FLAGS.max_seq_length = input_meta_data["max_seq_length"]
    processor_type = input_meta_data['processor_type']

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be "
            "True.")

    albert_config = AlbertConfig.from_json_file(FLAGS.albert_config_file)
    if FLAGS.max_seq_length > albert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the ALBERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, albert_config.max_position_embeddings))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    num_train_steps = None
    num_warmup_steps = None
    steps_per_epoch = None
    if FLAGS.do_train:
        len_train_examples = input_meta_data['train_data_size']
        steps_per_epoch = int(len_train_examples / FLAGS.train_batch_size)
        num_train_steps = int(len_train_examples / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    loss_multiplier = 1.0 / strategy.num_replicas_in_sync

    with strategy.scope():
        model = get_model(albert_config=albert_config,
                          max_seq_length=FLAGS.max_seq_length,
                          num_labels=num_labels,
                          init_checkpoint=FLAGS.init_checkpoint,
                          learning_rate=FLAGS.learning_rate,
                          num_train_steps=num_train_steps,
                          num_warmup_steps=num_warmup_steps,
                          loss_multiplier=loss_multiplier)
    model.summary()

    # Defined outside the do_train block so that evaluation can also run
    # standalone.
    eval_input_fn = functools.partial(create_classifier_dataset,
                                      FLAGS.eval_data_path,
                                      seq_length=FLAGS.max_seq_length,
                                      batch_size=FLAGS.eval_batch_size,
                                      is_training=False,
                                      drop_remainder=False)

    if FLAGS.do_train:
        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len_train_examples)
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = functools.partial(create_classifier_dataset,
                                           FLAGS.train_data_path,
                                           seq_length=FLAGS.max_seq_length,
                                           batch_size=FLAGS.train_batch_size,
                                           drop_remainder=False)

        with strategy.scope():
            summary_dir = os.path.join(FLAGS.output_dir, 'summaries')
            summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
            checkpoint_path = os.path.join(FLAGS.output_dir, 'checkpoint')
            checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path, save_weights_only=True)
            custom_callbacks = [summary_callback, checkpoint_callback]

            def metric_fn():
                # STS-B is a regression task; the other GLUE tasks are
                # classification.
                if FLAGS.task_name.lower() == "sts":
                    return tf.keras.metrics.MeanSquaredError(dtype=tf.float32)
                return tf.keras.metrics.SparseCategoricalAccuracy(
                    dtype=tf.float32)

            if FLAGS.custom_training_loop:
                if FLAGS.task_name.lower() == "sts":
                    loss_fn = get_loss_fn_v2(loss_factor=loss_multiplier)
                else:
                    loss_fn = get_loss_fn(num_labels,
                                          loss_factor=loss_multiplier)
                model = run_customized_training_loop(
                    strategy=strategy,
                    model=model,
                    loss_fn=loss_fn,
                    model_dir=checkpoint_path,
                    train_input_fn=train_input_fn,
                    steps_per_epoch=steps_per_epoch,
                    epochs=FLAGS.num_train_epochs,
                    eval_input_fn=eval_input_fn,
                    eval_steps=int(input_meta_data['eval_data_size'] /
                                   FLAGS.eval_batch_size),
                    metric_fn=metric_fn,
                    custom_callbacks=custom_callbacks)
            else:
                training_dataset = train_input_fn()
                evaluation_dataset = eval_input_fn()
                model.fit(x=training_dataset,
                          validation_data=evaluation_dataset,
                          epochs=FLAGS.num_train_epochs,
                          callbacks=custom_callbacks)

    if FLAGS.do_eval:
        len_eval_examples = input_meta_data['eval_data_size']

        logging.info("***** Running evaluation *****")
        logging.info("  Num examples = %d", len_eval_examples)
        logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        evaluation_dataset = eval_input_fn()
        with strategy.scope():
            loss, accuracy = model.evaluate(evaluation_dataset)
        print(f"Loss: {loss}, Accuracy: {accuracy}")

    if FLAGS.do_predict:
        logging.info("***** Running prediction *****")
        flags.mark_flag_as_required("input_data_dir")
        flags.mark_flag_as_required("predict_data_path")
        tokenizer = tokenization.FullTokenizer(
            vocab_file=None,
            spm_model_file=FLAGS.spm_model_file,
            do_lower_case=FLAGS.do_lower_case)

        processors = {
            "cola": classifier_data_lib.ColaProcessor,
            "sts": classifier_data_lib.StsbProcessor,
            "sst": classifier_data_lib.Sst2Processor,
            "mnli": classifier_data_lib.MnliProcessor,
            "qnli": classifier_data_lib.QnliProcessor,
            "qqp": classifier_data_lib.QqpProcessor,
            "rte": classifier_data_lib.RteProcessor,
            "mrpc": classifier_data_lib.MrpcProcessor,
            "wnli": classifier_data_lib.WnliProcessor,
            "xnli": classifier_data_lib.XnliProcessor,
        }
        task_name = FLAGS.task_name.lower()
        if task_name not in processors:
            raise ValueError("Task not found: %s" % task_name)

        processor = processors[task_name]()
        predict_examples = processor.get_test_examples(FLAGS.input_data_dir)
        label_list = processor.get_labels()
        label_map = {i: label for i, label in enumerate(label_list)}

        classifier_data_lib.file_based_convert_examples_to_features(
            predict_examples, label_list, input_meta_data['max_seq_length'],
            tokenizer, FLAGS.predict_data_path)

        predict_input_fn = functools.partial(
            create_classifier_dataset,
            FLAGS.predict_data_path,
            seq_length=input_meta_data['max_seq_length'],
            batch_size=FLAGS.eval_batch_size,
            is_training=False,
            drop_remainder=False)
        prediction_dataset = predict_input_fn()

        with strategy.scope():
            logits = model.predict(prediction_dataset)

        if FLAGS.task_name.lower() == "sts":
            predictions = logits
            probabilities = logits
        else:
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            probabilities = tf.nn.softmax(logits, axis=-1)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir,
                                          "submit_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as pred_writer, \
                tf.io.gfile.GFile(output_submit_file, "w") as sub_writer:
            logging.info("***** Predict results *****")
            for example, probability, prediction in zip(
                    predict_examples, probabilities, predictions):
                output_line = "\t".join(
                    str(class_probability.numpy())
                    for class_probability in probability) + "\n"
                pred_writer.write(output_line)
                actual_label = label_map[int(prediction)]
                sub_writer.write(
                    six.ensure_str(example.guid) + "\t" + actual_label + "\n")
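# For reference, the metadata file read by main() must carry at least the keys
# used above: num_labels, max_seq_length, processor_type, train_data_size and
# eval_data_size. A hypothetical example for MNLI (values are illustrative):
#
#     {
#         "processor_type": "MnliProcessor",
#         "num_labels": 3,
#         "max_seq_length": 128,
#         "train_data_size": 392702,
#         "eval_data_size": 9815
#     }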
def predict_squad(strategy):
    """Makes predictions for a SQuAD dataset."""
    albert_config = AlbertConfig.from_json_file(FLAGS.albert_config_file)
    doc_stride = FLAGS.doc_stride
    max_query_length = FLAGS.max_query_length

    eval_examples = squad_lib.read_squad_examples(
        input_file=FLAGS.predict_file, is_training=False)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=None,
        spm_model_file=FLAGS.spm_model_file,
        do_lower_case=FLAGS.do_lower_case)

    eval_writer = squad_lib.FeatureWriter(
        filename=os.path.join(FLAGS.model_dir.split(',')[0],
                              'eval.tf_record'),
        is_training=False)
    eval_features = []

    def _append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on.
    dataset_size = squad_lib.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        output_fn=_append_feature)
    eval_writer.close()

    logging.info('***** Running predictions *****')
    logging.info('  Num orig examples = %d', len(eval_examples))
    logging.info('  Num split examples = %d', len(eval_features))
    logging.info('  Batch size = %d', FLAGS.predict_batch_size)

    num_steps = math.ceil(dataset_size / FLAGS.predict_batch_size)
    all_results1, all_results2 = predict_squad_customized(
        strategy, albert_config, eval_writer.filename, num_steps)

    # Write one set of prediction files into each of the two model dirs.
    model_dirs = FLAGS.model_dir.split(',')
    for model_dir, all_results in zip(model_dirs[:2],
                                      (all_results1, all_results2)):
        output_prediction_file = os.path.join(model_dir, 'predictions.json')
        output_nbest_file = os.path.join(model_dir, 'nbest_predictions.json')
        output_null_log_odds_file = os.path.join(model_dir, 'null_odds.json')

        if FLAGS.version_2_with_negative:
            squad_lib.write_predictions_v2(
                eval_examples, eval_features, all_results, FLAGS.n_best_size,
                FLAGS.max_answer_length, output_prediction_file,
                output_nbest_file, output_null_log_odds_file,
                FLAGS.start_n_top, FLAGS.end_n_top)
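# Note on FLAGS.model_dir: this function expects a comma-separated pair of
# model directories; predictions from the two models are written side by side.
# A hedged invocation sketch (script name and paths are assumptions):
#
#     python run_squad.py \
#         --model_dir=/models/albert_run1,/models/albert_run2 \
#         --predict_file=dev-v2.0.json \
#         --version_2_with_negative=true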
def train_squad(strategy,
                input_meta_data,
                custom_callbacks=None,
                run_eagerly=False):
    """Runs BERT SQuAD training."""
    if strategy:
        logging.info('Training using customized training loop with '
                     'distribution strategy.')

    # Enables XLA in Session Config. Should not be set for TPU.
    if FLAGS.enable_xla:
        set_config_v2(FLAGS.enable_xla)

    num_train_examples = input_meta_data['train_data_size']
    max_seq_length = input_meta_data['max_seq_length']
    steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size)
    num_train_steps = int(num_train_examples / FLAGS.train_batch_size *
                          FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    with strategy.scope():
        albert_config = AlbertConfig.from_json_file(FLAGS.albert_config_file)
        if FLAGS.version_2_with_negative:
            model = get_model_v2(albert_config,
                                 input_meta_data['max_seq_length'],
                                 FLAGS.init_checkpoint, FLAGS.learning_rate,
                                 FLAGS.start_n_top, FLAGS.end_n_top,
                                 FLAGS.squad_dropout, num_train_steps,
                                 num_warmup_steps)
        else:
            model = get_model_v1(albert_config,
                                 input_meta_data['max_seq_length'],
                                 FLAGS.init_checkpoint, FLAGS.learning_rate,
                                 num_train_steps, num_warmup_steps)

        if FLAGS.version_2_with_negative:
            train_input_fn = functools.partial(
                input_pipeline.create_squad_dataset_v2,
                FLAGS.train_data_path,
                max_seq_length,
                FLAGS.train_batch_size,
                is_training=True)
        else:
            train_input_fn = functools.partial(
                input_pipeline.create_squad_dataset,
                FLAGS.train_data_path,
                max_seq_length,
                FLAGS.train_batch_size,
                is_training=True)

        # Scale each replica's loss by 1/num_replicas_in_sync so that
        # summing the per-replica gradients yields the global-batch mean.
        if FLAGS.version_2_with_negative:
            loss_fn = get_loss_fn_v2(
                loss_factor=1.0 / strategy.num_replicas_in_sync)
        else:
            loss_fn = get_loss_fn(
                loss_factor=1.0 / strategy.num_replicas_in_sync)

        trained_model = run_customized_training_loop(
            strategy=strategy,
            model=model,
            loss_fn=loss_fn,
            model_dir=FLAGS.model_dir,
            train_input_fn=train_input_fn,
            steps_per_epoch=steps_per_epoch,
            # steps_per_loop=steps_per_epoch,
            epochs=FLAGS.num_train_epochs,
            run_eagerly=run_eagerly,
            custom_callbacks=custom_callbacks)
    return trained_model
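# Minimal numeric sketch of the loss scaling above (illustrative values, not
# from this repo): under MirroredStrategy the per-replica gradients are
# summed, so pre-scaling each replica's loss by 1/N recovers the global mean.
#
#     num_replicas = 2
#     per_replica_loss = 1.5                     # mean loss on each shard
#     sum_unscaled = 2 * 1.5                     # 3.0 -> gradients 2x too big
#     sum_scaled = 2 * (1.5 / num_replicas)      # 1.5, matches 1-replica run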
def create_sop(ptd, config):
    return ptd.create_sop(config)


def full_document_sampling(ptd, config):
    return ptd.full_document_sampling(config)


def create_mlm(ptd, config):
    return ptd.create_mlm(config)


def create_sbo(ptd, config):
    return ptd.create_sbo(config)


def none():
    pass


if __name__ == '__main__':
    # Demo: build an ALBERT transformer and stream pre-training batches.
    # Guarded by __main__ so that importing this module has no side effects.
    from albert import AlbertConfig, Albert
    import tokenization

    bert_config = AlbertConfig.from_json('bert_config.json')
    transformer = Albert(bert_config)
    bertx = PreTrainModel('ptm_config.json', transformer)
    train_iterator = bertx.create_instances(
        '00.txt', tokenization.ChineseWordpieceTokenizer('vocab.txt'))
    for batch in train_iterator:
        bertx(batch)
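# One plausible consumer of the thin wrappers above: a task-name -> builder
# dispatch table for assembling pre-training instances. This table is a
# hypothetical illustration; the real wiring may live elsewhere in the repo.
#
#     TASK_BUILDERS = {
#         'sop': create_sop,
#         'mlm': create_mlm,
#         'sbo': create_sbo,
#         'full_document_sampling': full_document_sampling,
#     }
#     instances = TASK_BUILDERS['mlm'](ptd, config)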
def main(_):
    tfhub_model_path = FLAGS.tf_hub_path

    max_seq_length = 512
    float_type = tf.float32

    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,),
                                           dtype=tf.int32,
                                           name='input_word_ids')
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,),
                                       dtype=tf.int32,
                                       name='input_mask')
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,),
                                           dtype=tf.int32,
                                           name='input_type_ids')

    if FLAGS.version == 2:
        albert_config = AlbertConfig.from_json_file(
            os.path.join(tfhub_model_path, "assets", "albert_config.json"))
    else:
        albert_config = AlbertConfig.from_json_file(
            os.path.join("model_configs", FLAGS.model, "config.json"))

    # Read every variable of the TF-Hub SavedModel into memory as numpy
    # arrays, keyed by the variable name without the ':0' suffix.
    tags = []
    stock_values = {}
    with tf.Graph().as_default():
        sm = tf.compat.v2.saved_model.load(tfhub_model_path, tags=tags)
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            stock_values = {
                v.name.split(":")[0]: v.read_value()
                for v in sm.variables
            }
            stock_values = sess.run(stock_values)

    loaded_weights = set()
    skip_count = 0
    weight_value_tuples = []
    skipped_weight_value_tuples = []

    # Build the target Keras model: either the bare encoder or the full
    # pre-training model (encoder plus heads).
    if FLAGS.model_type == "albert_encoder":
        albert_layer = AlbertModel(config=albert_config,
                                   float_type=float_type)
        pooled_output, sequence_output = albert_layer(input_word_ids,
                                                      input_mask,
                                                      input_type_ids)
        albert_model = tf.keras.Model(
            inputs=[input_word_ids, input_mask, input_type_ids],
            outputs=[pooled_output, sequence_output])
        albert_params = albert_model.weights
        param_values = tf.keras.backend.batch_get_value(albert_model.weights)
    else:
        albert_full_model, _ = pretrain_model(albert_config,
                                              max_seq_length,
                                              max_predictions_per_seq=20)
        albert_layer = albert_full_model.get_layer("albert_model")
        albert_params = albert_full_model.weights
        param_values = tf.keras.backend.batch_get_value(
            albert_full_model.weights)

    # Match each Keras weight to its checkpoint counterpart via weight_map,
    # skipping shape mismatches and missing entries.
    for ndx, (param_value, param) in enumerate(zip(param_values,
                                                   albert_params)):
        stock_name = weight_map[param.name]
        if stock_name in stock_values:
            ckpt_value = stock_values[stock_name]
            if param_value.shape != ckpt_value.shape:
                print("loader: Skipping weight:[{}] as the weight shape:[{}] "
                      "is not compatible with the checkpoint:[{}] shape:{}"
                      .format(param.name, param.shape, stock_name,
                              ckpt_value.shape))
                skipped_weight_value_tuples.append((param, ckpt_value))
                continue
            weight_value_tuples.append((param, ckpt_value))
            loaded_weights.add(stock_name)
        else:
            print("loader: No value for:[{}], i.e.:[{}] in:[{}]".format(
                param.name, stock_name, tfhub_model_path))
            skip_count += 1

    tf.keras.backend.batch_set_value(weight_value_tuples)

    print("Done loading {} ALBERT weights from: {} into {} (prefix:{}). "
          "Count of weights not found in the checkpoint was: [{}]. "
          "Count of weights with mismatched shape: [{}]".format(
              len(weight_value_tuples), tfhub_model_path, albert_layer,
              "albert", skip_count, len(skipped_weight_value_tuples)))
    print("Unused weights from saved model:",
          "\n\t" + "\n\t".join(
              sorted(set(stock_values.keys()).difference(loaded_weights))))

    if FLAGS.model_type == "albert_encoder":
        albert_model.save_weights(f"{tfhub_model_path}/tf2_model.h5")
    else:
        albert_full_model.save_weights(
            f"{tfhub_model_path}/tf2_model_full.h5")
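# Optional sanity check after conversion (assumed workflow, not part of this
# script): rebuild the same Keras graph in a fresh process, reload the
# exported weights, and run a forward pass to confirm the round-trip. The
# input arrays below are hypothetical placeholders.
#
#     albert_model.load_weights(f"{tfhub_model_path}/tf2_model.h5")
#     outputs = albert_model.predict([word_ids, mask, type_ids])  # smoke test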