def initsetting():
    print("DOING INITIALSETTING!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(BERT_MODEL_HUB)
    processors = {
        "cola": run_classifier.ColaProcessor,
        "mnli": run_classifier.MnliProcessor,
        "mrpc": run_classifier.MrpcProcessor,
        "korean_sa": run_classifier.KsaProcessor,
    }
    processor = processors[TASK.lower()]()
    label_list = processor.get_labels()
    num_train_steps = 1
    num_warmup_steps = None
    bert_config = modeling.BertConfig.from_json_file(CONFIG_DIR)
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=CKPT_DIR,
        learning_rate=3e-4,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,
        use_one_hot_embeddings=False)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=get_run_config(),
        train_batch_size=32,
        eval_batch_size=8,
        predict_batch_size=1)
    return tokenizer, estimator, label_list

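# A minimal usage sketch for initsetting(), not part of the original snippet.
# It assumes a max sequence length of 128 and the in-memory helpers from the
# google-research/bert run_classifier module (InputExample,
# convert_examples_to_features, input_fn_builder); the label passed to
# InputExample is just a placeholder required by the converter.
tokenizer, estimator, label_list = initsetting()
example = run_classifier.InputExample(guid="pred-0",
                                      text_a="This movie was great.",
                                      text_b=None,
                                      label=label_list[0])
features = run_classifier.convert_examples_to_features([example], label_list,
                                                       128, tokenizer)
predict_input_fn = run_classifier.input_fn_builder(features=features,
                                                   seq_length=128,
                                                   is_training=False,
                                                   drop_remainder=False)
# predict() yields one dict per example; "probabilities" holds the class scores.
probabilities = next(iter(estimator.predict(predict_input_fn)))["probabilities"]
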
def __init__(self):
    tf.logging.set_verbosity(tf.logging.INFO)
    self.config = BertPredictionConfig()
    bert_config = modeling.BertConfig.from_json_file(self.config.bert_config_file)
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=self.config.vocab_file,
        do_lower_case=self.config.do_lower_case)
    self.processor = PlwiProcessor()
    self.label_list = self.processor.get_labels()
    tpu_cluster_resolver = None
    if self.config.use_tpu and self.config.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            self.config.tpu_name,
            zone=self.config.tpu_zone,
            project=self.config.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=None,
        model_dir="out",
        save_checkpoints_steps=None,  # prediction only; no checkpointing needed
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=1000,
            num_shards=8,
            per_host_input_for_training=is_per_host))
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(self.label_list),
        init_checkpoint=self.config.init_checkpoint,
        learning_rate=0,
        num_train_steps=0,
        num_warmup_steps=0,
        use_tpu=self.config.use_tpu,
        use_one_hot_embeddings=self.config.use_tpu)
    self.estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=self.config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=self.config.batch_size,
        eval_batch_size=self.config.batch_size,
        predict_batch_size=self.config.batch_size)

def load_estimator(config, FLAGS):
    # Write the BERT config section to a temporary JSON file so that
    # modeling.BertConfig.from_json_file can read it.
    bert_config_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8',
                                                   suffix='.json')
    bert_config_file.write(
        json.dumps({k: str_to_value(v) for k, v in config['BERT-CONFIG'].items()}))
    bert_config_file.seek(0)
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(FLAGS.task_proc.get_labels()),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    return estimator

def __init__(self, path):
    self.init_checkpoint = path + "/anshaj.ckpt"
    self.tokenization = run_classifier.tokenization
    processor = run_classifier.ColaProcessor()
    BATCH_SIZE = 32
    self.MAX_SEQ_LENGTH = 50
    self.tokenization.validate_case_matches_checkpoint(False, self.init_checkpoint)
    bert_config = run_classifier.modeling.BertConfig.from_json_file(
        path + "/bert_config.json")
    self.tokenizer = self.tokenization.FullTokenizer(vocab_file=path + "/vocab.txt",
                                                     do_lower_case=False)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        model_dir=path,
        cluster=None,
        master=None,
        save_checkpoints_steps=500,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=1000,
            num_shards=8,
            per_host_input_for_training=is_per_host))
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=3,
        init_checkpoint=self.init_checkpoint,
        learning_rate=1e-05,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,
        use_one_hot_embeddings=False)
    self.estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=BATCH_SIZE,
        eval_batch_size=BATCH_SIZE,
        predict_batch_size=BATCH_SIZE)

tpu_cluster_resolver = None
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=True)
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)
print('\n__________\nStarted training at {} '.format(datetime.datetime.now()))
print('\nNum examples = {}'.format(len(train_examples)))
print('\nBatch size = {}'.format(TRAIN_BATCH_SIZE))
tf.logging.info("Num steps = %d", num_train_steps)

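# A sketch of the training call that typically follows this setup; it is not
# part of the original snippet and assumes `tokenizer` and `MAX_SEQ_LENGTH`
# are defined elsewhere in the script.
train_features = run_classifier.convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
train_input_fn = run_classifier.input_fn_builder(features=train_features,
                                                 seq_length=MAX_SEQ_LENGTH,
                                                 is_training=True,
                                                 drop_remainder=True)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
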
def main():
    # BERT config: write the config section to a temporary JSON file.
    bert_config_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8',
                                                   suffix='.json')
    bert_config_file.write(
        json.dumps({k: str_to_value(v) for k, v in config['BERT-CONFIG'].items()}))
    bert_config_file.seek(0)  # [Note] rewind so the file is read from the beginning
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)

    latest_ckpt = latest_ckpt_model()
    # Prefix of the checkpoint data files, e.g. model.ckpt-11052.index,
    # model.ckpt-11052.meta.
    finetuned_model_path = latest_ckpt.split('.data-00000-of-00001')[0]
    flags = FLAGS(finetuned_model_path)

    processor = LivedoorProcessor()
    label_list = processor.get_labels()

    # SentencePiece tokenizer.
    tokenizer = tokenization.FullTokenizer(model_file=flags.model_file,
                                           vocab_file=flags.vocab_file,
                                           do_lower_case=flags.do_lower_case)

    # No TPU is used.
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Run config.
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=flags.master,
        model_dir=flags.output_dir,
        save_checkpoints_steps=flags.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=flags.iterations_per_loop,
            num_shards=flags.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=flags.init_checkpoint,
                                learning_rate=flags.learning_rate,
                                num_train_steps=flags.num_train_steps,
                                num_warmup_steps=flags.num_warmup_steps,
                                use_tpu=flags.use_tpu,
                                use_one_hot_embeddings=flags.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=flags.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=flags.train_batch_size,
        eval_batch_size=flags.eval_batch_size,
        predict_batch_size=flags.predict_batch_size)

    # Fetch the test example collection.
    predict_examples = processor.get_test_examples(flags.data_dir)
    predict_file = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8',
                                               suffix='.tf_record')

    # Convert a set of `InputExample`s to a TFRecord file (output: predict_file.name).
    # https://github.com/yoheikikuta/bert-japanese/blob/master/src/run_classifier.py#L371-L380
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            flags.max_seq_length, tokenizer,
                                            predict_file.name)

    predict_drop_remainder = True if flags.use_tpu else False

    # Build the input closure passed to TPUEstimator.
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file.name,
        seq_length=flags.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    # Inference.
    result = estimator.predict(input_fn=predict_input_fn)
    result = list(result)

    # Compute accuracy.
    accracy(result, label_list)

def construct_bert_predictor(
        init_checkpoint='/home/ubuntu/bert/models/imdb_350_16_output/model.ckpt-7812',
        dataset_name='sst'):
    model_dir = './bert/models/uncased_L-12_H-768_A-12'
    bert_config_file = os.path.join(model_dir, 'bert_config.json')
    vocab_file = os.path.join(model_dir, 'vocab.txt')
    output_dir = '.'
    save_checkpoints_steps = 1000
    iterations_per_loop = 1000
    num_tpu_cores = 8
    if dataset_name == 'sst':
        if init_checkpoint is None:
            init_checkpoint = './bert/models/sst_output/model.ckpt-6313'
        max_seq_length = 128
        task_name = 'sst-2'
        batch_size = 32
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    tpu_cluster_resolver = None
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=None,
        model_dir=output_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=num_tpu_cores,
            per_host_input_for_training=is_per_host))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=init_checkpoint,
        learning_rate=2e-5,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,
        use_one_hot_embeddings=False)
    # If a TPU is not available, this will fall back to a normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=16,
        eval_batch_size=8,
        predict_batch_size=batch_size)
    return processor, estimator, tokenizer

tpu_config = tf.contrib.tpu.TPUConfig(
    iterations_per_loop=ITERATIONS_PER_LOOP,
    num_shards=NUM_TPU_CORES,
    per_host_input_for_training=IS_PER_HOST)
run_config = tf.contrib.tpu.RunConfig(
    cluster=TPU_CLUSTER_RESOLVER,
    master=MASTER,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tpu_config)
model_fn = run_classifier.model_fn_builder(
    bert_config=bert_config,
    num_labels=2,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=NUM_TRAIN_STEPS,
    num_warmup_steps=NUM_WARMUP_STEPS,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=USE_ONE_HOT_EMBEDDING)
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)
predictions = estimator.predict(test_input_fn)
probs0 = []
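# One plausible continuation of the snippet above (an assumption, not the
# original code): collect the class-0 probability of each test prediction.
for prediction in predictions:
    probs0.append(prediction["probabilities"][0])
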
def setup_estimator(self, num_train_examples, label_list):
    """Set up the TensorFlow estimator to use and store it as self.estimator."""
    # Clean the output directory before training.
    if num_train_examples > 0 and tf.gfile.Exists(self.config.output_dir):
        tf.gfile.DeleteRecursively(self.config.output_dir)
    # Make the output directory.
    tf.gfile.MakeDirs(self.config.output_dir)
    bert_config = modeling.BertConfig.from_json_file(self.config.bert_config_file)
    if self.config.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (self.config.max_seq_length, bert_config.max_position_embeddings))
    tpu_cluster_resolver = None
    if self.config.use_tpu and self.config.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            self.config.tpu_name,
            zone=self.config.tpu_zone,
            project=self.config.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=self.config.master,
        model_dir=self.config.output_dir,
        save_checkpoints_steps=self.config.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=self.config.iterations_per_loop,
            num_shards=self.config.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    num_train_steps = int(num_train_examples / self.config.train_batch_size *
                          self.config.num_train_epochs)
    num_warmup_steps = int(num_train_steps * self.config.warmup_proportion)
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=self.config.init_checkpoint,
        learning_rate=self.config.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=self.config.use_tpu,
        use_one_hot_embeddings=self.config.use_tpu)
    # If a TPU is not available, this will fall back to a normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=self.config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=self.config.train_batch_size,
        eval_batch_size=self.config.eval_batch_size,
        predict_batch_size=self.config.predict_batch_size)
    self.estimator = estimator
    self.num_train_steps = num_train_steps

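# Worked example of the step math in setup_estimator (illustrative numbers,
# not from the original): with 10,000 training examples, train_batch_size=32
# and num_train_epochs=3.0, num_train_steps = int(10000 / 32 * 3.0) = 937;
# with warmup_proportion=0.1, num_warmup_steps = int(937 * 0.1) = 93.
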
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    processors = {
        "mana169": ManaProcessor169,
    }
    tokenization.validate_case_matches_checkpoint(DO_LOWER_CASE, INIT_CKPT)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    if MAX_SEQ_LENGTH > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (MAX_SEQ_LENGTH, bert_config.max_position_embeddings))
    tf.gfile.MakeDirs(OUTPUT_DIR)
    task_name = 'mana169'
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    tpu_cluster_resolver = None
    hooks = []
    # Create a logging tensor hook, because this takes forever on CPU.
    logger = tf.train.LoggingTensorHook({"Input": "IteratorGetNext:0"},
                                        every_n_iter=1)
    hooks.append(logger)
    # debug_hook = tfdbg.LocalCLIDebugHook()
    # hooks.append(debug_hook)
    run_config = tf.contrib.tpu.RunConfig(cluster=tpu_cluster_resolver,
                                          model_dir=OUTPUT_DIR,
                                          save_checkpoints_steps=1)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=INIT_CKPT,
                                learning_rate=5e-5,
                                num_train_steps=None,
                                num_warmup_steps=None,
                                use_tpu=False,
                                use_one_hot_embeddings=False)
    # If a TPU is not available, this will fall back to a normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=PREDICT_BATCH_SIZE)
    input_file = sys.argv[1]
    predict_examples = read_input_examples(input_file)
    num_actual_predict_examples = len(predict_examples)
    # if FLAGS.use_tpu:
    #     # TPU requires a fixed batch size for all batches, therefore the
    #     # number of examples must be a multiple of the batch size, or else
    #     # examples will get dropped. So we pad with fake examples which are
    #     # ignored later on.
    #     while len(predict_examples) % FLAGS.predict_batch_size != 0:
    #         predict_examples.append(PaddingInputExample())
    predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            MAX_SEQ_LENGTH, tokenizer,
                                            predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", PREDICT_BATCH_SIZE)
    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=predict_drop_remainder)
    result = estimator.predict(input_fn=predict_input_fn, hooks=hooks)
    output_predict_file = os.path.join(OUTPUT_DIR, sys.argv[2])
    scores_list = []
    num_written_lines = 0
    tf.logging.info("***** Predict results *****")
    for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        if i >= num_actual_predict_examples:
            break
        # writer.write(output_line)
        scores_list.append(probabilities)
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
    scores_array = np.array(scores_list)
    # Write the scores in a useful form.
    top3_scores = []
    all_topics = processor.get_labels()
    for i, row in enumerate(scores_array):
        top3_indices = row.argsort()[::-1][:3]
        # Row layout: class 1, score 1, class 2, score 2, etc.
        l = []
        l += [input_df.values[i][j]
              for j in range(input_df.shape[1] - 1)]  # all but the original input
        l.append(str(input_df.values[i][-1]).replace(
            '\n', ''))  # take the original input and remove newlines
        for v in top3_indices:
            l.append(all_topics[v])
            l.append(row[v])
        top3_scores.append(l)
    score_df = pd.DataFrame(
        top3_scores,
        columns=list(input_df.columns.values) +
        ["Class 1", "Score 1", "Class 2", "Score 2", "Class 3", "Score 3"])
    score_df.to_csv(output_predict_file, index=None)

def traineval_model(self, local_dir, nb_epoch, batch_size):
    """Use the uncased BERT language model to train on new data."""
    tf.logging.set_verbosity(tf.logging.INFO)
    logging.info("*:BERT MODEL PATH:%s", BERT_MODEL_PATH)
    logging.info("*:Local Dir:%s", local_dir)
    mod_name = self.model_name
    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    BERT_PRETRAINED_DIR = BERT_MODEL_PATH
    OUTPUT_DIR = os.path.join(local_dir, 'output_bert')
    DATA_DIR = os.path.join(local_dir, 'data')
    logging.info('***** Model output directory: %s *****', OUTPUT_DIR)
    logging.info('***** BERT pretrained directory: %s *****', BERT_PRETRAINED_DIR)
    logging.info('***** DATA directory: %s *****', DATA_DIR)
    TRAIN_BATCH_SIZE = 32
    EVAL_BATCH_SIZE = 8
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 3.0
    WARMUP_PROPORTION = 0.1
    MAX_SEQ_LENGTH = 128
    # Model configs. If you wish to fine-tune on a larger dataset, use a larger
    # interval; each checkpoint weighs about 1.5 GB.
    SAVE_CHECKPOINTS_STEPS = 1000
    ITERATIONS_PER_LOOP = 1000
    NUM_TPU_CORES = 8
    VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
    BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
    INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    logging.info("Found VOCAB file: %s", VOCAB_FILE)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    tf.gfile.MakeDirs(OUTPUT_DIR)
    processor = run_classifier.ColaProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    # Since training will happen on CPU/GPU, we won't need a cluster resolver.
    tpu_cluster_resolver = None
    # TPUEstimator also supports training on CPU and GPU; no separate
    # tf.estimator.Estimator is needed.
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=OUTPUT_DIR,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=ITERATIONS_PER_LOOP,
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    train_examples = processor.get_train_examples(DATA_DIR)
    num_train_steps = int(len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=INIT_CHECKPOINT,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        use_one_hot_embeddings=False)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        model_fn=model_fn,
        config=run_config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE)

    # Train the model.
    logging.info('Starting Training...')
    train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, train_file)
    tf.logging.info('***** Started training at {} *****'.format(datetime.datetime.now()))
    tf.logging.info('  Num examples = {}'.format(len(train_examples)))
    tf.logging.info('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=train_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    final_ckpt = estimator.latest_checkpoint()
    print('***** Finished training at {} *****'.format(datetime.datetime.now()))
    logging.info("***** Final checkpoint ***** %s", final_ckpt)
    final_ckpt_file = os.path.join(OUTPUT_DIR, "final_ckpt.txt")
    with tf.gfile.GFile(final_ckpt_file, "w") as writer:
        writer.write("%s" % final_ckpt)

    # Do eval.
    logging.info('Starting Eval...')
    eval_examples = processor.get_dev_examples(DATA_DIR)
    num_actual_eval_examples = len(eval_examples)
    eval_file = os.path.join(OUTPUT_DIR, "eval.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, eval_file)
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", EVAL_BATCH_SIZE)
    eval_steps = None
    eval_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result

def test_model(self, local_dir, nb_epoch, batch_size, bucket_name):
    """Use the fine-tuned uncased BERT model to run prediction on test data."""
    tf.logging.set_verbosity(tf.logging.INFO)
    logging.info("*:BERT MODEL PATH:%s", BERT_MODEL_PATH)
    logging.info("*:Local Dir:%s", local_dir)
    mod_name = self.model_name
    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    BERT_PRETRAINED_DIR = BERT_MODEL_PATH
    OUTPUT_DIR = os.path.join(local_dir, 'output_bert')
    DATA_DIR = os.path.join(local_dir, 'data')
    logging.info('***** Model output directory: %s *****', OUTPUT_DIR)
    logging.info('***** BERT pretrained directory: %s *****', BERT_PRETRAINED_DIR)
    logging.info('***** DATA directory: %s *****', DATA_DIR)
    TRAIN_BATCH_SIZE = 32
    EVAL_BATCH_SIZE = 8
    PREDICT_BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 3.0
    WARMUP_PROPORTION = 0.1
    MAX_SEQ_LENGTH = 128
    # Model configs. If you wish to fine-tune on a larger dataset, use a larger
    # interval; each checkpoint weighs about 1.5 GB.
    SAVE_CHECKPOINTS_STEPS = 1000
    ITERATIONS_PER_LOOP = 1000
    NUM_TPU_CORES = 8
    VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
    BERT_CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
    with open(os.path.join(OUTPUT_DIR, 'final_ckpt.txt')) as f:
        content = f.readlines()
    logging.info("***Final_ckpt -> %s\n", content)
    test_ckpt = content[0].split('/')[-1]
    INIT_CHECKPOINT = os.path.join(OUTPUT_DIR, test_ckpt)
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    logging.info("Found VOCAB file: %s", VOCAB_FILE)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)
    tf.gfile.MakeDirs(OUTPUT_DIR)
    processor = run_classifier.ColaProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    # Since prediction will happen on CPU/GPU, we won't need a cluster resolver.
    tpu_cluster_resolver = None
    # TPUEstimator also supports running on CPU and GPU; no separate
    # tf.estimator.Estimator is needed.
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=OUTPUT_DIR,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=ITERATIONS_PER_LOOP,
            num_shards=NUM_TPU_CORES,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=INIT_CHECKPOINT,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # if False, prediction falls back to CPU or GPU, depending on what is available
        use_one_hot_embeddings=False)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,  # if False, prediction falls back to CPU or GPU, depending on what is available
        model_fn=model_fn,
        config=run_config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE,
        predict_batch_size=PREDICT_BATCH_SIZE)
    predict_examples = processor.get_test_examples(DATA_DIR)
    num_actual_predict_examples = len(predict_examples)
    predict_file = os.path.join(OUTPUT_DIR, "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer, predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", PREDICT_BATCH_SIZE)
    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(OUTPUT_DIR, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
        num_written_lines = 0
        tf.logging.info("***** Predict results *****")
        for (i, prediction) in enumerate(result):
            probabilities = prediction["probabilities"]
            if i >= num_actual_predict_examples:
                break
            output_line = "\t".join(
                str(class_probability)
                for class_probability in probabilities) + "\n"
            writer.write(output_line)
            num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
    s3 = boto3.resource('s3')
    tf.logging.info("Done with prediction; uploading results to S3")
    try:
        s3.Bucket(bucket_name).upload_file(output_predict_file, output_predict_file)
    except Exception as err:
        logging.info("Unable to upload to S3")
        logging.info(err)
    return 1

run_config = tf.contrib.tpu.RunConfig(
    model_dir=OUTPUT_DIR,
    cluster=None,
    master=None,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=1000,
        num_shards=8,
        per_host_input_for_training=is_per_host))

# Model.
model_fn = run_classifier.model_fn_builder(bert_config=bert_config,
                                           num_labels=len(label_list),
                                           init_checkpoint=init_checkpoint,
                                           learning_rate=5e-5,
                                           num_train_steps=None,
                                           num_warmup_steps=None,
                                           use_tpu=False,
                                           use_one_hot_embeddings=False)

# Estimator.
estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                        model_fn=model_fn,
                                        config=run_config,
                                        train_batch_size=BATCH_SIZE,
                                        eval_batch_size=BATCH_SIZE,
                                        predict_batch_size=BATCH_SIZE)

# Emotions.
emotions = pd.read_csv(emotions_file, header=None)
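# A hedged sketch of decoding predictions with the emotions table above. It is
# not part of the original snippet, and it assumes column 0 of the CSV holds
# the emotion name for each label index, that a predict_input_fn has been
# built elsewhere, and that numpy is imported as np.
label_names = emotions[0].tolist()
result = estimator.predict(input_fn=predict_input_fn)
predicted = [label_names[int(np.argmax(p["probabilities"]))] for p in result]
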
def main(argv):
    BERT_MODEL = 'uncased_L-12_H-768_A-12'
    VOCAB_FILE = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/vocab.txt'
    CONFIG_FILE = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/bert_config.json'
    INIT_CHECKPOINT = '/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/bert_model.ckpt'
    DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
    model_dir = "{}/{}".format("/opt/tftuner", mltunerUtil.get_job_id())

    # Fixed model hyperparameters.
    TRAIN_BATCH_SIZE = mltunerUtil.get_batch_size()
    NUM_TRAIN_EPOCHS = 3
    LEARNING_RATE = mltunerUtil.get_learning_rate()
    WARMUP_PROPORTION = 0.05
    EVAL_BATCH_SIZE = 8
    MAX_SEQ_LENGTH = 128

    # Data loading.
    train_df = pd.read_csv('/root/cyliu/tftuner/selftf/tf_job/nlp/zmwu/bert_tf2/train.csv')
    train_df = train_df.sample(1000)
    train, test = train_test_split(train_df, test_size=0.1, random_state=42)
    train_lines, train_labels = train.question_text.values, train.target.values
    test_lines, test_labels = test.question_text.values, test.target.values
    label_list = ['0', '1']
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE,
                                           do_lower_case=DO_LOWER_CASE)
    train_examples = create_examples(train_lines, 'train', labels=train_labels)
    num_train_steps = int(len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    strategy = tf.distribute.experimental.ParameterServerStrategy()
    session_config = mltunerUtil.get_tf_session_config()
    config = tf.compat.v1.estimator.tpu.RunConfig(
        train_distribute=strategy,
        model_dir=model_dir,
        save_checkpoints_steps=None,
        save_checkpoints_secs=None,
        session_config=session_config)
    model_fn = run_classifier.model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
        num_labels=len(label_list),
        init_checkpoint=None,
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        use_one_hot_embeddings=True)
    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=False,  # if False, training falls back to CPU or GPU, depending on what is available
        model_fn=model_fn,
        config=config,
        train_batch_size=TRAIN_BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE)

    class LoggerHook(tf.estimator.SessionRunHook):
        """Logs loss and runtime."""

        def __init__(self):
            self.last_run_timestamp = time.time()

        def after_run(self, run_context, run_values):
            session: tf.Session = run_context.session
            loss, step = session.run([
                tf.compat.v1.get_collection("losses")[0],
                tf.compat.v1.get_collection("global_step_read_op_cache")[0]
            ])
            logging.debug("step:{} loss:{}".format(step, loss))
            mltunerUtil.report_iter_loss(step, loss,
                                         time.time() - self.last_run_timestamp)
            self.last_run_timestamp = time.time()

    # Prepare for training.
    train_features = run_classifier.convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    train_input_fn = input_fn_builder(features=train_features,
                                      seq_length=MAX_SEQ_LENGTH,
                                      is_training=True,
                                      drop_remainder=True)
    predict_examples = create_examples(test_lines, 'test')
    predict_features = run_classifier.convert_examples_to_features(
        predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = input_fn_builder(features=predict_features,
                                        seq_length=MAX_SEQ_LENGTH,
                                        is_training=False,
                                        drop_remainder=False)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=num_train_steps,
                                        hooks=[LoggerHook()])
    eval_spec = tf.estimator.EvalSpec(input_fn=predict_input_fn)

    # Wait for the chief to be ready.
    if not (mltunerUtil.is_chief() or mltunerUtil.is_ps()):
        time.sleep(1)
        if not tf.io.gfile.exists(model_dir):
            logging.debug("wait for chief init")
            time.sleep(1)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

def mrpc_classifier(sent_list1, sent_list2, args):
    TUNED_MODEL_DIR = args.tuned_model_dir
    config = {
        "task_name": 'MRPC',
        "do_predict": True,
        "vocab_file": f"{TUNED_MODEL_DIR}/vocab.txt",
        "bert_config_file": f"{TUNED_MODEL_DIR}/bert_config.json",
        "init_checkpoint": f"{TUNED_MODEL_DIR}",
        "max_seq_length": 128,
        "output_dir": f"{TUNED_MODEL_DIR}",
        "do_lower_case": True,
        "predict_batch_size": 8
    }
    bert_config = modeling.BertConfig.from_json_file(config["bert_config_file"])
    processor = run_classifier.MrpcProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config["vocab_file"], do_lower_case=config["do_lower_case"])
    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=None,
        model_dir=config["output_dir"],
        save_checkpoints_steps=1000,
    )
    model_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=config["init_checkpoint"],
        learning_rate=5e-5,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=False,
        use_one_hot_embeddings=False)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=config["predict_batch_size"])
    predict_examples = get_predict_examples(sent_list1, sent_list2)
    num_actual_predict_examples = len(predict_examples)
    predict_file = os.path.join(config["output_dir"], "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, config["max_seq_length"], tokenizer,
        predict_file)
    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", config["predict_batch_size"])
    predict_drop_remainder = False
    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=config["max_seq_length"],
        is_training=False,
        drop_remainder=predict_drop_remainder)
    result = estimator.predict(input_fn=predict_input_fn)
    # Probability of label index 1 (paraphrase) for each sentence pair.
    probabilities = [prediction["probabilities"][1] for prediction in result]
    return probabilities
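
# A minimal usage sketch for mrpc_classifier; the model directory below is a
# placeholder (not a path from the original code), and args only needs to
# expose tuned_model_dir.
from argparse import Namespace

paraphrase_probs = mrpc_classifier(
    ["The company posted record profits."],
    ["Profits at the company hit an all-time high."],
    Namespace(tuned_model_dir="./mrpc_tuned"))
# Each entry is the probability that the corresponding sentence pair is a paraphrase.
print(paraphrase_probs)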