def __init__(self, **kwargs):
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=kwargs['vocab_file'], do_lower_case=True)
    self.tag_to_id = {'O': 0, 'I-BRD': 1, 'I-PRO': 2, 'B-PRO': 3, 'I-KWD': 4,
                      'B-BRD': 5, 'I-POP': 6, 'B-KWD': 7, 'B-POP': 8,
                      'I-PRC': 9, 'I-FLR': 10, 'B-FLR': 11, 'B-PRC': 12,
                      '[CLS]': 13, '[SEP]': 14}
    self.id_to_tag = {0: 'O', 1: 'I-BRD', 2: 'I-PRO', 3: 'B-PRO', 4: 'I-KWD',
                      5: 'B-BRD', 6: 'I-POP', 7: 'B-KWD', 8: 'B-POP',
                      9: 'I-PRC', 10: 'I-FLR', 11: 'B-FLR', 12: 'B-PRC',
                      13: '[CLS]', 14: '[SEP]'}
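# A minimal sketch (helper name hypothetical, not part of the original code):
# tag_to_id and id_to_tag above are exact inverses, so the reverse map can be
# derived rather than maintained by hand, which keeps the two from drifting
# apart if the tag set ever changes.
def build_tag_maps(tags):
    """Builds the forward and inverse tag maps from an ordered tag list."""
    tag_to_id = {tag: idx for idx, tag in enumerate(tags)}
    id_to_tag = {idx: tag for tag, idx in tag_to_id.items()}
    return tag_to_id, id_to_tag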
def __init__(self, **kwargs):
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=kwargs['vocab_file'], do_lower_case=True)
    self.max_seq_len = 70
    self.ckpt_path = kwargs['model_dir']
    self.init_checkpoint = kwargs['init_checkpoint_file']
    self.bert_config = kwargs['bert_config_dir']
    self.graph = kwargs['graph']
    # Build the model and its saver inside the caller-supplied graph so this
    # instance's variables stay isolated from any other model in the process.
    with self.graph.as_default():
        self.model = Model(init_checkpoint_file=self.init_checkpoint,
                           bert_config_dir=self.bert_config)
        self.saver = tf.train.Saver()
    config = tf.ConfigProto(log_device_placement=False)
    self.session = tf.Session(graph=self.graph, config=config)
    self.load()
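# A plausible load() implementation (hypothetical: the original method body is
# not shown in this snippet). It restores the newest checkpoint found under
# ckpt_path into the session created above.
def load(self):
    ckpt = tf.train.latest_checkpoint(self.ckpt_path)
    if ckpt is not None:
        self.saver.restore(self.session, ckpt)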
def __init__(self):
    self.lstm_dim = 128
    self.batch_size = 1
    self.max_seq_len = 70
    self.clip = 5.0
    self.dropout_keep = 0.5
    self.optimizer = 'adam'
    self.lr = 0.001
    self.tag_schema = 'iob'
    self.ckpt_path = '..\\models'
    self.steps_check = 10
    self.zeros = False
    self.lower = True
    self.max_epoch = 2
    self.num_tags = len(convert_samples.tag_to_id)
    # Note: the Windows paths below use consistently escaped backslashes; the
    # original mixed single and double backslashes ('D:\models\\...').
    self.model = Model(
        init_checkpoint_file='D:\\models\\albert_base_zh\\albert_model.ckpt',
        bert_config_dir='D:\\models\\albert_base_zh\\albert_config_base.json')
    self.saver = tf.train.Saver()
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file='D:\\models\\albert_base_zh\\vocab.txt', do_lower_case=True)
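# A self-contained sketch of IOB decoding (helper name hypothetical), matching
# tag_schema = 'iob' above: collapse a per-token tag sequence into
# (start, end, label) spans, e.g. ['B-BRD', 'I-BRD', 'O'] -> [(0, 2, 'BRD')].
# Stray I- tags that do not continue an open span of the same label are
# dropped, which is the strict IOB reading.
def iob_to_spans(tags):
    spans, start, label = [], None, None
    for i, tag in enumerate(tags):
        if tag.startswith('I-') and label == tag[2:]:
            continue  # the current entity keeps running
        if start is not None:
            spans.append((start, i, label))  # close the open span
            start, label = None, None
        if tag.startswith('B-'):
            start, label = i, tag[2:]
    if start is not None:
        spans.append((start, len(tags), label))
    return spans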
def __init__(self, **kwargs):
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=kwargs['vocab_file'],
        # vocab_file='D:\\mygit\\NER_MODEL\\albert_tiny_489k\\vocab.txt',
        do_lower_case=True)
    self.max_seq_len = 70
    self.ckpt_path = kwargs['model_dir']
    # self.ckpt_path = 'D:\\mygit\\NER_MODEL\\models'
    self.init_checkpoint = kwargs['init_checkpoint_file']
    # self.init_checkpoint = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_model.ckpt'
    self.bert_config = kwargs['bert_config_dir']
    # self.bert_config = 'D:\\mygit\\NER_MODEL\\albert_tiny_489k\\albert_config_tiny.json'
    self.graph = kwargs['graph']
    with self.graph.as_default():
        self.model = Model(init_checkpoint_file=self.init_checkpoint,
                           bert_config_dir=self.bert_config)
        self.saver = tf.train.Saver()
    # gpu_options was referenced but never defined in the original snippet;
    # growing GPU memory on demand is an assumed, sensible default here.
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(log_device_placement=False,
                            gpu_options=gpu_options)
    self.session = tf.Session(graph=self.graph, config=config)
    self.load()
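# Usage sketch (class name and paths hypothetical): because each instance
# builds its model into its own tf.Graph and tf.Session, several checkpoints
# (e.g. albert_base and albert_tiny) can be served side by side in a single
# process without variable-name collisions.
#
# predictor = NerPredictor(
#     vocab_file='albert_tiny_489k/vocab.txt',
#     model_dir='models',
#     init_checkpoint_file='albert_tiny_489k/albert_model.ckpt',
#     bert_config_dir='albert_tiny_489k/albert_config_tiny.json',
#     graph=tf.Graph())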
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        "mayi": SelfProcessor_test
    }
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    print("###tpu_cluster_resolver:", tpu_cluster_resolver)
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)  # TODO
        print("###length of total train_examples:", len(train_examples))
        num_train_steps = int(len(train_examples) / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # TPUEstimator falls back to CPU/GPU when use_tpu is False.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        train_file_exists = os.path.exists(train_file)
        print("###train_file_exists:", train_file_exists,
              " ;train_file:", train_file)
        # If the tf_record file does not exist, convert it from the raw text file.
        if not train_file_exists:
            # TODO
            file_based_convert_examples_to_features(
                train_examples, label_list, FLAGS.max_seq_length, tokenizer,
                train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size, so pad the eval set up to a
            # multiple of eval_batch_size.
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
            eval_file)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        # Evaluate all checkpoints in output_dir; you can then use the
        # checkpoint with the best dev accuracy.
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
        output_eval_file = os.path.join(FLAGS.data_dir,
                                        "eval_results_albert_zh.txt")
        print("output_eval_file:", output_eval_file)
        tf.logging.info("output_eval_file:" + output_eval_file)
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            for global_step, filename in sorted(steps_and_files,
                                                key=lambda x: x[0]):
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=filename)
                tf.logging.info("***** Eval results %s *****" % filename)
                writer.write("***** Eval results %s *****\n" % filename)
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)
        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        result = estimator.predict(input_fn=predict_input_fn)
        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                probabilities = prediction["probabilities"]
                if i >= num_actual_predict_examples:
                    break
                # One tab-separated row of class probabilities per example.
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
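# A minimal sketch (helper name hypothetical) of consuming test_results.tsv as
# written above: each line holds one example's tab-separated class
# probabilities, so the argmax over a row recovers the predicted label's index
# into label_list.
def read_predicted_label_ids(path):
    label_ids = []
    with open(path) as f:
        for line in f:
            probs = [float(p) for p in line.rstrip('\n').split('\t')]
            label_ids.append(probs.index(max(probs)))
    return label_ids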