def __init__(self,
             init_checkpoint,
             vocab_file,
             stop_words_file,
             config_file,
             embedding_table_file,
             index_vecs_file,
             index_data_file,
             listen_port=5022,
             logger=None):
    super(WordVecTransformer, self).__init__()
    # The original default was logging.StreamHandler(), but a Handler has no
    # .info() method; fall back to a module logger instead.
    self.logger = logger if logger is not None else logging.getLogger(__name__)
    self.port = listen_port
    self.vec_size = 2400
    if not os.path.exists("./tmp"):
        os.mkdir("./tmp")
    model_dir = "./tmp"
    # Load the pre-built index vectors and their associated payload data.
    self.index_vecs = np.asarray(
        self.load_index_bin(index_vecs_file), dtype=np.float32)
    self.index_data = self.load_index_data(index_data_file)
    self.tokenizer = tokenization.Tokenizer(
        vocab_file=vocab_file,
        stop_words_file=stop_words_file,
        use_pos=False)
    self.model_config = ModelConfig.from_json_file(config_file)
    self.estimator = create_estimator(self.model_config, init_checkpoint,
                                      model_dir, embedding_table_file)
    self.build_index(self.index_vecs)
    self.logger.info("Finished WordVecTransformer init.")
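# A minimal usage sketch; all file paths below are placeholders, not the
# real deployment paths:
#
#   transformer = WordVecTransformer(
#       init_checkpoint="model.ckpt-100000",
#       vocab_file="vocab.txt",
#       stop_words_file="stop_words.txt",
#       config_file="model_config.json",
#       embedding_table_file="embedding_table.txt",
#       index_vecs_file="index_vecs.bin",
#       index_data_file="index_data.txt")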
def create_pairtexts_data():
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=False)
    # FLAGS.input_file may hold several TSV paths joined with '#'.
    input_files = FLAGS.input_file.split('#')
    train_examples = []
    for input_file in input_files:
        tmp_examples = data_util.create_pairexamples_from_tsv_file(input_file)
        train_examples.extend(tmp_examples)
    train_file = FLAGS.output_file
    data_util.file_based_convert_pairexamples_to_features(
        train_examples, FLAGS.max_seq_length, tokenizer, train_file)
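# A sketch of driving create_pairtexts_data() from the command line; the
# script and file names are placeholders, and FLAGS.input_file joins
# multiple TSV inputs with '#':
#
#   python create_data.py \
#       --input_file=part1.tsv#part2.tsv \
#       --vocab_file=vocab.txt \
#       --stop_words_file=stop_words.txt \
#       --max_seq_length=128 \
#       --output_file=train_pairs.tf_record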
def create_keyword_data():
    label_list = ["1", "2"]
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=True)
    # FLAGS.input_file may hold several JSON paths joined with '#'.
    input_files = FLAGS.input_file.split('#')
    train_examples = []
    for input_file in input_files:
        tmp_examples = data_util.create_examples_from_json_file(input_file)
        train_examples.extend(tmp_examples)
    train_file = FLAGS.output_file
    data_util.file_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenizer,
        train_file)
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` "
            "must be True.")

    model_config = ModelConfig.from_json_file(FLAGS.config_file)

    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = TextProcessor(labels=["1", "2"])
    label_list = processor.get_labels()

    tokenizer = tokenization.Tokenizer(
        vocab_file=FLAGS.vocab_file,
        stop_words_file=FLAGS.stop_words_file,
        use_pos=FLAGS.use_pos)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size

    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # Check only when a table was actually loaded; the original asserted
        # unconditionally and would crash when embedding_table is None.
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None

    # Steps are taken from flags directly; deriving them from the number of
    # training examples is left disabled:
    # train_examples = processor.get_train_examples(FLAGS.train_data)
    # num_train_steps = int(
    #     len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    # num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=100,
        save_checkpoints_steps=1000,
        keep_checkpoint_max=6,
        log_step_count_steps=100)

    model_fn = model_fn_builder(
        model_config=model_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)

    params = {
        "batch_size": FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(
        model_fn=model_fn, config=run_config, params=params)

    if FLAGS.do_train:
        # Training data is expected to be a pre-converted TFRecord file; the
        # on-the-fly conversion below is left disabled:
        # train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        # file_based_convert_examples_to_features(
        #     train_examples, label_list, FLAGS.max_seq_length, tokenizer,
        #     train_file)
        # tf.logging.info("***** Running training *****")
        # tf.logging.info("  Num examples = %d", len(train_examples))
        # tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        # tf.logging.info("  Num steps = %d", num_train_steps)
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        # Evaluation is not implemented yet; the intended flow is left
        # disabled below.
        pass
        # eval_examples = processor.get_dev_examples(FLAGS.eval_data)
        # num_actual_eval_examples = len(eval_examples)
        # eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        # file_based_convert_examples_to_features(
        #     eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
        #     eval_file)
        # tf.logging.info("***** Running evaluation *****")
        # tf.logging.info("  Num examples = %d", num_actual_eval_examples)
        # tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # eval_input_fn = file_based_input_fn_builder(
        #     input_file=eval_file,
        #     seq_length=FLAGS.max_seq_length,
        #     is_training=False,
        #     drop_remainder=False)
        # eval_steps = None
        # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        # output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        # with tf.gfile.GFile(output_eval_file, "w") as writer:
        #     tf.logging.info("***** Eval results *****")
        #     for key in sorted(result.keys()):
        #         tf.logging.info("  %s = %s", key, str(result[key]))
        #         writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn, hooks=None)

        output_predict_file = os.path.join(FLAGS.output_dir, "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break
                # Take the five highest-probability keyword positions by
                # walking argsort output from the back.
                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                # Use a separate loop variable; the original reused `i`,
                # shadowing the enumerate index.
                for rank in range(-1, -6, -1):
                    idx = sorted_keyword_probs[rank]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])
                # An alternative front-to-back iteration was left disabled:
                # for i, idx in enumerate(sorted_keyword_probs):
                #     top_keyword_ids.append(input_ids[idx])
                #     top_keyword_probs.append(keyword_probs[idx])
                #     if i >= 5:
                #         break
                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(
                    kw + ":" + str(prob)
                    for kw, prob in zip(top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                # Also dump every token with its probability for inspection.
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(
                    w + ":" + str(prob)
                    for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_written_lines:%d, num_actual_predict_examples:%d" %
              (num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
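# load_embedding_table() is called above but not defined in this section.
# A minimal sketch of one plausible implementation, assuming a plain-text
# table with one token per line followed by its vector components (the real
# file format may differ):
def load_embedding_table_sketch(path):
    rows = []
    with open(path, "r") as fp:
        for line in fp:
            parts = line.rstrip("\n").split()
            rows.append([float(x) for x in parts[1:]])  # parts[0] = token
    return np.asarray(rows, dtype=np.float32)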
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)

    if (not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict
            and not FLAGS.do_encode):
        raise ValueError(
            "At least one of `do_train`, `do_eval`, `do_predict` or "
            "`do_encode` must be True.")

    model_config = ModelConfig.from_json_file(FLAGS.config_file)

    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = PairTextProcessor()
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=False)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size

    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # Check only when a table was actually loaded; the original asserted
        # unconditionally and would crash when embedding_table is None.
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None

    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=100,
        save_checkpoints_steps=1000,
        keep_checkpoint_max=6,
        log_step_count_steps=100)

    model_fn = model_fn_builder(
        model_config=model_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        do_encode=FLAGS.do_encode,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)

    params = {
        "batch_size": FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(
        model_fn=model_fn, config=run_config, params=params)

    if FLAGS.do_train:
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    elif FLAGS.do_eval:
        # Evaluation is not implemented yet.
        pass
    elif FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_pairexamples_to_features(
            predict_examples, FLAGS.max_seq_length, tokenizer, predict_file)

        tf.logging.info("***** Running prediction *****")
        tf.logging.info("  Num examples = %d", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn, hooks=None)

        output_predict_file = os.path.join(FLAGS.output_dir, "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break
                # Take the five highest-probability keyword positions by
                # walking argsort output from the back.
                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                # Use a separate loop variable; the original reused `i`,
                # shadowing the enumerate index.
                for rank in range(-1, -6, -1):
                    idx = sorted_keyword_probs[rank]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])
                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(
                    kw + ":" + str(prob)
                    for kw, prob in zip(top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                # Also dump every token with its probability for inspection.
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(
                    w + ":" + str(prob)
                    for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_written_lines:%d, num_actual_predict_examples:%d" %
              (num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
    elif FLAGS.do_encode:
        encode_input_file = FLAGS.encode_data
        encode_input_fn = file_based_encode_input_fn_builder(
            input_file=encode_input_file,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)
        output_file = FLAGS.encode_output
        result = estimator.predict(input_fn=encode_input_fn, hooks=None)
        text_embeddings = []
        for idx, item in enumerate(result):
            text_embeddings.append(item["text_representation"])
            # Log the first few vectors as a sanity check.
            if idx < 10:
                tf.logging.info("%s" % (item["text_representation"]))
        # Persist all text representations in a single pickle file.
        with open(output_file, "wb") as wfp:
            pickle.dump(text_embeddings, wfp)
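# The do_encode branch above pickles a list of per-example
# "text_representation" vectors. A minimal sketch of reading them back for
# downstream use (the helper name is illustrative):
def load_encoded_texts_sketch(path):
    with open(path, "rb") as fp:
        return pickle.load(fp)  # list of vectors, one per encoded example

# Example: embeddings = load_encoded_texts_sketch(FLAGS.encode_output)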