Example #1
def __init__(self,
             init_checkpoint,
             vocab_file,
             stop_words_file,
             config_file,
             embedding_table_file,
             index_vecs_file,
             index_data_file,
             listen_port=5022,
             logger=logging.getLogger(__name__)):  # a Logger, so .info() below works
    super(WordVecTransformer, self).__init__()
    self.logger = logger
    self.port = listen_port
    self.vec_size = 2400
    # Working directory for the estimator's checkpoints and summaries.
    if not os.path.exists("./tmp"):
        os.mkdir("./tmp")
    model_dir = "./tmp"
    # Load the pre-computed index vectors and their associated payload data.
    self.index_vecs = np.asarray(self.load_index_bin(index_vecs_file),
                                 dtype=np.float32)
    self.index_data = self.load_index_data(index_data_file)
    self.tokenizer = tokenization.Tokenizer(
        vocab_file=vocab_file,
        stop_words_file=stop_words_file,
        use_pos=False)
    self.model_config = ModelConfig.from_json_file(config_file)
    self.estimator = create_estimator(self.model_config, init_checkpoint,
                                      model_dir, embedding_table_file)
    self.build_index(self.index_vecs)
    self.logger.info("Finished WordVecTransformer init.")
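A minimal usage sketch for the constructor above. Every path below is a placeholder, and the surrounding WordVecTransformer class (including its load_index_bin, load_index_data, and build_index helpers) is assumed to be defined as in the source:

# Hypothetical instantiation; all file paths are placeholders.
transformer = WordVecTransformer(
    init_checkpoint="model.ckpt-100000",
    vocab_file="vocab.txt",
    stop_words_file="stop_words.txt",
    config_file="model_config.json",
    embedding_table_file="embedding_table.npy",
    index_vecs_file="index_vecs.bin",
    index_data_file="index_data.txt",
    listen_port=5022)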
Example #2
def create_pairtexts_data():
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=False)
    # Multiple input files arrive as a single '#'-separated string.
    input_files = FLAGS.input_file.split('#')
    train_examples = []
    for input_file in input_files:
        tmp_examples = data_util.create_pairexamples_from_tsv_file(input_file)
        train_examples.extend(tmp_examples)
    train_file = FLAGS.output_file
    data_util.file_based_convert_pairexamples_to_features(
        train_examples, FLAGS.max_seq_length, tokenizer, train_file)
Example #3
def create_keyword_data():
    label_list = ["1", "2"]
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=True)
    # Multiple input files arrive as a single '#'-separated string.
    input_files = FLAGS.input_file.split('#')
    train_examples = []
    for input_file in input_files:
        tmp_examples = data_util.create_examples_from_json_file(input_file)
        train_examples.extend(tmp_examples)
    train_file = FLAGS.output_file
    data_util.file_based_convert_examples_to_features(train_examples,
                                                      label_list,
                                                      FLAGS.max_seq_length,
                                                      tokenizer, train_file)
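Both converters above read everything from module-level flags. A hedged sketch of a driver, assuming the flag names shown above are defined with tf.flags in the same module; every path is a placeholder:

# Hypothetical flag values; all paths are placeholders.
FLAGS.vocab_file = "vocab.txt"
FLAGS.stop_words_file = "stop_words.txt"
FLAGS.max_seq_length = 128
FLAGS.input_file = "part1.tsv#part2.tsv"  # '#'-separated list of inputs
FLAGS.output_file = "train.tf_record"
create_pairtexts_data()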
Example #4
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval`, or `do_predict` must be True.")

    model_config = ModelConfig.from_json_file(FLAGS.config_file)
    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = TextProcessor(labels=["1", "2"])
    label_list = processor.get_labels()

    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=FLAGS.use_pos)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size

    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # The shape check only applies when an embedding table was loaded.
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None

    #train_examples = processor.get_train_examples(FLAGS.train_data)
    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps
    #if FLAGS.do_train:
    #    train_examples = processor.get_train_examples(FLAGS.train_data)
    #    num_train_steps = int(
    #        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    #    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=100,
        save_checkpoints_steps=1000,
        keep_checkpoint_max=6,
        log_step_count_steps=100)

    model_fn = model_fn_builder(
        model_config=model_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)

    params = {
        "batch_size":FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=params)

    if FLAGS.do_train:
        #train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        #file_based_convert_examples_to_features(
        #    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        #tf.logging.info("***** Running training *****")
        #tf.logging.info("  Num examples = %d", len(train_examples))
        #tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        #tf.logging.info("  Num steps = %d", num_train_steps)
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        pass
        #eval_examples = processor.get_dev_examples(FLAGS.eval_data)
        #num_actual_eval_examples = len(eval_examples)
        #eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        #file_based_convert_examples_to_features(
        #    eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        #tf.logging.info("***** Running evaluation *****")
        #tf.logging.info(" Num examples = %d", num_actual_eval_examples)
        #tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        #eval_input_fn = file_based_input_fn_builder(
        #    input_file=eval_file,
        #    seq_length=FLAGS.max_seq_length,
        #    is_training=False,
        #    drop_remainder=False)

        #eval_steps = None
        #result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        #output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        #with tf.gfile.GFile(output_eval_file, "w") as writer:
        #    tf.logging.info("***** Eval results *****")
        #    for key in sorted(result.keys()):
        #        tf.logging.info("  %s = %s", key, str(result[key]))
        #        writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d ", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn, hooks=None)
        output_predict_file = os.path.join(FLAGS.output_dir, "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break

                # argsort is ascending, so the last five indices point at the
                # five highest-probability keyword positions; iterate with
                # `rank` so the example index `i` stays intact.
                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                for rank in range(-1, -6, -1):
                    idx = sorted_keyword_probs[rank]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])
                    
                #for i, idx in enumerate(sorted_keyword_probs):
                #    top_keyword_ids.append(input_ids[idx])
                #    top_keyword_probs.append(keyword_probs[idx])
                #    if i >= 5:
                #        break
                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(
                    kw + ":" + str(prob)
                    for kw, prob in zip(top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(
                    w + ":" + str(prob)
                    for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_writen_lines:%d,num_actual_predict_examples:%d"%(num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
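The top-5 selection in the predict loop leans on np.argsort returning indices in ascending order of probability, so the last five entries point at the highest-probability tokens. A self-contained sketch of the same pattern:

import numpy as np

probs = np.array([0.05, 0.40, 0.10, 0.25, 0.20])
order = np.argsort(probs)   # ascending: [0, 2, 4, 3, 1]
top5 = order[-5:][::-1]     # highest probability first: [1, 3, 4, 2, 0]
print(probs[top5])          # [0.4  0.25 0.2  0.1  0.05]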
Example #5
def main(_):
    tf.logging.set_verbosity(tf.logging.DEBUG)
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict and not FLAGS.do_encode:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' or `do_encode` must be True."
        )

    model_config = ModelConfig.from_json_file(FLAGS.config_file)
    if FLAGS.max_seq_length > model_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, model_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)
    processor = PairTextProcessor()
    tokenizer = tokenization.Tokenizer(vocab_file=FLAGS.vocab_file,
                                       stop_words_file=FLAGS.stop_words_file,
                                       use_pos=False)
    tf.logging.info("model_config vocab_size:%d, tokenizer.vocab_size:%d" %
                    (model_config.vocab_size, tokenizer.vocab_size))
    assert model_config.vocab_size == tokenizer.vocab_size

    if FLAGS.embedding_table is not None:
        embedding_table = load_embedding_table(FLAGS.embedding_table)
        # The shape check only applies when an embedding table was loaded.
        assert len(tokenizer.vocab) == embedding_table.shape[0]
    else:
        embedding_table = None

    train_examples = None
    num_train_steps = FLAGS.num_train_steps
    num_warmup_steps = FLAGS.num_warmup_steps

    run_config = tf.estimator.RunConfig(model_dir=FLAGS.output_dir,
                                        save_summary_steps=100,
                                        save_checkpoints_steps=1000,
                                        keep_checkpoint_max=6,
                                        log_step_count_steps=100)

    model_fn = model_fn_builder(
        model_config=model_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        do_encode=FLAGS.do_encode,
        embedding_table_value=embedding_table,
        embedding_table_trainable=FLAGS.embedding_table_trainable,
        use_one_hot_embeddings=False)

    params = {
        "batch_size": FLAGS.batch_size,
    }
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params=params)

    if FLAGS.do_train:
        train_file = FLAGS.train_data
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    elif FLAGS.do_eval:
        pass

    elif FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.pred_data)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_pairexamples_to_features(predict_examples,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, predict_file)
        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d ", num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn, hooks=None)
        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "pred_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for (i, prediction) in enumerate(result):
                text_representation = prediction["text_representation"]
                keyword_probs = prediction["keyword_probs"]
                input_ids = prediction["input_ids"]
                if i >= num_actual_predict_examples:
                    break

                # argsort is ascending, so the last five indices point at the
                # five highest-probability keyword positions; iterate with
                # `rank` so the example index `i` stays intact.
                sorted_keyword_probs = np.argsort(keyword_probs, axis=-1)
                top_keyword_ids = []
                top_keyword_probs = []
                for rank in range(-1, -6, -1):
                    idx = sorted_keyword_probs[rank]
                    top_keyword_ids.append(input_ids[idx])
                    top_keyword_probs.append(keyword_probs[idx])

                top_keywords = tokenizer.convert_ids_to_tokens(top_keyword_ids)
                output_line = "\t".join(
                    kw + ":" + str(prob) for kw, prob in zip(
                        top_keywords, top_keyword_probs)) + "\n"
                writer.write(output_line)
                words = tokenizer.convert_ids_to_tokens(input_ids)
                check_line = "\t".join(
                    w + ":" + str(prob)
                    for w, prob in zip(words, keyword_probs)) + "\n"
                writer.write(check_line)
                num_written_lines += 1
        print("num_writen_lines:%d,num_actual_predict_examples:%d" %
              (num_written_lines, num_actual_predict_examples))
        assert num_written_lines == num_actual_predict_examples
    elif FLAGS.do_encode:
        encode_input_file = FLAGS.encode_data
        encode_input_fn = file_based_encode_input_fn_builder(
            input_file=encode_input_file,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)

        output_file = FLAGS.encode_output
        result = estimator.predict(input_fn=encode_input_fn, hooks=None)
        text_embeddings = []
        for idx, item in enumerate(result):
            text_embeddings.append(item["text_representation"])
            # Log the first few embeddings as a sanity check.
            if idx < 10:
                tf.logging.info("%s" % (item["text_representation"]))
        # Persist all embeddings as a single pickled list.
        with open(output_file, "wb") as wfp:
            pickle.dump(text_embeddings, wfp)
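Because do_encode pickles the embeddings as a plain Python list, reading them back is symmetric. A minimal sketch, assuming a placeholder path for FLAGS.encode_output:

import pickle
import numpy as np

# "encode_output.pkl" is a placeholder for the FLAGS.encode_output path.
with open("encode_output.pkl", "rb") as fp:
    text_embeddings = pickle.load(fp)
# Stack into a (num_texts, hidden_size) matrix, e.g. for similarity search.
matrix = np.stack(text_embeddings)
print(matrix.shape)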