Beispiel #1
0
    def __init__(self,
                 bert_config_file,
                 vocab_file,
                 init_checkpoint,
                 batch_size=4):
        """Load the BERT configuration, tokenizer and model function.

        Args:
            bert_config_file: Path of the JSON file parsed by
                ``modeling.BertConfig.from_json_file``.
            vocab_file: Vocabulary file handed to the tokenizer.
            init_checkpoint: Checkpoint forwarded to ``model_fn_builder``.
            batch_size: Batch size stored on the instance (default 4).
        """
        # Parse the configuration before reading from it: the attribute
        # ``max_position_embeddings`` lives on the parsed config object, not
        # on the file path that is passed in.
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.max_seq_length = self.bert_config.max_position_embeddings
        self.batch_size = batch_size

        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.model_fn = model_fn_builder(bert_config=self.bert_config,
                                         init_checkpoint=init_checkpoint,
                                         use_one_hot_embeddings=False)
def process_function(data_dir, vocab_file_path, do_train, do_eval, do_test,
                     max_seq_length, max_sent_length, batch_size):
    """Convert the requested dataset splits into model input features.

    Args:
        data_dir: Directory handed to the processor's ``get_*_examples``.
        vocab_file_path: Vocabulary file used to build the tokenizer.
        do_train: Build features for the training split when truthy.
        do_eval: Build features for the dev split when truthy.
        do_test: Build features for the test split when truthy.
        max_seq_length: Forwarded to the feature conversion.
        max_sent_length: Forwarded to the feature conversion.
        batch_size: Only reported in the progress output.

    Returns:
        A ``(train_input, eval_input, test_input)`` tuple; an entry stays
        ``None`` when its split was not requested.
    """
    train_input = None
    eval_input = None
    test_input = None
    processor = MyProcessor()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                           do_lower_case=True)
    if do_train:
        # train_examples is a list; each element is an
        # InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
        train_examples = processor.get_train_examples(data_dir)
        # InputExample ---> features
        train_input = file_based_convert_examples_to_features(
            train_examples,
            max_seq_length,
            max_sent_length,
            tokenizer,
            data_mode='train')
        print('***start to training**')
        # print() does not interpolate like a logger, so format explicitly.
        print('  Number training examples  %d' % len(train_examples))
        print('   Batch size %d' % batch_size)
    if do_eval:
        eval_examples = processor.get_dev_examples(data_dir)
        eval_input = file_based_convert_examples_to_features(eval_examples,
                                                             max_seq_length,
                                                             max_sent_length,
                                                             tokenizer,
                                                             data_mode='dev')
        print('***start to validation**')
        print('  Number validate examples  %d' % len(eval_examples))
        print('   Batch size %d' % batch_size)
    if do_test:
        test_examples = processor.get_test_examples(data_dir)
        test_input = file_based_convert_examples_to_features(test_examples,
                                                             max_seq_length,
                                                             max_sent_length,
                                                             tokenizer,
                                                             data_mode='test')
        print('***start to testing**')
        print('  Number test examples  %d' % len(test_examples))
        print('   Batch size %d' % batch_size)
    # NOTE(review): the original comment states that the returned inputs are
    # all lists; confirm against file_based_convert_examples_to_features.
    return train_input, eval_input, test_input
Beispiel #3
0
def process_function(data_dir, vocab_file_path, do_train, do_eval, do_predict,
                     output_dir, max_seq_length, batch_size):
    """Convert the requested dataset splits into features via tf_record files.

    Args:
        data_dir: Directory handed to the processor's ``get_*_examples``.
        vocab_file_path: Vocabulary file used to build the tokenizer.
        do_train: Convert the training split when truthy.
        do_eval: Convert the dev split when truthy.
        do_predict: Convert the test split when truthy.
        output_dir: Directory under which ``train.tf_record``,
            ``eval.tf_record`` and ``predict.tf_record`` paths are built.
        max_seq_length: Forwarded to the feature conversion.
        batch_size: Only reported in the progress output.

    Returns:
        A ``(train_input, eval_input, predict_input)`` tuple; an entry stays
        ``None`` when its split was not requested.
    """
    train_input = None
    eval_input = None
    predict_input = None
    processor = MyProcessor()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                           do_lower_case=True)
    if do_train:
        # train_examples is a list; each element is an
        # InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
        train_examples = processor.get_train_examples(data_dir)
        train_file = os.path.join(output_dir, "train.tf_record")
        # InputExample ---> features
        train_input = file_based_convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer, train_file)
        print('***start to training**')
        # print() does not interpolate like a logger, so format explicitly.
        print('  Number training examples  %d' % len(train_examples))
        print('   Batch size %d' % batch_size)
    if do_eval:
        eval_examples = processor.get_dev_examples(data_dir)
        eval_file = os.path.join(output_dir, "eval.tf_record")
        eval_input = file_based_convert_examples_to_features(
            eval_examples, label_list, max_seq_length, tokenizer, eval_file)
        print('***start to validation**')
        print('  Number validate examples  %d' % len(eval_examples))
        print('   Batch size %d' % batch_size)
    if do_predict:
        predict_examples = processor.get_test_examples(data_dir)
        predict_file = os.path.join(output_dir, "predict.tf_record")
        predict_input = file_based_convert_examples_to_features(
            predict_examples, label_list, max_seq_length, tokenizer,
            predict_file)
        print('***start to predict**')
        print('  Number predict examples %d' % len(predict_examples))
        print('   Batch size  %d' % batch_size)
    # NOTE(review): the original comment states that the returned inputs are
    # all lists; confirm against file_based_convert_examples_to_features.
    return train_input, eval_input, predict_input
def load_model():
    """Build the TPU estimator and the tokenizer.

    All settings (paths, TPU options, hyper-parameters, ``bert_config``,
    ``label_list``) are read from names defined outside this function.

    Returns:
        An ``(estimator, tokenizer)`` tuple.
    """
    # Train, eval and predict all use the same fixed batch size.
    fixed_batch_size = 8

    tpu_settings = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=iterations_per_loop,
        num_shards=num_tpu_cores,
        per_host_input_for_training=(
            tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    estimator_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=master,
        model_dir=output_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        tpu_config=tpu_settings)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    classifier_fn = run_classifier.model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=init_checkpoint,
        learning_rate=learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=use_tpu,
        use_one_hot_embeddings=use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=use_tpu,
        model_fn=classifier_fn,
        config=estimator_config,
        train_batch_size=fixed_batch_size,
        eval_batch_size=fixed_batch_size,
        predict_batch_size=fixed_batch_size)
    return estimator, tokenizer
def main(_):
    """Fine-tune, evaluate and/or run prediction with a BERT classifier.

    Everything is driven by ``FLAGS``; the positional argument is ignored.
    Depending on ``do_train`` / ``do_eval`` / ``do_predict`` the function
    writes ``train.tf_record``, ``eval.tf_record`` / ``eval_results.txt`` and
    ``predict.tf_record`` / ``test_results.tsv`` under ``FLAGS.output_dir``.

    Raises:
        ValueError: If no ``do_*`` flag is set, if ``FLAGS.max_seq_length``
            exceeds the model's ``max_position_embeddings``, or if
            ``FLAGS.task_name`` is not a registered task.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    task_processors = {"fake_news": FakeNewsProcessor}

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not (FLAGS.do_train or FLAGS.do_eval or FLAGS.do_predict):
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    max_positions = bert_config.max_position_embeddings
    if FLAGS.max_seq_length > max_positions:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, max_positions))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()
    processor_cls = task_processors.get(task_name)
    if processor_cls is None:
        raise ValueError("Task not found: %s" % task_name)
    processor = processor_cls()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # A cluster resolver is only created when a named TPU is requested.
    if FLAGS.use_tpu and FLAGS.tpu_name:
        cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        cluster_resolver = None

    tpu_settings = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=(
            tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    run_config = tf.contrib.tpu.RunConfig(
        cluster=cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tpu_settings)

    # The step counts stay None unless training is requested.
    train_examples = num_train_steps = num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    classifier_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # Without a TPU this estimator falls back to the normal Estimator
    # behaviour on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=classifier_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(
            train_examples, label_list, FLAGS.max_seq_length, tokenizer,
            train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        estimator.train(
            input_fn=file_based_input_fn_builder(
                input_file=train_file,
                seq_length=FLAGS.max_seq_length,
                is_training=True,
                drop_remainder=True),
            max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # A TPU needs every batch to have the same size, so the example
            # count is rounded up to a multiple of the batch size with fake
            # examples. They do not influence the metrics because each one
            # is given a per-instance weight of 0.0.
            while len(eval_examples) % FLAGS.eval_batch_size:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
            eval_file)

        num_eval_total = len(eval_examples)
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        num_eval_total, num_actual_eval_examples,
                        num_eval_total - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # None lets the estimator consume the whole set; on a TPU the number
        # of steps has to be given explicitly.
        eval_steps = None
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=bool(FLAGS.use_tpu))

        eval_metrics = estimator.evaluate(input_fn=eval_input_fn,
                                          steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for metric_name in sorted(eval_metrics):
                metric_text = str(eval_metrics[metric_name])
                tf.logging.info("  %s = %s", metric_name, metric_text)
                writer.write("%s = %s\n" % (metric_name, metric_text))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # Same fixed-batch-size constraint as for evaluation: pad with
            # fake examples, which are skipped when results are written.
            while len(predict_examples) % FLAGS.predict_batch_size:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                predict_file)

        num_predict_total = len(predict_examples)
        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        num_predict_total, num_actual_predict_examples,
                        num_predict_total - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=bool(FLAGS.use_tpu))

        predictions = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for index, prediction in enumerate(predictions):
                probabilities = prediction["probabilities"]
                # Everything past the real examples is TPU padding.
                if index >= num_actual_predict_examples:
                    break
                columns = [str(class_probability)
                           for class_probability in probabilities]
                writer.write("\t".join(columns) + "\n")
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
Beispiel #6
0
def main(input_data, task_name):
  """Classify ``input_data`` with the checkpoint that belongs to ``task_name``.

  Args:
    input_data: Passed to the task processor's ``get_test_examples``.
    task_name: ``'sim'`` or ``'sent'``; selects both the processor and the
      initial checkpoint (``FLAGS.sim_init_checkpoint`` or
      ``FLAGS.sent_init_checkpoint``).

  Returns:
    A ``(classif, label_list)`` tuple. ``classif`` holds the
    ``"probabilities"`` entry of every real (non-padding) prediction and is
    empty when ``FLAGS.do_predict`` is not set; ``label_list`` comes from the
    processor's ``get_labels``.

  Raises:
    ValueError: If no ``do_*`` flag is set, if ``FLAGS.max_seq_length``
      exceeds the model's ``max_position_embeddings``, or if ``task_name``
      is not a registered task.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "sim": SimProcessor,
      "sent": SentProcessor
  }
  classif = []

  # Resolve the task-specific checkpoint once so that the validation and the
  # model function below cannot drift apart.
  init_checkpoint = None
  if task_name == 'sim':
    init_checkpoint = FLAGS.sim_init_checkpoint
  elif task_name == 'sent':
    init_checkpoint = FLAGS.sent_init_checkpoint

  # Unknown tasks are rejected further down, after the other sanity checks,
  # so the order in which errors surface is the same as before.
  if task_name in processors:
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()

  label_list = processor.get_labels()
  print(label_list)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  # This entry point never trains, so no step counts are supplied.
  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=None,
      num_warmup_steps=None,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(input_data)
    num_actual_predict_examples = len(predict_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on.
      while len(predict_examples) % FLAGS.predict_batch_size != 0:
        predict_examples.append(PaddingInputExample())

    # The record file is written below, so its directory must exist first.
    tf.gfile.MakeDirs(FLAGS.output_dir)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=bool(FLAGS.use_tpu))

    result = estimator.predict(input_fn=predict_input_fn)
    for index, prediction in enumerate(result):
      # Predictions past the real examples belong to the TPU padding and
      # must not be handed back to the caller.
      if index >= num_actual_predict_examples:
        break
      classif.append(prediction["probabilities"])

  return classif, label_list
Beispiel #7
0
def _group_entity_spans(text, spans, entity_types):
    """Group spans by entity type, then by the text each span covers.

    Each span is indexed as ``span[0]`` (start) and ``span[1]`` (inclusive
    end) into ``text``. The result maps
    ``entity_type -> covered text -> [span, ...]`` and keeps first-seen order.
    """
    grouped = {}
    for span, entity_type in zip(spans, entity_types):
        surface = text[span[0]: span[1] + 1]
        grouped.setdefault(entity_type, {}).setdefault(surface, []).append(span)
    return grouped


def main():
    """Train, evaluate and/or predict with the BERT-based NER tagger.

    Everything is driven by ``FLAGS``. The tf_record files and the
    prediction output ``clue_predict.json`` are written to ``FLAGS.data_dir``;
    ``eval_results.txt`` is written to ``FLAGS.output_dir``. Running with no
    ``do_*`` flag set is allowed and only builds the estimator.

    Raises:
        ValueError: If ``FLAGS.max_seq_length`` exceeds the model's
            ``max_position_embeddings`` or ``FLAGS.task_name`` is not a
            registered task.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cluener": NerProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    tag_list = processor.get_tags()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
    # num_labels = 2 * len(tag_list) + 1: a B- and an I- label per tag, plus
    # the single O label.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=2*len(tag_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.data_dir, "train.tf_record")
        file_based_convert_examples_to_features(
            train_examples, tag_list, FLAGS.max_seq_length, tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # Pad to a multiple of the batch size so that no real example is
            # dropped by drop_remainder on TPU.
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.data_dir, "eval.tf_record")
        file_based_convert_examples_to_features(
            eval_examples, tag_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=bool(FLAGS.use_tpu))

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        # Mapping from predicted label id to tag name.
        tag_ids = {0: 'O', 1: 'B-address', 2: 'I-address', 3: 'B-book', 4: 'I-book',
                   5: 'B-company', 6: 'I-company', 7: 'B-game', 8: 'I-game',
                   9: 'B-government', 10: 'I-government', 11: 'B-movie', 12: 'I-movie',
                   13: 'B-name', 14: 'I-name', 15: 'B-organization', 16: 'I-organization',
                   17: 'B-position', 18: 'I-position', 19: 'B-scene', 20: 'I-scene'}

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # drop_remainder is enabled on TPU, so without padding the last
            # partial batch (and its real examples) would get no prediction.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        test_file = os.path.join(FLAGS.data_dir, "test.tf_record")
        file_based_convert_examples_to_features(predict_examples, tag_list,
                                                FLAGS.max_seq_length, tokenizer,
                                                test_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=test_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=bool(FLAGS.use_tpu))

        results = estimator.predict(input_fn=predict_input_fn)

        # Only the real examples are written; padding entries sit at the end
        # of the list and are cut off here.
        actual_examples = predict_examples[:num_actual_predict_examples]

        output_file = os.path.join(FLAGS.data_dir, 'clue_predict.json')
        with open(output_file, 'w', encoding='utf-8') as writer:
            for example, result in zip(actual_examples, results):
                pre_id = result['predictions']
                text = example.text_a
                # Keep only the tags that line up with the characters of
                # text; position 0 is skipped.
                # NOTE(review): presumably position 0 is the [CLS] token --
                # confirm against the feature conversion.
                tags = [tag_ids[tag] for tag in pre_id[1:len(text) + 1]]
                res_words, res_pos = get_result(text, tags)
                pres = _group_entity_spans(text, res_words, res_pos)
                output_line = json.dumps({'id': example.guid, 'label': pres}, ensure_ascii=False) + '\n'
                writer.write(output_line)
Beispiel #8
0
def main():
    """Train, evaluate and/or predict with a BERT classifier on the "tnews" task.

    All configuration is read from the module-level ``FLAGS``. Depending on
    ``FLAGS.do_train`` / ``FLAGS.do_eval`` / ``FLAGS.do_predict`` this:

    * converts the corresponding examples to ``*.tf_record`` files under
      ``FLAGS.data_dir`` and feeds them to a ``TPUEstimator``;
    * writes evaluation metrics to ``eval_results.txt`` in
      ``FLAGS.output_dir``;
    * writes one JSON object per test example to ``news_predict.json`` in
      ``FLAGS.output_dir``.

    Raises:
        ValueError: if none of the three mode flags is set, if
            ``FLAGS.max_seq_length`` exceeds the model's
            ``max_position_embeddings``, or if ``FLAGS.task_name`` is not a
            known task.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "tnews": TnewsProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # A cluster resolver is only needed when actually targeting a named TPU.
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # The step counts stay None unless training is requested.
    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.data_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # On TPU the input is read with drop_remainder=True, so pad the
            # example list to a whole number of batches; otherwise the
            # trailing examples would be dropped.
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.data_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = bool(FLAGS.use_tpu)
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        # Maps the predicted class index to the numeric label code written to
        # the output file (codes 105 and 111 are absent from this table).
        label_dict = {
            0: 100,
            1: 101,
            2: 102,
            3: 103,
            4: 104,
            5: 106,
            6: 107,
            7: 108,
            8: 109,
            9: 110,
            10: 112,
            11: 113,
            12: 114,
            13: 115,
            14: 116
        }
        # Maps the numeric label code to its textual description.
        label_desc = {
            100: "news_story",
            101: "news_culture",
            102: "news_entertainment",
            103: "news_sports",
            104: "news_finance",
            106: "news_house",
            107: "news_car",
            108: "news_edu",
            109: "news_tech",
            110: "news_military",
            112: "news_travel",
            113: "news_world",
            114: "news_stock",
            115: "news_agriculture",
            116: "news_game"
        }

        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # Same reasoning as in the eval branch: with drop_remainder=True
            # the last partial batch would be discarded, so pad with fake
            # examples and ignore their predictions below.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        test_file = os.path.join(FLAGS.data_dir, "test.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, test_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = bool(FLAGS.use_tpu)
        predict_input_fn = file_based_input_fn_builder(
            input_file=test_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        results = estimator.predict(input_fn=predict_input_fn)

        output_file = os.path.join(FLAGS.output_dir, 'news_predict.json')
        with open(output_file, 'w', encoding='utf-8') as fr:
            for index, result in enumerate(results):
                # Predictions past this point belong to padding examples.
                if index >= num_actual_predict_examples:
                    break
                pre_id = result['predictions']
                print(f'the index is {index} preid is {pre_id}')
                label = label_dict.get(pre_id)
                label_d = label_desc.get(label)

                # One JSON object per line; "id" is the position of the
                # example in the test set.
                json_str = json.dumps({
                    "id": index,
                    "label": str(label),
                    "label_desc": label_d
                })
                fr.write(json_str)
                fr.write('\n')
def main():
    """Training entry point: fine-tune the model in a plain ``tf.Session`` loop.

    Reads its configuration from the module-level ``FLAGS``. The function
    converts the train and dev examples to features, builds the graph through
    ``create_model`` and ``optimization.create_optimizer``, optionally restores
    matching weights from ``FLAGS.init_checkpoint``, then for each epoch runs
    ``do_train``, saves a checkpoint under ``FLAGS.output_dir`` and runs
    ``do_eval``. After the last epoch it calls ``do_predict``.
    """
    # Local import so this block does not depend on a module-level `import os`.
    import os

    tf.logging.info('start to train')

    # Make sure the checkpoint directory exists before the first save.
    tf.gfile.MakeDirs(FLAGS.output_dir)

    # Basic setup: data processor, label set and tokenizer.
    process = AllProcessor()
    label_list = process.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # NOTE(review): the return values are only forwarded to do_train/do_eval;
    # the names suggest example counts - confirm against the helper.
    train_examples = process.get_train_examples(FLAGS.data_dir)
    train_cnt = file_based_convert_examples_to_features(
        train_examples,
        label_list,
        FLAGS.max_seq_length,
        tokenizer,
        FLAGS.data_dir,
        'train'
    )
    dev_examples = process.get_dev_examples(FLAGS.data_dir)
    dev_cnt = file_based_convert_examples_to_features(
        dev_examples,
        label_list,
        FLAGS.max_seq_length,
        tokenizer,
        FLAGS.data_dir,
        'dev'
    )

    # Graph inputs.
    input_ids = tf.placeholder(tf.int64, shape=[None, FLAGS.max_seq_length],
                               name='input_ids')
    input_mask = tf.placeholder(tf.int64, shape=[None, FLAGS.max_seq_length],
                                name='input_mask')
    segment_ids = tf.placeholder(tf.int64, shape=[None, FLAGS.max_seq_length],
                                 name='segment_ids')
    labels = tf.placeholder(tf.int64, shape=[None], name='labels')
    task = tf.placeholder(tf.int64, name='task')

    # BERT configuration.
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # NOTE(review): the positional True / False are presumably is_training and
    # use_one_hot_embeddings - confirm against create_model's signature.
    loss, logits, acc, pre_id = create_model(
        bert_config,
        True,
        input_ids,
        input_mask,
        segment_ids,
        labels,
        False,
        task
    )

    # The optimizer schedule is expressed in optimizer steps, so the warmup
    # length must be a proportion of the total number of steps (not of the
    # number of examples).
    steps_per_epoch = int(len(train_examples) / FLAGS.train_batch_size)
    total_train_steps = steps_per_epoch * FLAGS.num_train_epochs
    num_warmup_steps = math.ceil(total_train_steps * FLAGS.warmup_proportion)
    train_op = optimization.create_optimizer(
        loss,
        FLAGS.learning_rate,
        total_train_steps,
        num_warmup_steps,
        False
    )

    # Variable initialisation; the saver skips the Adam slot variables.
    init_global = tf.global_variables_initializer()
    saver = tf.train.Saver(
        [v for v in tf.global_variables()
         if 'adam_v' not in v.name and 'adam_m' not in v.name])

    with tf.Session() as sess:
        sess.run(init_global)
        print('start to load bert params')
        if FLAGS.init_checkpoint:
            tvars = tf.trainable_variables()
            print("global_variables", len(tvars))
            assignment_map, initialized_variable_names = \
                modeling.get_assignment_map_from_checkpoint(tvars,
                                                            FLAGS.init_checkpoint)
            print("initialized_variable_names:", len(initialized_variable_names))
            # Restore only the trainable variables reported as present in the
            # checkpoint.
            saver_ = tf.train.Saver([v for v in tvars if v.name in initialized_variable_names])
            saver_.restore(sess, FLAGS.init_checkpoint)
            # Everything that was not restored is (re-)initialised.
            tvars = tf.global_variables()
            not_initialized_vars = [v for v in tvars if v.name not in initialized_variable_names]
            print('all size %s; not initialized size %s' % (len(tvars), len(not_initialized_vars)))
            if len(not_initialized_vars):
                sess.run(tf.variables_initializer(not_initialized_vars))
        else:
            print('the bert init checkpoint is None!!!')
            sess.run(tf.global_variables_initializer())

        def train_step(ids, mask, seg, true_y, task_id):
            """Run one optimisation step and return (logits, loss)."""
            feed = {input_ids: ids,
                    input_mask: mask,
                    segment_ids: seg,
                    labels: true_y,
                    task: task_id}
            _, logits_out, loss_out = sess.run([train_op, logits, loss], feed_dict=feed)
            return logits_out, loss_out

        def dev_step(ids, mask, seg, true_y, task_id):
            """Run a forward pass and return (pre_id output, acc output)."""
            feed = {input_ids: ids,
                    input_mask: mask,
                    segment_ids: seg,
                    labels: true_y,
                    task: task_id}
            pre_out, acc_out = sess.run([pre_id, acc], feed_dict=feed)
            return pre_out, acc_out

        # Training loop: one checkpoint and one dev evaluation per epoch.
        checkpoint_prefix = os.path.join(FLAGS.output_dir, 'bert.ckpt')
        for epoch in range(FLAGS.num_train_epochs):
            tf.logging.info(f'start to train and the epoch:{epoch}')
            epoch_loss = do_train(sess, train_cnt, train_step, epoch)
            tf.logging.info(f'the epoch{epoch} loss is {epoch_loss}')
            saver.save(sess, checkpoint_prefix, global_step=epoch)
            do_eval(sess, dev_cnt, dev_step)

        # Run prediction and store the results.
        do_predict(label_list, process, tokenizer, dev_step)

        tf.logging.info('the training is over!!!!')