def get_test_example():
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           spm_model_file=spm_model_file)
    # save data to tf_record
    # test_examples = processor.get_test_examples("")  # test data directory
    test_examples = processor.get_test_examples(data_dir)
    features = get_test_features(test_examples, label_list, max_seq_length, tokenizer)
    return features
def create_tokenizer_from_hub_module(albert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        albert_module = hub.Module(albert_hub_module_handle)
        tokenization_info = albert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
    return tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case,
        spm_model_file=FLAGS.spm_model_file)
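# A minimal sketch of calling the helper above (assumptions: tensorflow_hub is
# installed, FLAGS.spm_model_file is defined by the surrounding script, and the
# hub handle shown is only a placeholder for a real ALBERT module that exposes
# the "tokenization_info" signature).
hub_handle = "https://tfhub.dev/google/albert_base/3"  # assumed handle
tokenizer = create_tokenizer_from_hub_module(hub_handle)
print(tokenizer.tokenize("An example sentence."))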
def transformer(path, s3_path, class_name, model='xlnet', **kwargs):
    check_file(path[model], s3_path[model], **kwargs)
    g = load_graph(path[model]['model'], **kwargs)

    try:
        with open(path[model]['setting']) as fopen:
            nodes = json.load(fopen)
    except Exception:
        raise Exception(
            "model corrupted due to some reasons, please run "
            f"malaya.clear_cache('{class_name}/{model}') and try again"
        )

    if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']:
        if model in ['bert', 'tiny-bert']:
            tokenizer = sentencepiece_tokenizer_bert(
                path[model]['tokenizer'], path[model]['vocab']
            )
        if model in ['albert', 'tiny-albert']:
            from albert import tokenization

            tokenizer = tokenization.FullTokenizer(
                vocab_file=path[model]['vocab'],
                do_lower_case=False,
                spm_model_file=path[model]['tokenizer'],
            )

        return TAGGING_BERT(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=None,
            input_masks=g.get_tensor_by_name('import/Placeholder_1:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )

    if model in ['xlnet', 'alxlnet']:
        tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer'])

        return TAGGING_XLNET(
            X=g.get_tensor_by_name('import/Placeholder:0'),
            segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
            input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
            logits=g.get_tensor_by_name('import/logits:0'),
            sess=generate_session(graph=g, **kwargs),
            tokenizer=tokenizer,
            settings=nodes,
        )
def load(model: str = 'albert', **kwargs):
    """
    Load albert model.

    Parameters
    ----------
    model : str, optional (default='albert')
        Model architecture supported. Allowed values:

        * ``'albert'`` - base albert-bahasa released by Malaya.
        * ``'albert-tiny'`` - tiny albert-bahasa released by Malaya.

    Returns
    -------
    result : malaya.transformers.albert.Model class
    """
    from malaya.path import PATH_ALBERT, S3_PATH_ALBERT
    from malaya.function import check_file

    model = model.lower()
    check_file(PATH_ALBERT[model]['model'], S3_PATH_ALBERT[model], **kwargs)

    if not os.path.exists(PATH_ALBERT[model]['directory'] + 'model.ckpt'):
        import tarfile

        with tarfile.open(PATH_ALBERT[model]['model']['model']) as tar:
            tar.extractall(path=PATH_ALBERT[model]['path'])

    from albert import tokenization

    bert_checkpoint = PATH_ALBERT[model]['directory'] + 'model.ckpt'
    vocab_model = PATH_ALBERT[model]['directory'] + 'sp10m.cased.v10.model'
    vocab = PATH_ALBERT[model]['directory'] + 'sp10m.cased.v10.vocab'
    bert_config = PATH_ALBERT[model]['directory'] + 'config.json'

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab, do_lower_case=False, spm_model_file=vocab_model
    )
    bert_config = modeling.AlbertConfig.from_json_file(bert_config)
    model = Model(bert_config, tokenizer)
    model._saver.restore(model._sess, bert_checkpoint)

    return model
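# A minimal usage sketch based on the docstring above (assumption: this load()
# is exposed as malaya.transformers.albert.load and the released checkpoints
# can be downloaded on first use).
from malaya.transformers import albert

albert_model = albert.load(model='albert')
# the returned Model wraps the ALBERT graph together with its SentencePiece tokenizer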
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case,
        spm_model_file=FLAGS.spm_model_file,
    )

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files,
        tokenizer,
        FLAGS.max_seq_length,
        FLAGS.dupe_factor,
        FLAGS.short_seq_prob,
        FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq,
        rng,
    )

    tf.logging.info("number of instances: %i", len(instances))

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(
        instances,
        tokenizer,
        FLAGS.max_seq_length,
        FLAGS.max_predictions_per_seq,
        output_files,
    )
def sentence_to_idx(self, text):
    """
    Convert a tokenized sentence into its id representation.
    :return:
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)

    text = tokenization.convert_to_unicode(text)
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    segment_id = [0] * len(input_id)

    input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)

    return [input_id], [input_mask], [segment_id]
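# A minimal sketch of calling the method above at prediction time. The class
# name Predictor and its constructor argument are hypothetical; only
# sentence_to_idx itself comes from the snippet above.
predictor = Predictor(vocab_path="albert_model/vocab_chinese.txt")  # hypothetical class
input_ids, input_masks, segment_ids = predictor.sentence_to_idx("an example sentence")
# each returned value is a batch of size 1, ready to feed into the model graph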
def trans_to_index(self, inputs):
    """
    Convert the inputs into index representations.
    :param inputs: input texts
    :return:
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path, do_lower_case=True)
    input_ids = []
    input_masks = []
    segment_ids = []
    for text in inputs:
        text = tokenization.convert_to_unicode(text)
        tokens = tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_ids.append(input_id)
        input_masks.append([1] * len(input_id))
        segment_ids.append([0] * len(input_id))

    return input_ids, input_masks, segment_ids
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            contents = "".join([six.ensure_str(x) + "\n" for x in vocab_tokens])
            vocab_writer.write(six.ensure_binary(contents, "utf-8"))

        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
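# By contrast with the WordPiece path exercised in the test above, FullTokenizer
# can also be backed by a SentencePiece model, as the other snippets in this
# collection do. A minimal sketch (the .vocab/.model paths are assumptions; any
# ALBERT SentencePiece export should work):
from albert import tokenization

sp_tokenizer = tokenization.FullTokenizer(
    vocab_file="30k-clean.vocab",        # assumed path
    do_lower_case=True,
    spm_model_file="30k-clean.model",    # assumed path
)
pieces = sp_tokenizer.tokenize("unwanted, running")
ids = sp_tokenizer.convert_tokens_to_ids(pieces)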
def make_tf_record(output_dir, data_dir, vocab_file, spm_model_file):
    tf.gfile.MakeDirs(output_dir)  # "model/bert"
    processor = processors[task_name]()  # "atec"
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           spm_model_file=spm_model_file)
    train_file = os.path.join(output_dir, "train.tf_record")
    eval_file = os.path.join(output_dir, "eval.tf_record")

    # save data to tf_record
    if not os.path.isfile(train_file):
        train_examples = processor.get_train_examples(data_dir)
        file_based_convert_examples_to_features(
            train_examples, label_list, max_seq_length, tokenizer, train_file, task_name)
        del train_examples

    # eval data
    if not os.path.isfile(eval_file):
        eval_examples = processor.get_dev_examples(data_dir)
        file_based_convert_examples_to_features(
            eval_examples, label_list, max_seq_length, tokenizer, eval_file, task_name)
        del eval_examples
def main(_): tf.logging.set_verbosity(tf.logging.INFO) albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) validate_flags_or_throw(albert_config) tf.gfile.MakeDirs(FLAGS.output_dir) tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int(min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None train_examples = squad_utils.read_squad_examples( input_file=FLAGS.train_file, is_training=True) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) if FLAGS.do_train: num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) # Pre-shuffle the input to avoid having to make a very large shuffle # buffer in in the `input_fn`. rng = random.Random(12345) rng.shuffle(train_examples) model_fn = squad_utils.v1_model_fn_builder( albert_config=albert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: # We write to a temporary file to avoid storing very large constant tensors # in memory. 
if not tf.gfile.Exists(FLAGS.train_feature_file): train_writer = squad_utils.FeatureWriter( filename=os.path.join(FLAGS.train_feature_file), is_training=True) squad_utils.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=True, output_fn=train_writer.process_feature, do_lower_case=FLAGS.do_lower_case) train_writer.close() tf.logging.info("***** Running training *****") tf.logging.info(" Num orig examples = %d", len(train_examples)) # tf.logging.info(" Num split examples = %d", train_writer.num_features) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) del train_examples train_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.train_feature_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size, is_v2=False) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_predict: with tf.gfile.Open(FLAGS.predict_file) as predict_file: prediction_json = json.load(predict_file)["data"] eval_examples = squad_utils.read_squad_examples( input_file=FLAGS.predict_file, is_training=False) if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists( FLAGS.predict_feature_left_file)): tf.logging.info("Loading eval features from {}".format( FLAGS.predict_feature_left_file)) with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin: eval_features = pickle.load(fin) else: eval_writer = squad_utils.FeatureWriter( filename=FLAGS.predict_feature_file, is_training=False) eval_features = [] def append_feature(feature): eval_features.append(feature) eval_writer.process_feature(feature) squad_utils.convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=False, output_fn=append_feature, do_lower_case=FLAGS.do_lower_case) eval_writer.close() with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: pickle.dump(eval_features, fout) tf.logging.info("***** Running predictions *****") tf.logging.info(" Num orig examples = %d", len(eval_examples)) tf.logging.info(" Num split examples = %d", len(eval_features)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.predict_feature_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size, is_v2=False) def get_result(checkpoint): """Evaluate the checkpoint on SQuAD 1.0.""" # If running eval on the TPU, you will need to specify the number of # steps. 
reader = tf.train.NewCheckpointReader(checkpoint) global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) all_results = [] for result in estimator.predict( predict_input_fn, yield_single_examples=True, checkpoint_path=checkpoint): if len(all_results) % 1000 == 0: tf.logging.info("Processing example: %d" % (len(all_results))) unique_id = int(result["unique_ids"]) start_log_prob = [float(x) for x in result["start_log_prob"].flat] end_log_prob = [float(x) for x in result["end_log_prob"].flat] all_results.append( squad_utils.RawResult( unique_id=unique_id, start_log_prob=start_log_prob, end_log_prob=end_log_prob)) output_prediction_file = os.path.join( FLAGS.output_dir, "predictions.json") output_nbest_file = os.path.join( FLAGS.output_dir, "nbest_predictions.json") result_dict = {} squad_utils.accumulate_predictions_v1( result_dict, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length) predictions = squad_utils.write_predictions_v1( result_dict, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, output_prediction_file, output_nbest_file) return squad_utils.evaluate_v1( prediction_json, predictions), int(global_step) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") key_name = "f1" writer = tf.gfile.GFile(output_eval_file, "w") if tf.gfile.Exists(checkpoint_path + ".index"): result = get_result(checkpoint_path) best_perf = result[0][key_name] global_step = result[1] else: global_step = -1 best_perf = -1 checkpoint_path = None while global_step < num_train_steps: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info("Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info("found 0 file, global step: {}. Sleeping." 
.format(global_step)) time.sleep(1) else: for ele in sorted(steps_and_files.items()): step, checkpoint_path = ele if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result, global_step = get_result(checkpoint_path) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) writer.write("best {} = {}\n".format(key_name, best_perf)) tf.logging.info(" best {} = {}\n".format(key_name, best_perf)) if len(_find_valid_cands(global_step)) > 2: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result, global_step = get_result(checkpoint_path) tf.logging.info("***** Final Eval results *****\n") writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format(FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) if num_train_steps and num_warmup_steps: writer.write("Training steps: {}\n".format(num_train_steps)) writer.write("Warmup steps: {}\n".format(num_warmup_steps)) writer.write("===== Evuations =====\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best perf happened at step: {}".format(global_step))
# Author: dgm
# Description: data preprocessing
# Date: 2020-08-14
import math
import codecs
import random

from albert import tokenization
from utils import create_dico, create_mapping, zero_digits

tokenizer = tokenization.FullTokenizer(
    vocab_file='albert_model/albert_base/vocab_chinese.txt', do_lower_case=True)


def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    num = 0
    for line in codecs.open(path, 'r', 'utf8'):
        num += 1
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
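# A minimal sketch of applying the module-level tokenizer above to the loaded
# sentences (assumptions: the training-file path is hypothetical, and each item
# in a sentence is a word/tag pair with the word at index 0, as the DOCSTART
# check in load_sentences suggests).
sentences = load_sentences("data/train.txt", lower=True, zeros=True)  # assumed path
for sentence in sentences[:1]:
    words = [token[0] for token in sentence]
    pieces = tokenizer.tokenize(" ".join(words))
    ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])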
def trans_to_features(self, examples, is_training):
    """
    Convert the inputs into index representations.
    :param examples: input examples
    :param is_training:
    :return:
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path, do_lower_case=True)
    features = []
    unique_id = 1000000000
    for (example_index, example) in enumerate(examples):
        # tokenize the query with the wordpiece tokenizer
        query_tokens = tokenizer.tokenize(example['question'])
        # cap the query at a maximum length
        if len(query_tokens) > self.__query_length:
            query_tokens = query_tokens[:self.__query_length]

        # Build indices for the context. Earlier, Chinese characters, punctuation,
        # whitespace, digit runs and English words were split and stored in doc_tokens,
        # but the bert tokenizer further splits digit runs, Chinese and English text
        # into subwords, so the tokens obtained after bert tokenization differ from the
        # doc_tokens obtained before. We therefore still need to map the start and end
        # positions from their places in doc_tokens onto the current tokens.
        tok_to_orig_index = []  # indices of the original (unsplit) tokens; same length as all_doc_tokens below
        orig_to_tok_index = []  # indices of the subword tokens; not contiguous, gaps can appear
        all_doc_tokens = []  # the subword tokens; in theory longer than all_tokens
        for (i, token) in enumerate(example['doc_tokens']):
            sub_tokens = tokenizer.tokenize(token)
            # orig_to_tok_index has the same length as doc_tokens; each entry stores the start and
            # end indices of that doc_tokens token within all_doc_tokens, and is used to move the
            # start/end positions from all_token onto all_doc_tokens
            orig_to_tok_index.append([len(all_doc_tokens)])
            for sub_token in sub_tokens:
                # tok_to_orig_index has the same length as all_doc_tokens and contains repeated values
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)
            orig_to_tok_index[-1].append(len(all_doc_tokens) - 1)

        tok_start_position = -1
        tok_end_position = -1
        if is_training:
            # mapping from the original tokens to the new tokens; this is the new start position
            tok_start_position = orig_to_tok_index[
                example['start_position']][0]
            tok_end_position = orig_to_tok_index[
                example['end_position']][1]
            tok_start_position, tok_end_position = self._improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example['orig_answer_text'])

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = self.__max_length - len(query_tokens) - 3

        doc_spans = []
        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
        # With bert the maximum sequence length is usually capped at 512, so a context that is
        # longer than the maximum length must be split into several spans. A sliding window is
        # used; the window is smaller than the maximum length, so neighbouring spans share
        # overlapping sub-segments.
        start_offset = 0  # start position of the extracted span
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            # when the remaining length exceeds the budget, the sliding window is needed
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):  # holds when length < max_len
                break
            start_offset += min(length, self.__doc_stride)

        # combine the query and a context span into one sequence fed into bert
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            # Because spans share overlapping sub-segments, the same token can matter more in one
            # span than in another. Its importance is decided by the amount of surrounding
            # context, and at prediction time a token that appears in two spans only keeps the
            # score from the span where its importance is higher.
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                # map the index inside the current span's sentence pair back to the original token index
                token_to_orig_map[len(tokens)] = tok_to_orig_index[
                    split_token_index]
                # When the sliding window produces several spans, some words appear in more than
                # one span, but only one span can be counted in the end. The authors score each
                # occurrence by the number of surrounding context words and keep the span with
                # the highest score.
                is_max_context = self._check_is_max_context(
                    doc_spans, doc_span_index, split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < self.__max_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == self.__max_length
            assert len(input_mask) == self.__max_length
            assert len(segment_ids) == self.__max_length

            start_position = -1
            end_position = -1
            if is_training:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                if tok_start_position == -1 and tok_end_position == -1:
                    start_position = 0  # the question has no answer; 0 is the position of [CLS]
                    end_position = 0
                else:  # if the question originally has an answer, drop the features that miss it
                    out_of_span = False
                    doc_start = doc_span.start  # start and end mapped back to the original text
                    doc_end = doc_span.start + doc_span.length - 1
                    if not (tok_start_position >= doc_start and
                            tok_end_position <= doc_end):
                        # this window misses the answer and serves as no-answer augmentation
                        out_of_span = True
                    if out_of_span:
                        start_position = 0
                        end_position = 0
                    else:
                        doc_offset = len(query_tokens) + 2
                        start_position = tok_start_position - doc_start + doc_offset
                        end_position = tok_end_position - doc_start + doc_offset

            features.append({
                'unique_id': unique_id,
                'example_index': example_index,
                'doc_span_index': doc_span_index,
                'tokens': tokens,
                'token_to_orig_map': token_to_orig_map,
                'token_is_max_context': token_is_max_context,
                'input_ids': input_ids,
                'input_mask': input_mask,
                'segment_ids': segment_ids,
                'start_position': start_position,
                'end_position': end_position
            })
            unique_id += 1

    return features
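# A small self-contained illustration of the sliding-window logic used above:
# with a 10-token document, a 6-token budget, and a stride of 4, the document is
# split into overlapping spans. The numbers are toy values, not the defaults of
# this project; only the span-building rule itself mirrors the loop above.
import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_tokens, max_tokens_for_doc, doc_stride):
    doc_spans = []
    start_offset = 0
    while start_offset < num_tokens:
        length = min(num_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

print(make_doc_spans(10, 6, 4))  # [DocSpan(start=0, length=6), DocSpan(start=4, length=6)]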
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, sp_cdc_file=FLAGS.cdc_spm_model_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, keep_checkpoint_max=8, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder(albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: if FLAGS.data_examples: tf.gfile.MakeDirs(FLAGS.data_examples) train_file = os.path.join(FLAGS.data_examples, "train.tf_record") else: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) tf.logging.set_verbosity(tf.logging.INFO) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) if FLAGS.data_examples: tf.gfile.MakeDirs(FLAGS.data_examples) eval_file = os.path.join(FLAGS.data_examples, "eval.tf_record") else: eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) if FLAGS.data_examples: tf.gfile.MakeDirs(FLAGS.data_examples) predict_file = os.path.join(FLAGS.data_examples, "predict.tf_record") else: predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) actual_label = label_list[int(prediction["predictions"])] sub_writer.write( six.ensure_str(example.guid) + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = CommonsenseQAProcessor if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") if not FLAGS.albert_config_file: raise ValueError("At least one of `--albert_config_file`must be set") if FLAGS.albert_config_file: albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) else: albert_config = None # Get the config from TF-Hub. tf.gfile.MakeDirs(FLAGS.output_dir) processor = processors( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int(min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), keep_checkpoint_max=0, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps= num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, optimizer=FLAGS.optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir train_file = os.path.join(cached_dir, "train.tf_record") if not tf.gfile.Exists(train_file): file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(classifier_utils.PaddingInputExample()) cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir eval_file = os.path.join(cached_dir, "eval.tf_record") if not tf.gfile.Exists(eval_file): file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt") def _best_trial_info(): """Returns information about which checkpoints have been evaled so far.""" if tf.gfile.Exists(best_trial_info_file): with tf.gfile.GFile(best_trial_info_file, "r") as best_info: global_step, best_metric_global_step, metric_value = ( best_info.read().split(":")) global_step = int(global_step) best_metric_global_step = int(best_metric_global_step) metric_value = float(metric_value) else: metric_value = -1 best_metric_global_step = -1 global_step = -1 tf.logging.info( "Best trial info: Step: %s, Best Value Step: %s, " "Best Value: %s", global_step, best_metric_global_step, metric_value) return global_step, best_metric_global_step, metric_value def _remove_checkpoint(checkpoint_path): for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") global_step, best_perf_global_step, best_perf = _best_trial_info() writer = tf.gfile.GFile(output_eval_file, "w") while global_step < num_train_steps: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info("Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info("found 0 file, global step: {}. Sleeping." 
.format(global_step)) time.sleep(60) else: for checkpoint in sorted(steps_and_files.items()): step, checkpoint_path = checkpoint if global_step >= step: if (best_perf_global_step != step and len(_find_valid_cands(step)) > 1): _remove_checkpoint(checkpoint_path) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result["eval_accuracy"] > best_perf: best_perf = result["eval_accuracy"] best_perf_global_step = global_step elif len(_find_valid_cands(global_step)) > 1: _remove_checkpoint(checkpoint_path) writer.write("=" * 50 + "\n") writer.flush() with tf.gfile.GFile(best_trial_info_file, "w") as best_info: best_info.write("{}:{}:{}".format( global_step, best_perf_global_step, best_perf)) writer.close() for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext) tgt_ckpt = "model.ckpt-best.{}".format(ext) tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) tf.io.gfile.rename( os.path.join(FLAGS.output_dir, src_ckpt), os.path.join(FLAGS.output_dir, tgt_ckpt), overwrite=True) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(classifier_utils.PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.predict( input_fn=predict_input_fn, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.csv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.csv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: sub_writer.write("index" + "\t" + "prediction\n") num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) actual_label = label_list[int(prediction["predictions"])] sub_writer.write(example.guid + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs): """ Load Transformer toxicity model. Parameters ---------- model : str, optional (default='bert') Model architecture supported. Allowed values: * ``'bert'`` - Google BERT BASE parameters. * ``'tiny-bert'`` - Google BERT TINY parameters. * ``'albert'`` - Google ALBERT BASE parameters. * ``'tiny-albert'`` - Google ALBERT TINY parameters. * ``'xlnet'`` - Google XLNET BASE parameters. * ``'alxlnet'`` - Malaya ALXLNET BASE parameters. quantized : bool, optional (default=False) if True, will load 8-bit quantized model. Quantized model not necessary faster, totally depends on the machine. Returns ------- result : malaya.model.bert.SIGMOID_BERT class """ model = model.lower() if model not in _transformer_availability: raise Exception( 'model not supported, please check supported models from `malaya.toxicity.available_transformer()`.' ) check_file( PATH_TOXIC[model], S3_PATH_TOXIC[model], quantized = quantized, **kwargs ) if quantized: model_path = 'quantized' else: model_path = 'model' g = load_graph(PATH_TOXIC[model][model_path], **kwargs) path = PATH_TOXIC if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert( path[model]['tokenizer'], path[model]['vocab'] ) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file = path[model]['vocab'], do_lower_case = False, spm_model_file = path[model]['tokenizer'], ) return SIGMOID_BERT( X = g.get_tensor_by_name('import/Placeholder:0'), segment_ids = None, input_masks = g.get_tensor_by_name('import/Placeholder_1:0'), logits = g.get_tensor_by_name('import/logits:0'), logits_seq = g.get_tensor_by_name('import/logits_seq:0'), vectorizer = g.get_tensor_by_name('import/dense/BiasAdd:0'), sess = generate_session(graph = g, **kwargs), tokenizer = tokenizer, label = label, attns = _extract_attention_weights_import( bert_num_layers[model], g ), class_name = 'toxic', ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return SIGMOID_XLNET( X = g.get_tensor_by_name('import/Placeholder:0'), segment_ids = g.get_tensor_by_name('import/Placeholder_1:0'), input_masks = g.get_tensor_by_name('import/Placeholder_2:0'), logits = g.get_tensor_by_name('import/logits:0'), logits_seq = g.get_tensor_by_name('import/logits_seq:0'), vectorizer = g.get_tensor_by_name('import/transpose_3:0'), sess = generate_session(graph = g, **kwargs), tokenizer = tokenizer, label = label, attns = _extract_attention_weights_import(g), class_name = 'toxic', )
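# A minimal usage sketch based on the docstring above (assumptions: this loader
# is exposed as malaya.toxicity.transformer, as its error message suggests, and
# the model files can be downloaded on first call).
import malaya

toxicity_model = malaya.toxicity.transformer(model='albert', quantized=False)
# the returned SIGMOID_BERT / SIGMOID_XLNET object can then score raw strings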
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "cola": classifier_utils.ColaProcessor, "mnli": classifier_utils.MnliProcessor, "mismnli": classifier_utils.MisMnliProcessor, "mrpc": classifier_utils.MrpcProcessor, "rte": classifier_utils.RteProcessor, "sst-2": classifier_utils.Sst2Processor, "sts-b": classifier_utils.StsbProcessor, "qqp": classifier_utils.QqpProcessor, "qnli": classifier_utils.QnliProcessor, "wnli": classifier_utils.WnliProcessor, } tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int( min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), keep_checkpoint_max=0, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) model_fn = classifier_utils.model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.train_step, num_warmup_steps=FLAGS.warmup_step, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, task_name=task_name, optimizer=FLAGS.optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir train_file = os.path.join(cached_dir, task_name + "_train.tf_record") if not tf.gfile.Exists(train_file): classifier_utils.file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, task_name) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", FLAGS.train_step) train_input_fn = classifier_utils.file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(classifier_utils.PaddingInputExample()) cached_dir = FLAGS.cached_dir if not cached_dir: cached_dir = FLAGS.output_dir eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record") if not tf.gfile.Exists(eval_file): classifier_utils.file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, task_name) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = classifier_utils.file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.eval_batch_size) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") if task_name == "sts-b": key_name = "pearson" elif task_name == "cola": key_name = "matthew_corr" else: key_name = "eval_accuracy" if tf.gfile.Exists(checkpoint_path + ".index"): result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) best_perf = result[key_name] global_step = result["global_step"] else: global_step = -1 best_perf = -1 checkpoint_path = None writer = tf.gfile.GFile(output_eval_file, "w") while global_step < FLAGS.train_step: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. 
Sleeping.".format( global_step)) time.sleep(1) else: for checkpoint in sorted(steps_and_files.items()): step, checkpoint_path = checkpoint if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) if len(_find_valid_cands(global_step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") writer.close() if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(classifier_utils.PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") classifier_utils.file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, task_name) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = classifier_utils.file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.predict(input_fn=predict_input_fn, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv") with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\ tf.gfile.GFile(output_submit_file, "w") as sub_writer: sub_writer.write("index" + "\t" + "prediction\n") num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, (example, prediction)) in\ enumerate(zip(predict_examples, result)): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" pred_writer.write(output_line) if task_name != "sts-b": actual_label = label_list[int(prediction["predictions"])] else: actual_label = str(prediction["predictions"]) sub_writer.write(example.guid + "\t" + actual_label + "\n") num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = {"race": race_utils.RaceProcessor} tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) albert_config.hidden_dropout_prob = FLAGS.albert_dropout_prob albert_config.attention_probs_dropout_prob = FLAGS.albert_dropout_prob if FLAGS.max_seq_length > albert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the ALBERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, albert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]( use_spm=True if FLAGS.spm_model_file else False, do_lower_case=FLAGS.do_lower_case, high_only=FLAGS.high_only, middle_only=FLAGS.middle_only) label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) # multiple gpus NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1 using_customized_optimizer = None if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror": os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(i) for i in list(range(NUM_GPUS))]) # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=NUM_GPUS, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=NUM_GPUS), ) using_customized_optimizer = True tf.logging.info('Use MirroredStrategy with %d devices.', strategy.num_replicas_in_sync) else: strategy = tf.distribute.OneDeviceStrategy("GPU:0") using_customized_optimizer = False tf.logging.info('Single device mode.') tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int( min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=int(FLAGS.save_checkpoints_steps), keep_checkpoint_max=0, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), train_distribute=strategy, eval_distribute=strategy, #get error during evaluation ) train_examples = None total_time = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) model_fn = race_utils.model_fn_builder( albert_config=albert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=FLAGS.train_step, num_warmup_steps=FLAGS.warmup_step, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, max_seq_length=FLAGS.max_seq_length, dropout_prob=FLAGS.dropout_prob, customized=using_customized_optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
if FLAGS.use_tpu and FLAGS.tpu_name: tf.logging.info("Use TPUEstimator") estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) else: tf.logging.info("Use normal Estimator") estimator = Estimator( model_fn=model_fn, params={}, config=run_config, ) if FLAGS.do_train: if not tf.gfile.Exists(FLAGS.train_feature_file): race_utils.file_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.train_feature_file, FLAGS.max_qa_length) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", FLAGS.train_step) train_input_fn = classifier_utils.file_based_input_fn_builder( input_file=FLAGS.train_feature_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size, multiple=len(label_list)) time_hist = TimeHistory() estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step) total_time = sum(time_hist.times) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(classifier_utils.PaddingInputExample()) if not tf.gfile.Exists(FLAGS.eval_feature_file): race_utils.file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.eval_feature_file, FLAGS.max_qa_length) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. 
if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = classifier_utils.file_based_input_fn_builder( input_file=FLAGS.eval_feature_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.eval_batch_size, multiple=len(label_list)) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") key_name = "eval_accuracy" if tf.gfile.Exists(checkpoint_path + ".index"): result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) best_perf = result[key_name] global_step = result["global_step"] else: global_step = -1 best_perf = -1 checkpoint_path = None writer = tf.gfile.GFile(output_eval_file, "w") avg_time_per_batch = np.mean(time_hist.times) writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format( FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Max qa length: {}\n".format(FLAGS.max_qa_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) writer.write("Num of GPU cores: {}\n".format(NUM_GPUS)) if FLAGS.do_train: avg_time_per_batch = np.mean(time_hist.times) writer.write("Total time: {}\n".format(total_time)) writer.write("Speed: {}\n".format(FLAGS.train_batch_size * NUM_GPUS / avg_time_per_batch)) if FLAGS.train_step and FLAGS.warmup_step: writer.write("Training steps: {}\n".format(FLAGS.train_step)) writer.write("Warmup steps: {}\n".format(FLAGS.warmup_step)) while global_step < FLAGS.train_step: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename tf.logging.info("found {} files.".format(len(steps_and_files))) # steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. 
Sleeping.".format( global_step)) time.sleep(1) else: for ele in sorted(steps_and_files.items()): step, checkpoint_path = ele if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=checkpoint_path) global_step = result["global_step"] tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best = {}\n".format(best_perf)) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) if len(_find_valid_cands(global_step)) > 1: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") writer.close() if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.data_dir) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(classifier_utils.PaddingInputExample()) assert len(predict_examples) % FLAGS.predict_batch_size == 0 predict_steps = int( len(predict_examples) // FLAGS.predict_batch_size) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") race_utils.file_based_convert_examples_to_features( predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, FLAGS.max_qa_length) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = classifier_utils.file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, task_name=task_name, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size, multiple=len(label_list)) checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result = estimator.evaluate(input_fn=predict_input_fn, steps=predict_steps, checkpoint_path=checkpoint_path) output_predict_file = os.path.join(FLAGS.output_dir, "predict_results.txt") with tf.gfile.GFile(output_predict_file, "w") as pred_writer: # num_written_lines = 0 tf.logging.info("***** Predict results *****") pred_writer.write("***** Predict results *****\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) pred_writer.write("%s = %s\n" % (key, str(result[key]))) pred_writer.write("best = {}\n".format(best_perf))
def _transformer(model, bert_class, xlnet_class, **kwargs): model = model.lower() if model not in _availability: raise Exception( 'model not supported, please check supported models from malaya.similarity.available_transformer()' ) check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs) g = load_graph(PATH_SIMILARITY[model]['model'], **kwargs) path = PATH_SIMILARITY if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab']) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file=path[model]['vocab'], do_lower_case=False, spm_model_file=path[model]['tokenizer'], ) return bert_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=['not similar', 'similar'], ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return xlnet_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=['not similar', 'similar'], )
def main(_): tf.logging.set_verbosity(tf.logging.INFO) albert_config = modeling.AlbertConfig.from_json_file( FLAGS.albert_config_file) validate_flags_or_throw(albert_config) tf.gfile.MakeDirs(FLAGS.output_dir) tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, spm_model_file=FLAGS.spm_model_file) # multiple gpus NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1 using_customized_optimizer = None if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror": os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(i) for i in list(range(NUM_GPUS))]) # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263 strategy = tf.contrib.distribute.MirroredStrategy( num_gpus=NUM_GPUS, cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=NUM_GPUS), ) using_customized_optimizer = True tf.logging.info('Use MirroredStrategy with %d devices.', strategy.num_replicas_in_sync) else: strategy = tf.distribute.OneDeviceStrategy("GPU:0") using_customized_optimizer = False tf.logging.info('Single device mode.') tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 if FLAGS.do_train: iterations_per_loop = int( min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps)) else: iterations_per_loop = FLAGS.iterations_per_loop run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), train_distribute=strategy, eval_distribute=strategy, #get error during evaluation ) train_examples = None num_train_steps = None num_warmup_steps = None train_examples = squad_utils.read_squad_examples( input_file=FLAGS.train_file, is_training=True) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) if FLAGS.do_train: num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) # Pre-shuffle the input to avoid having to make a very large shuffle # buffer in in the `input_fn`. rng = random.Random(12345) rng.shuffle(train_examples) model_fn = squad_utils.v2_model_fn_builder( albert_config=albert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, max_seq_length=FLAGS.max_seq_length, start_n_top=FLAGS.start_n_top, end_n_top=FLAGS.end_n_top, dropout_prob=FLAGS.dropout_prob, customized=using_customized_optimizer, optimizer=FLAGS.optimizer) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. if FLAGS.use_tpu and FLAGS.tpu_name: tf.logging.info("Use TPUEstimator") estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) else: tf.logging.info("Use normal Estimator") estimator = Estimator( model_fn=model_fn, params={}, config=run_config, ) if FLAGS.do_train: # We write to a temporary file to avoid storing very large constant tensors # in memory. 
if not tf.gfile.Exists(FLAGS.train_feature_file): train_writer = squad_utils.FeatureWriter(filename=os.path.join( FLAGS.train_feature_file), is_training=True) squad_utils.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=True, output_fn=train_writer.process_feature, do_lower_case=FLAGS.do_lower_case) train_writer.close() tf.logging.info("***** Running training *****") tf.logging.info(" Num orig examples = %d", len(train_examples)) # tf.logging.info(" Num split examples = %d", train_writer.num_features) tf.logging.info( f" Batch size = {FLAGS.train_batch_size} * {NUM_GPUS}") tf.logging.info(" Num steps = %d", num_train_steps) del train_examples train_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.train_feature_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, use_tpu=FLAGS.use_tpu, bsz=FLAGS.train_batch_size, is_v2=True) time_hist = TimeHistory() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) total_time = sum(time_hist.times) if FLAGS.do_predict: with tf.gfile.Open(FLAGS.predict_file) as predict_file: prediction_json = json.load(predict_file)["data"] eval_examples = squad_utils.read_squad_examples( input_file=FLAGS.predict_file, is_training=False) if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists(FLAGS.predict_feature_left_file)): tf.logging.info("Loading eval features from {}".format( FLAGS.predict_feature_left_file)) with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin: eval_features = pickle.load(fin) else: eval_writer = squad_utils.FeatureWriter( filename=FLAGS.predict_feature_file, is_training=False) eval_features = [] def append_feature(feature): eval_features.append(feature) eval_writer.process_feature(feature) squad_utils.convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=FLAGS.max_seq_length, doc_stride=FLAGS.doc_stride, max_query_length=FLAGS.max_query_length, is_training=False, output_fn=append_feature, do_lower_case=FLAGS.do_lower_case) eval_writer.close() with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: pickle.dump(eval_features, fout) tf.logging.info("***** Running predictions *****") tf.logging.info(" Num orig examples = %d", len(eval_examples)) tf.logging.info(" Num split examples = %d", len(eval_features)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_input_fn = squad_utils.input_fn_builder( input_file=FLAGS.predict_feature_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=False, use_tpu=FLAGS.use_tpu, bsz=FLAGS.predict_batch_size, is_v2=True) def get_result(checkpoint): """Evaluate the checkpoint on SQuAD v2.0.""" # If running eval on the TPU, you will need to specify the number of # steps. 
reader = tf.train.NewCheckpointReader(checkpoint) global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) all_results = [] for result in estimator.predict(predict_input_fn, yield_single_examples=True, checkpoint_path=checkpoint): if len(all_results) % 1000 == 0: tf.logging.info("Processing example: %d" % (len(all_results))) unique_id = int(result["unique_ids"]) start_top_log_probs = ([ float(x) for x in result["start_top_log_probs"].flat ]) start_top_index = [ int(x) for x in result["start_top_index"].flat ] end_top_log_probs = ([ float(x) for x in result["end_top_log_probs"].flat ]) end_top_index = [int(x) for x in result["end_top_index"].flat] cls_logits = float(result["cls_logits"].flat[0]) all_results.append( squad_utils.RawResultV2( unique_id=unique_id, start_top_log_probs=start_top_log_probs, start_top_index=start_top_index, end_top_log_probs=end_top_log_probs, end_top_index=end_top_index, cls_logits=cls_logits)) output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json") result_dict = {} cls_dict = {} squad_utils.accumulate_predictions_v2( result_dict, cls_dict, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.start_n_top, FLAGS.end_n_top) return squad_utils.evaluate_v2( result_dict, cls_dict, prediction_json, eval_examples, eval_features, all_results, FLAGS.n_best_size, FLAGS.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file), int(global_step) def _find_valid_cands(curr_step): filenames = tf.gfile.ListDirectory(FLAGS.output_dir) candidates = [] for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] idx = ckpt_name.split("-")[-1] if idx != "best" and int(idx) > curr_step: candidates.append(filename) return candidates output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") key_name = "f1" writer = tf.gfile.GFile(output_eval_file, "w") avg_time_per_batch = np.mean(time_hist.times) writer.write("===== Hyperparameters =====\n") writer.write("Training batch size: {}\n".format( FLAGS.train_batch_size)) writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length)) writer.write("Learning rate: {}\n".format(FLAGS.learning_rate)) writer.write("Num of GPU cores: {}\n".format(NUM_GPUS)) if FLAGS.do_train: avg_time_per_batch = np.mean(time_hist.times) writer.write("Total time: {}\n".format(total_time)) writer.write("Speed: {}\n".format(FLAGS.train_batch_size * NUM_GPUS / avg_time_per_batch)) if num_train_steps and num_warmup_steps: writer.write("Training steps: {}\n".format(num_train_steps)) writer.write("Warmup steps: {}\n".format(num_warmup_steps)) if tf.gfile.Exists(checkpoint_path + ".index"): result = get_result(checkpoint_path) best_perf = result[0][key_name] global_step = result[1] else: global_step = -1 best_perf = -1 checkpoint_path = None while global_step < num_train_steps: steps_and_files = {} filenames = tf.gfile.ListDirectory(FLAGS.output_dir) for filename in filenames: if filename.endswith(".index"): ckpt_name = filename[:-6] cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) if cur_filename.split("-")[-1] == "best": continue gstep = int(cur_filename.split("-")[-1]) if gstep not in steps_and_files: tf.logging.info( "Add {} to eval list.".format(cur_filename)) steps_and_files[gstep] = cur_filename 
tf.logging.info("found {} files.".format(len(steps_and_files))) if not steps_and_files: tf.logging.info( "found 0 file, global step: {}. Sleeping.".format( global_step)) time.sleep(1) else: for ele in sorted(steps_and_files.items()): step, checkpoint_path = ele if global_step >= step: if len(_find_valid_cands(step)) > 1: for ext in [ "meta", "data-00000-of-00001", "index" ]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) continue result, global_step = get_result(checkpoint_path) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if result[key_name] > best_perf: best_perf = result[key_name] for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tgt_ckpt = checkpoint_path.rsplit( "-", 1)[0] + "-best.{}".format(ext) tf.logging.info("saving {} to {}".format( src_ckpt, tgt_ckpt)) tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) writer.write("saved {} to {}\n".format( src_ckpt, tgt_ckpt)) writer.write("best {} = {}\n".format(key_name, best_perf)) tf.logging.info(" best {} = {}\n".format( key_name, best_perf)) if len(_find_valid_cands(global_step)) > 2: for ext in ["meta", "data-00000-of-00001", "index"]: src_ckpt = checkpoint_path + ".{}".format(ext) tf.logging.info("removing {}".format(src_ckpt)) tf.gfile.Remove(src_ckpt) writer.write("=" * 50 + "\n") checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") result, global_step = get_result(checkpoint_path) tf.logging.info("***** Final Eval results *****") tf.logging.info(f"num_gpu_cores = {NUM_GPUS}") writer.write("===== Evuations =====\n") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write("best perf happened at step: {}".format(global_step))
def save_as_record(path, data, vocab_path, max_length, fields): path_tag_size = [40, 10, 100] max_ner_size = 64 events = { 'EquityFreeze': 0, 'EquityRepurchase': 1, 'EquityUnderweight': 2, 'EquityOverweight': 3, 'EquityPledge': 4 } events_fields = { 'EquityFreeze': [ 'EquityHolder', 'FrozeShares', 'LegalInstitution', 'TotalHoldingShares', 'TotalHoldingRatio', 'StartDate', 'EndDate', 'UnfrozeDate' ], 'EquityRepurchase': [ 'CompanyName', 'HighestTradingPrice', 'LowestTradingPrice', 'RepurchasedShares', 'ClosingDate', 'RepurchaseAmount' ], 'EquityUnderweight': [ 'EquityHolder', 'TradedShares', 'StartDate', 'EndDate', 'LaterHoldingShares', 'AveragePrice' ], 'EquityOverweight': [ 'EquityHolder', 'TradedShares', 'StartDate', 'EndDate', 'LaterHoldingShares', 'AveragePrice' ], 'EquityPledge': [ 'Pledger', 'PledgedShares', 'Pledgee', 'TotalHoldingShares', 'TotalHoldingRatio', 'TotalPledgedShares', 'StartDate', 'EndDate', 'ReleasedDate' ] } train_writer = tf.python_io.TFRecordWriter(path) vocab = tokenization.FullTokenizer(vocab_path) for x in tqdm(data): # process the raw text sentences, sentences_mask = process(x['sentences'], vocab, max_length) # build the NER tags # start from an empty ner_tag ner_tag = np.zeros(max_length, dtype=np.int32) flag = 0 for w in x['ann_mspan2dranges'].keys(): field = w tag = x['ann_mspan2guess_field'][field] indexs = x['ann_mspan2dranges'][field] for index in indexs: if index[2] > max_length[1] - 1: flag = 1 break ner_tag[index[0]][index[1] + 1] = fields['%s_B' % str(tag).upper()] for i in range(index[1] + 1, index[2]): ner_tag[index[0]][i + 1] = fields['%s_I' % str(tag).upper()] if flag == 1: continue # generate the path tags # dimensions: number of paths, number of fields, number of candidate values path_tag = np.zeros(path_tag_size, dtype=np.int32) + (-1) # stores the complete paths path_entity_list = np.zeros(path_tag_size[:2], dtype=np.int32) + (-1) path_event_type = np.zeros([path_tag_size[0]], dtype=np.int32) + (-1) event_tag = [0 for i in range(5)] # auxiliary data: entity indices ners = [k for k in x['ann_mspan2dranges'].keys()] ner_index = [] ner_list_index = [0] for k in ners: ner_index.extend(x['ann_mspan2dranges'][k]) # ner_index[-1][1] += 1 # ner_index[-1][2] += 1 ner_list_index.append(ner_list_index[-1] + len(x['ann_mspan2dranges'][k])) for k in ner_index: k[1] += 1 k[2] += 1 for i in range(max_ner_size - len(ner_index)): ner_index.append([-1, -1, -1]) for i in range(max_ner_size - len(ner_list_index)): ner_list_index.append(-1) event_tree = {} for e in x['recguid_eventname_eventdict_list']: # build the event tag event_tag[events[e[1]]] = 1 if events[e[1]] not in event_tree.keys(): event_tree[events[e[1]]] = {} et = event_tree[events[e[1]]] # merge records that share the same prefix for f in events_fields[e[1]]: value = e[2][f] if value == None: value = 'NA' if value not in et.keys(): et[value] = {} et = et[value] # create the path tags path_type = np.zeros([10], dtype=np.int32) + (-1) paths = [] for index, k in enumerate(event_tree.keys()): start = len(paths) paths.extend(get_path(event_tree[k])) path_type[start:len(paths)] = k for index2, path in enumerate(paths[start:]): cache = event_tree[k][path[0]] # step past the first node if path[0] != 'NA': path_entity_list[start + index2, 0] = ners.index(path[0]) + 1 else: path_entity_list[start + index2, 0] = 0 for i, p in enumerate(path[1:]): tag = np.array( [0 if f not in cache.keys() else 1 for f in ners], dtype=np.int32) tag = np.concatenate([ tag, np.zeros([path_tag_size[-1] - tag.size], dtype=np.int32) ], axis=0) path_tag[start + index2, i + 1, :] = tag if p != 'NA': path_entity_list[start + index2, i + 1] = ners.index(p) + 1 else: path_entity_list[start + index2, i + 1] = 0 cache = cache[p] path_event_type[start + index2] = k
tag = np.array( [0 if f not in [c[0] for c in paths] else 1 for f in ners], dtype=np.int32) tag = np.concatenate([ tag, np.zeros([path_tag_size[-1] - tag.size], dtype=np.int32) ], axis=0) path_tag[start:len(paths), 0, :] = tag if len(ner_list_index) != max_ner_size: continue if len(ner_index) != max_ner_size: continue # # test # def select_path(path_tag, path_num, path_event_type, path_entity_list): # path_index = np.random.randint(0, path_num[0], size=1, dtype=np.int32)[0] # return path_tag[path_index], path_index, path_event_type[path_index], path_entity_list[path_index] # # path_tag, path_index, path_event_type, path_entity_list = select_path(path_tag, [len(paths)], path_event_type, # path_entity_list) # # # remove the padded entries from ner_index # def select_nert_index(path_entity_list): # size2 = path_entity_list.argmin(axis=0) # return path_entity_list[:size2] # # path_entity_list = select_nert_index(path_entity_list) features = tf.train.Features( feature={ 'sentences': get_byte_feature(sentences), # raw text 'sentences_mask': get_byte_feature(sentences_mask), # raw text length 'event_tag': get_byte_feature(event_tag), # event labels 'ner_tag': get_byte_feature(ner_tag), # entity labels 'path_tag': get_byte_feature(path_tag), # path labels 'ner_list_index': get_byte_feature(ner_list_index), # 'ner_index': get_byte_feature(ner_index), # 'path_event_type': get_int_feature(path_event_type), 'path_num': get_int_feature([len(paths)]), 'path_entity_list': get_byte_feature(path_entity_list) # 'mask1': tf.train.Feature(int64_list=tf.train.Int64List(value=mask1)), }) example = tf.train.Example(features=features) train_writer.write(example.SerializeToString()) train_writer.close()
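save_as_record serializes every example through get_byte_feature and get_int_feature, which are not included in this excerpt. A minimal sketch of what such helpers might look like, assuming the byte variant packs a whole numpy array into one serialized bytes entry and the int variant stores a flat int64 list (both of these are assumptions about helpers that are not shown):

import numpy as np
import tensorflow as tf

def get_byte_feature(value):
    # Hypothetical helper: store the entire array as a single bytes feature.
    arr = np.asarray(value, dtype=np.int32)
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[arr.tobytes()]))

def get_int_feature(value):
    # Hypothetical helper: store a flat sequence of integers directly.
    arr = np.asarray(value, dtype=np.int64).reshape(-1)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=arr.tolist()))

Whatever the real helpers do, the reading side of the pipeline has to mirror it exactly (for example by parsing the bytes back with np.frombuffer and reshaping), which is why the array dtypes and shapes are fixed before writing.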
def transformer(model: str = 'bert', **kwargs): """ Load Transformer similarity model. Parameters ---------- model : str, optional (default='bert') Model architecture supported. Allowed values: * ``'bert'`` - BERT architecture from google. * ``'tiny-bert'`` - BERT architecture from google with smaller parameters. * ``'albert'`` - ALBERT architecture from google. * ``'tiny-albert'`` - ALBERT architecture from google with smaller parameters. * ``'xlnet'`` - XLNET architecture from google. * ``'alxlnet'`` - XLNET architecture from google + Malaya. Returns ------- result : SIAMESE_BERT class for BERT-family models, SIAMESE_XLNET class for XLNET-family models """ model = model.lower() if model not in _availability: raise Exception( 'model not supported, please check supported models from malaya.similarity.available_transformer_model()' ) check_file(PATH_SIMILARITY[model], S3_PATH_SIMILARITY[model], **kwargs) g = load_graph(PATH_SIMILARITY[model]['model']) path = PATH_SIMILARITY if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab']) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file=path[model]['vocab'], do_lower_case=False, spm_model_file=path[model]['tokenizer'], ) return SIAMESE_BERT( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g), tokenizer=tokenizer, label=['not similar', 'similar'], ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return SIAMESE_XLNET( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), sess=generate_session(graph=g), tokenizer=tokenizer, label=['not similar', 'similar'], )
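For context, loading one of these similarity models and scoring a sentence pair might look like the sketch below. The predict_proba call and its argument format are assumptions about the SIAMESE_* wrapper, which is not defined in this excerpt, and the sentences are made up:

# Load the ALBERT-based Bahasa similarity model defined above.
model = transformer(model='albert')

# Assumed wrapper API: score, pairwise, how similar the left strings are to the right strings.
scores = model.predict_proba(
    ['Husein suka makan ayam'],
    ['Husein gemar makan ayam'],
)
print(scores)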
def main(_): tf.logging.set_verbosity(tf.logging.INFO) layer_indexes = [int(x) for x in FLAGS.layers.split(",")] bert_config = modeling.AlbertConfig.from_json_file(FLAGS.bert_config_file) tokenizer = tokenization.FullTokenizer( spm_model_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( master=FLAGS.master, tpu_config=tf.contrib.tpu.TPUConfig( num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) # examples = read_examples(FLAGS.input_file) json_examples = [] for x in ['test', 'train', 'dev']: with open(os.path.join(FLAGS.input_file, x + '.english.jsonlines')) as f: json_examples.extend((json.loads(jsonline) for jsonline in f.readlines())) orig_examples = [] bert_examples = [] for i, json_e in enumerate(json_examples): e = process_example(json_e, i, should_filter_embedded_mentions=True) orig_examples.append(e) bert_examples.append(e.bertify(tokenizer)) model_fn = model_fn_builder( bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, layer_indexes=layer_indexes, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, predict_batch_size=FLAGS.batch_size) input_fn = input_fn_builder( examples=bert_examples, window_size=FLAGS.window_size, stride=FLAGS.stride, tokenizer=tokenizer) writer = h5py.File(FLAGS.output_file, 'w') with tqdm(total=sum(len(e.tokens) for e in orig_examples)) as t: for result in estimator.predict(input_fn, yield_single_examples=True): document_index = int(result["unique_ids"]) bert_example = bert_examples[document_index] orig_example = orig_examples[document_index] file_key = bert_example.doc_key.replace('/', ':') t.update(n=(result['extract_indices'] >= 0).sum()) for output_index, bert_token_index in enumerate(result['extract_indices']): if bert_token_index < 0: continue token_index = bert_example.bert_to_orig_map[bert_token_index] sentence_index, token_index = orig_example.unravel_token_index(token_index) dataset_key ="{}/{}".format(file_key, sentence_index) if dataset_key not in writer: writer.create_dataset(dataset_key, (len(orig_example.sentence_tokens[sentence_index]), bert_config.hidden_size, len(layer_indexes)), dtype=np.float32) dset = writer[dataset_key] for j, layer_index in enumerate(layer_indexes): layer_output = result["layer_output_%d" % j] dset[token_index, :, j] = layer_output[output_index] writer.close()
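The prediction loop above writes one HDF5 dataset per sentence, keyed by "<doc_key with '/' replaced by ':'>/<sentence index>" and shaped [num_tokens, hidden_size, num_layers]. Reading the extracted features back could look like the following sketch; the file name and the key are illustrative only, since the real doc_key comes from the input jsonlines:

import h5py

with h5py.File('bert_features.hdf5', 'r') as reader:
    key = 'bn:some_doc_0/0'  # illustrative: "<doc_key with / replaced by :>/<sentence index>"
    sentence_features = reader[key][...]  # ndarray of shape [num_tokens, hidden_size, num_layers]
    # Embedding of the first token taken from the last extracted layer.
    first_token_last_layer = sentence_features[0, :, -1]
    print(sentence_features.shape, first_token_last_layer.shape)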
def transformer( path, s3_path, class_name, label, model='bert', quantized=False, **kwargs, ): check_file(path[model], s3_path[model], quantized=quantized, **kwargs) if quantized: model_path = 'quantized' else: model_path = 'model' g = load_graph(path[model][model_path], **kwargs) if len(label) > 2 or class_name == 'relevancy': if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: selected_class = MULTICLASS_BERT selected_node = 'import/dense/BiasAdd:0' if model in ['xlnet', 'alxlnet']: selected_class = MULTICLASS_XLNET selected_node = 'import/transpose_3:0' else: if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: selected_class = BINARY_BERT selected_node = 'import/dense/BiasAdd:0' if model in ['xlnet', 'alxlnet']: selected_class = BINARY_XLNET selected_node = 'import/transpose_3:0' if model in ['albert', 'bert', 'tiny-albert', 'tiny-bert']: if model in ['bert', 'tiny-bert']: from malaya.transformers.bert import ( _extract_attention_weights_import, ) from malaya.transformers.bert import bert_num_layers tokenizer = sentencepiece_tokenizer_bert(path[model]['tokenizer'], path[model]['vocab']) if model in ['albert', 'tiny-albert']: from malaya.transformers.albert import ( _extract_attention_weights_import, ) from malaya.transformers.albert import bert_num_layers from albert import tokenization tokenizer = tokenization.FullTokenizer( vocab_file=path[model]['vocab'], do_lower_case=False, spm_model_file=path[model]['tokenizer'], ) return selected_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=None, input_masks=g.get_tensor_by_name('import/Placeholder_1:0'), logits=g.get_tensor_by_name('import/logits:0'), logits_seq=g.get_tensor_by_name('import/logits_seq:0'), vectorizer=g.get_tensor_by_name(selected_node), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=label, attns=_extract_attention_weights_import(bert_num_layers[model], g), class_name=class_name, ) if model in ['xlnet', 'alxlnet']: if model in ['xlnet']: from malaya.transformers.xlnet import ( _extract_attention_weights_import, ) if model in ['alxlnet']: from malaya.transformers.alxlnet import ( _extract_attention_weights_import, ) tokenizer = sentencepiece_tokenizer_xlnet(path[model]['tokenizer']) return selected_class( X=g.get_tensor_by_name('import/Placeholder:0'), segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'), input_masks=g.get_tensor_by_name('import/Placeholder_2:0'), logits=g.get_tensor_by_name('import/logits:0'), logits_seq=g.get_tensor_by_name('import/logits_seq:0'), vectorizer=g.get_tensor_by_name(selected_node), sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, label=label, attns=_extract_attention_weights_import(g), class_name=class_name, )
def trans_to_features(self, example): """ Convert the input into index representations. :param example: the input example :return: """ tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True) features = [] unique_id = 1000000000 query_tokens = tokenizer.tokenize(example['question']) # cap the query at a maximum length if len(query_tokens) > self.query_length: query_tokens = query_tokens[: self.query_length] # Build indices mainly for the context. Earlier, Chinese characters, punctuation, whitespace, digit runs and English words were split and stored in doc_tokens, # but BERT's tokenizer further splits digit runs, Chinese and English text into subwords, so the tokens obtained after BERT tokenization differ from the # doc_tokens obtained before; we therefore still need to map the start and end positions from their positions in doc_tokens to positions in the current tokens tok_to_orig_index = [] # stores, for each subword, the index of the original token; its length equals that of all_doc_tokens below orig_to_tok_index = [] # stores, for each original token, its indices among the subwords; the indices are not contiguous and may jump all_doc_tokens = [] # stores the subword tokens; in principle longer than all_tokens for (i, token) in enumerate(example['doc_tokens']): sub_tokens = tokenizer.tokenize(token) # orig_to_tok_index has the same length as doc_tokens; each entry stores the start and end index of that doc_tokens token inside all_doc_tokens, # used to map start and end positions from all_token into all_doc_tokens orig_to_tok_index.append([len(all_doc_tokens)]) for sub_token in sub_tokens: # tok_to_orig_index has the same length as all_doc_tokens and contains repeated values tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) orig_to_tok_index[-1].append(len(all_doc_tokens) - 1) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = self.max_length - len(query_tokens) - 3 doc_spans = [] _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) # With BERT the maximum sequence length is usually capped at 512, so a context longer than the maximum has to be split into several spans # using a sliding window; the window stride is smaller than the maximum length, so neighbouring spans share overlapping sub-spans. start_offset = 0 # start position of the span being extracted while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset # when the remaining length exceeds the limit, the sliding window is needed if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): # holds once length < max_len, i.e. this is the last span break start_offset += min(length, self.doc_stride) # combine the query and each context span into a single sequence fed into BERT for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} # Because spans share overlapping sub-spans, the same token can carry different importance in different spans; # importance is decided here by the amount of surrounding context, and at prediction time, for a token that appears in two spans, # only the score from the span where it is more important is taken as that token's score token_is_max_context = {} segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in query_tokens: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[ split_token_index] # map the index within the sentence pair built from the current span back to the original token index # With the sliding window a word may appear in two spans, but only one span can be counted in the end, so # a score based on the number of context words around the token is built and the span with the highest score is taken is_max_context = self._check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. 
while len(input_ids) < self.max_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == self.max_length assert len(input_mask) == self.max_length assert len(segment_ids) == self.max_length features.append({'unique_id': unique_id, 'doc_span_index': doc_span_index, 'tokens': tokens, 'token_to_orig_map': token_to_orig_map, 'token_is_max_context': token_is_max_context, 'input_ids': input_ids, 'input_mask': input_mask, 'segment_ids': segment_ids, 'start_position': -1, 'end_position': -1}) unique_id += 1 return features
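The sliding-window behaviour described in the comments of trans_to_features, spans of at most max_tokens_for_doc subwords advanced by doc_stride and overlapping until the end of the document, can be illustrated in isolation. This is a self-contained sketch with made-up numbers, not part of the class above:

import collections

_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    """Split a document of num_doc_tokens subwords into overlapping spans."""
    doc_spans = []
    start_offset = 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break  # the last span reaches the end of the document
        start_offset += min(length, doc_stride)
    return doc_spans

# Example: a 1,000-token context, at most 384 tokens per span, stride 128.
print(make_doc_spans(1000, 384, 128))
# [DocSpan(start=0, length=384), DocSpan(start=128, length=384), ...]

Tokens that fall into several overlapping spans are later disambiguated by the _check_is_max_context scoring referred to in the comments, so each token's prediction is taken from the span in which it has the most surrounding context.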