Example #1
def predict_squad(strategy, input_meta_data):
  """Makes predictions for the squad dataset."""
  bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  run_squad_helper.predict_squad(
      strategy, input_meta_data, tokenizer, bert_config, squad_lib_wp)
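The FLAGS referenced in these helpers are absl flags defined elsewhere in the script. A minimal sketch of the definitions they assume (flag names inferred from the calls above; defaults are assumptions):

from absl import flags

flags.DEFINE_string("bert_config_file", None,
                    "Path to the BERT configuration JSON file.")
flags.DEFINE_string("vocab_file", None,
                    "Path to the WordPiece vocabulary file.")
flags.DEFINE_bool("do_lower_case", True,
                  "Whether to lower case the input text.")

FLAGS = flags.FLAGS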
Example #2
    def __init__(self,
                 bert_config_file,
                 bert_init_ckpt,
                 bert_max_seq_length,
                 bert_vocab_file=None,
                 do_lower_case=None):
        """Constructor.

    Args:
      bert_config_file: (string) path to Bert configuration file.
      bert_init_ckpt: (string)  path to pretrained Bert checkpoint.
      bert_max_seq_length: (int) maximum input sequence length (#words) after
        WordPiece tokenization. Sequences longer than this will be truncated,
        and shorter than this will be padded.
      bert_vocab_file (optional): (string) path to Bert vocabulary file.
      do_lower_case (optional): (bool) whether to lower case the input text.
        This should be aligned with the `vocab_file`.
    """
        self._bert_config_file = bert_config_file
        self._bert_init_ckpt = bert_init_ckpt
        self._bert_max_seq_length = bert_max_seq_length

        self._tokenizer = None
        if bert_vocab_file is not None and do_lower_case is not None:
            self._tokenizer = tokenization.FullTokenizer(
                vocab_file=bert_vocab_file, do_lower_case=do_lower_case)
Example #3
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data."""
    assert FLAGS.input_data_dir and FLAGS.classification_task_name

    processors = {
        "cola": classifier_data_lib.ColaProcessor,
        "mnli": classifier_data_lib.MnliProcessor,
        "mrpc": classifier_data_lib.MrpcProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "sst-2": classifier_data_lib.SstProcessor,
        "xnli": classifier_data_lib.XnliProcessor,
    }
    task_name = FLAGS.classification_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    processor = processors[task_name](processor_text_fn)
    return classifier_data_lib.generate_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        train_data_output_path=FLAGS.train_data_output_path,
        eval_data_output_path=FLAGS.eval_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
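The helper above switches between WordPiece and SentencePiece tokenization via FLAGS.tokenizer_impl. A minimal sketch of the extra flag definitions this branch assumes (an assumption; the exact definitions live elsewhere in the script):

from absl import flags

flags.DEFINE_enum("tokenizer_impl", "word_piece",
                  ["word_piece", "sentence_piece"],
                  "Which tokenizer implementation to use.")
flags.DEFINE_string("sp_model_file", "",
                    "Path to the SentencePiece model file, used when "
                    "tokenizer_impl is sentence_piece.")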
Example #4
def main(_):
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.io.gfile.glob(input_pattern))

    logging.info("*** Reading from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng, FLAGS.do_whole_word_mask,
        FLAGS.max_ngram_size)

    output_files = FLAGS.output_file.split(",")
    logging.info("*** Writing to output files ***")
    for output_file in output_files:
        logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq,
                                    output_files, FLAGS.gzip_compress)
Example #5
 def test_single_cell(self, cell, text, expected=None):
   with tempfile.TemporaryDirectory() as temp_dir:
     vocab_file = os.path.join(temp_dir, "vocab.txt")
     self._get_vocab_file(
         vocab_file,
         [
             "a",
             "b",
             "bb",
             "##b",
             "3",
             ".",
             "5",
             "insti",
             "##tuto",
             "reacao",
             "##d",
         ],
     )
     detokenizer = e2e_eval_utils.DeTokenizer(vocab_file)
     tokenizer = tokenization.FullTokenizer(
         vocab_file,
         do_lower_case=True,
         split_on_punc=True,
     )
   table = interaction_pb2.Table()
   table.rows.add().cells.add().text = cell
   token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
   actual = detokenizer.detokenize(
       table,
       token_ids,
   )
   if expected is None:
     expected = text
   self.assertEqual(actual, expected)
Example #6
 def __init__(self, max_sequence_length: int, **kwargs):
     super().__init__(**kwargs)
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=constants.BERT_VOCAB_PATH,
         do_lower_case=True,
     )
     self.max_sequence_length = max_sequence_length
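A preprocessing component like this typically turns raw text into fixed-length BERT inputs. The helper below is a minimal sketch (hypothetical, not part of the original class) that uses only the tokenizer methods appearing elsewhere in these examples:

def encode_text(tokenizer, text, max_sequence_length):
    """Tokenizes `text` and returns padded input_ids, input_mask, segment_ids."""
    # WordPiece-tokenize and reserve room for the [CLS]/[SEP] markers.
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_sequence_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    padding = [0] * (max_sequence_length - len(input_ids))
    # Single-segment input, so all segment ids are zero.
    return input_ids + padding, input_mask + padding, [0] * max_sequence_length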
Example #7
def generate_regression_dataset():
    """Generates regression dataset and returns input meta data."""
    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    if FLAGS.tfds_params:
        processor = classifier_data_lib.TfdsProcessor(
            tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            None,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
    else:
        raise ValueError(
            "No data processor found for the given regression task.")
Example #8
def generate_retrieval_dataset():
    """Generate retrieval test and dev dataset and returns input meta data."""
    assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenizer_impl == "sentence_piece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    processors = {
        "bucc": sentence_retrieval_lib.BuccProcessor,
        "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
    }

    task_name = FLAGS.retrieval_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](process_text_fn=processor_text_fn)

    return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
        processor, FLAGS.input_data_dir, tokenizer,
        FLAGS.eval_data_output_path, FLAGS.test_data_output_path,
        FLAGS.max_seq_length)
Example #9
def get_bert_tokenizer(bert_layer):

    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    return tokenizer
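A hedged usage sketch of the helper above; the TF Hub handle is an assumption (any BERT module that exposes vocab_file and do_lower_case assets would work):

import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=False)
tokenizer = get_bert_tokenizer(bert_layer)
print(tokenizer.tokenize("hello world"))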
Example #10
def generate_tagging_dataset():
    """Generates tagging dataset."""
    processors = {
        "panx": tagging_data_lib.PanxProcessor,
        "udpos": tagging_data_lib.UdposProcessor,
    }
    task_name = FLAGS.tagging_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    if FLAGS.tokenizer_impl == "word_piece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    elif FLAGS.tokenizer_impl == "sentence_piece":
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)
    else:
        raise ValueError("Unsupported tokenizer_impl: %s" %
                         FLAGS.tokenizer_impl)

    processor = processors[task_name]()
    return tagging_data_lib.generate_tf_record_from_data_file(
        processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
        FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
        FLAGS.test_data_output_path, processor_text_fn)
Example #11
def eval_squad(strategy, input_meta_data):
  """Evaluate on the squad dataset."""
  bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  eval_metrics = run_squad_helper.eval_squad(
      strategy, input_meta_data, tokenizer, bert_config, squad_lib_wp)
  return eval_metrics
Example #12
 def build(self):
   """Builds the class. Used for lazy initialization."""
   if self.is_built:
     return
   self.vocab_file = os.path.join(
       registry.resolver(self.uri), 'assets', 'vocab.txt')
   self.tokenizer = tokenization.FullTokenizer(self.vocab_file,
                                               self.do_lower_case)
Example #13
 def __init__(self, vocab_file, do_lower_case=True, split_on_punc=True):
   self._whitespace_tokenizer = tokenization.BasicTokenizer(
       do_lower_case=False, split_on_punc=False)
   self._punctuation_tokenizer = tokenization.BasicTokenizer(
       do_lower_case=False, split_on_punc=split_on_punc)
   self._full_tokenizer = tokenization.FullTokenizer(
       vocab_file, do_lower_case=do_lower_case, split_on_punc=split_on_punc)
   self._vocab = list(self._full_tokenizer.vocab.keys())
Example #14
    def _preprocess_eval_data(self, params):
        eval_examples = self.squad_lib.read_squad_examples(
            input_file=params.input_path,
            is_training=False,
            version_2_with_negative=params.version_2_with_negative)

        temp_file_path = params.input_preprocessed_data_path or self.logging_dir
        if not temp_file_path:
            raise ValueError(
                'You must specify a temporary directory, either in '
                'params.input_preprocessed_data_path or logging_dir to '
                'store intermediate evaluation TFRecord data.')
        eval_writer = self.squad_lib.FeatureWriter(
            filename=os.path.join(temp_file_path, 'eval.tf_record'),
            is_training=False)
        eval_features = []

        def _append_feature(feature, is_padding):
            if not is_padding:
                eval_features.append(feature)
            eval_writer.process_feature(feature)

        # XLNet preprocesses SQuAD examples in a P, Q, class order whereas
        # BERT preprocesses in a class, Q, P order.
        xlnet_ordering = self.task_config.model.encoder.type == 'xlnet'
        kwargs = dict(examples=eval_examples,
                      max_seq_length=params.seq_length,
                      doc_stride=params.doc_stride,
                      max_query_length=params.query_length,
                      is_training=False,
                      output_fn=_append_feature,
                      batch_size=params.global_batch_size,
                      xlnet_format=xlnet_ordering)

        if params.tokenization == 'SentencePiece':
            # squad_lib_sp requires one more argument 'do_lower_case'.
            kwargs['do_lower_case'] = params.do_lower_case
            kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
                sp_model_file=params.vocab_file)
        elif params.tokenization == 'WordPiece':
            kwargs['tokenizer'] = tokenization.FullTokenizer(
                vocab_file=params.vocab_file,
                do_lower_case=params.do_lower_case)
        else:
            raise ValueError('Unexpected tokenization: %s' %
                             params.tokenization)

        eval_dataset_size = self.squad_lib.convert_examples_to_features(
            **kwargs)
        eval_writer.close()

        logging.info('***** Evaluation input stats *****')
        logging.info('  Num orig examples = %d', len(eval_examples))
        logging.info('  Num split examples = %d', len(eval_features))
        logging.info('  Batch size = %d', params.global_batch_size)
        logging.info('  Dataset size = %d', eval_dataset_size)

        return eval_writer.filename, eval_examples, eval_features
Example #15
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
    """Generates and saves training data into a tf record file.

    Arguments:
      processor: Input processor object to be used for generating data. Subclass
        of `DataProcessor`.
      data_dir: Directory that contains train/eval data to process. Data files
        should be named "dev.tsv", "test.tsv", or "train.tsv".
      vocab_file: Text file with words to be used for training/evaluation.
      train_data_output_path: Output to which processed tf record for training
        will be saved.
      eval_data_output_path: Output to which processed tf record for evaluation
        will be saved.
      max_seq_length: Maximum sequence length of the training/eval data to be
        generated.
      do_lower_case: Whether to lower case input text.

    Returns:
      A dictionary containing input meta data.
    """
    assert train_data_output_path or eval_data_output_path

    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    assert train_data_output_path
    train_input_data_examples = processor.get_train_examples(data_dir)
    file_based_convert_examples_to_features(train_input_data_examples,
                                            label_list, max_seq_length,
                                            tokenizer, train_data_output_path)
    num_training_data = len(train_input_data_examples)

    if eval_data_output_path:
        eval_input_data_examples = processor.get_dev_examples(data_dir)
        file_based_convert_examples_to_features(eval_input_data_examples,
                                                label_list, max_seq_length,
                                                tokenizer,
                                                eval_data_output_path)

    meta_data = {
        "task_type": "bert_classification",
        "processor_type": processor.get_processor_name(),
        "num_labels": len(processor.get_labels()),
        "train_data_size": num_training_data,
        "max_seq_length": max_seq_length,
    }

    if eval_data_output_path:
        meta_data["eval_data_size"] = len(eval_input_data_examples)

    return meta_data
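A hedged usage sketch of this function for the MRPC task; the paths are placeholders, and the processor construction mirrors the classifier dataset example earlier on this page:

processor = classifier_data_lib.MrpcProcessor(tokenization.convert_to_unicode)
meta_data = generate_tf_record_from_data_file(
    processor,
    data_dir="glue_data/MRPC",
    vocab_file="uncased_L-12_H-768_A-12/vocab.txt",
    train_data_output_path="mrpc_train.tf_record",
    eval_data_output_path="mrpc_eval.tf_record",
    max_seq_length=128,
    do_lower_case=True)
print(meta_data["train_data_size"])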
Example #16
 def create_tokenizer(self):
     """
     Create tokenizer
     :return: None
     """
     vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
     do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
     self.tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file, do_lower_case=do_lower_case)
Example #17
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert20/'

    bert_config = bert_configs.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    tokenizer = tokenization.FullTokenizer(FLAGS.vocab_file,
                                           do_lower_case=True)
    if FLAGS.mode == 'export_only':
        raise NotImplementedError()

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus)

    eval_dataset = NERDataset(tokenizer, FLAGS.eval_data_path, FLAGS.mode,
                              FLAGS.label_file, FLAGS.max_seq_length)
    eval_input_fn = get_dataset_fn(eval_dataset,
                                   FLAGS.eval_batch_size,
                                   is_training=False,
                                   pad_value=_PADDING_LABEL_ID)

    if FLAGS.mode != 'train_and_eval':
        raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
    train_dataset = NERDataset(tokenizer, FLAGS.train_data_path, FLAGS.mode,
                               FLAGS.label_file, FLAGS.max_seq_length)
    train_input_fn = get_dataset_fn(train_dataset,
                                    FLAGS.train_batch_size,
                                    is_training=True,
                                    pad_value=_PADDING_LABEL_ID)

    input_meta_data = {
        "max_seq_length": FLAGS.max_seq_length,
        "num_labels": train_dataset.label_num,
        "train_data_size": (FLAGS.train_data_size
                            if FLAGS.train_data_size else train_dataset.data_size),
        "eval_data_size": (FLAGS.eval_data_size
                           if FLAGS.eval_data_size else eval_dataset.data_size),
        "id2label": train_dataset.id2label_map,
    }

    run_bert(
        strategy,
        input_meta_data,
        bert_config,
        train_input_fn,
        eval_input_fn,
    )
Example #18
  def __init__(self, vocab_dir):
    self.seq_len = 384
    self.predict_batch_size = 8
    self.query_len = 64
    self.doc_stride = 128

    vocab_file = os.path.join(vocab_dir, 'vocab.txt')
    vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'good', 'bad']
    with open(vocab_file, 'w') as f:
      f.write('\n'.join(vocab))
    self.tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
Example #19
def main(args):
    tknzr = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                       do_lower_case=True)
    print('Input:')
    print(args.input)
    print('\nTokenized:')
    tokenized = tknzr.tokenize(args.input)
    print(tokenized)
    print('\nTokenized and converted to IDs:')
    ids = tknzr.convert_tokens_to_ids(tokenized)
    print(ids)
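A minimal argparse sketch (an assumption; the original script's argument parsing is not shown) of how main(args) above could be wired up:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", required=True,
                        help="Path to the WordPiece vocabulary file.")
    parser.add_argument("--input", required=True,
                        help="Text to tokenize.")
    main(parser.parse_args())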
Example #20
def main(args):
    vocab_file = os.path.join('..', 'vocabs', PRETRAINED_MODELS[args.model_class]['vocab_file'])
    tknzr = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    print('Input:')
    print(args.input)
    print('\nTokenized:')
    tokenized = tknzr.tokenize(args.input)
    print(tokenized)
    print('\nTokenized and converted to IDs:')
    ids = tknzr.convert_tokens_to_ids(tokenized)
    print(ids)
Example #21
def load_bert_model():
    label_list = [0, 1]  # Label categories
    max_seq_length = 60  # maximum length of (token) input sequences

    model_path = "C:/Users/USER-PC/Downloads/bert_en_uncased_L-12_H-768_A-12_2"
    bert_layer = hub.KerasLayer(model_path, trainable=True)

    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    return label_list, max_seq_length, tokenizer
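A hedged usage sketch of the loader above (the model path hard-coded inside load_bert_model is machine-specific and left untouched):

label_list, max_seq_length, tokenizer = load_bert_model()
tokens = tokenizer.tokenize("good movie")
print(tokenizer.convert_tokens_to_ids(tokens))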
Example #22
 def __init__(
     self,
     dstc8_data_dir,
     collection,
     vocab_file="./models/uncased_L-12_H-768_A-12/vocab.txt",
     do_lower_case=True,
     max_seq_length=DEFAULT_MAX_SEQ_LENGTH,
 ):
     self.dstc8_data_dir = dstc8_data_dir
     self._file_ranges = FILE_RANGES[collection]
     # BERT tokenizer
     self._tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file, do_lower_case=do_lower_case)
     self._max_seq_length = max_seq_length
Example #23
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data."""
    assert FLAGS.input_data_dir

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
    processor = classifier_data_lib.WeiboProcessor(processor_text_fn)
    return classifier_data_lib.generate_predict_tf_record_from_data_file(
        processor,
        FLAGS.input_data_dir,
        tokenizer,
        predict_data_output_path=FLAGS.predict_data_output_path,
        max_seq_length=FLAGS.max_seq_length)
Example #24
    def load_pre_trained_bert_tf_hub_from_url(self, url=TF_HUB_URL, trainable_flag=True):
        """Loads the TF Hub pre-trained BERT model from URL.

        Args:
            url (str[optional]): URL string of the TF Hub link (default url is stored in TF_HUB_URL)
            trainable_flag (bool[optional]): True to also train the BERT layer weights (highly suggested), False otherwise. Default is True
        """
        self.pre_trained_bert_layer = hub.KerasLayer(url, trainable=trainable_flag)
        self.pre_trained_bert_layer._name = "bert_layer"
        self.info_dict.update([("trained_bert_weights", trainable_flag)])

        self.vocab_file = self.pre_trained_bert_layer.resolved_object.vocab_file.asset_path.numpy()
        self.do_lower_case = self.pre_trained_bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = tokenization.FullTokenizer(self.vocab_file, self.do_lower_case)
        return
Example #25
    def test_generate_tf_record(self, task_type):
        processor = self.processors[task_type]()
        input_data_dir = os.path.join(self.get_temp_dir(), task_type)
        tf.io.gfile.mkdir(input_data_dir)
        # Write fake train file.
        _create_fake_file(os.path.join(input_data_dir, "train-en.tsv"),
                          processor.get_labels(),
                          is_test=False)

        # Write fake dev file.
        _create_fake_file(os.path.join(input_data_dir, "dev-en.tsv"),
                          processor.get_labels(),
                          is_test=False)

        # Write fake test files.
        for lang in processor.supported_languages:
            _create_fake_file(os.path.join(input_data_dir,
                                           "test-%s.tsv" % lang),
                              processor.get_labels(),
                              is_test=True)

        output_path = os.path.join(self.get_temp_dir(), task_type, "output")
        tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_file,
                                               do_lower_case=True)
        metadata = tagging_data_lib.generate_tf_record_from_data_file(
            processor,
            input_data_dir,
            tokenizer,
            max_seq_length=8,
            train_data_output_path=os.path.join(output_path, "train.tfrecord"),
            eval_data_output_path=os.path.join(output_path, "eval.tfrecord"),
            test_data_output_path=os.path.join(output_path,
                                               "test_{}.tfrecord"),
            text_preprocessing=tokenization.convert_to_unicode)

        self.assertEqual(metadata["train_data_size"], 5)
        files = tf.io.gfile.glob(output_path + "/*")
        expected_files = []
        expected_files.append(os.path.join(output_path, "train.tfrecord"))
        expected_files.append(os.path.join(output_path, "eval.tfrecord"))
        for lang in processor.supported_languages:
            expected_files.append(
                os.path.join(output_path, "test_%s.tfrecord" % lang))

        self.assertCountEqual(files, expected_files)
Example #26
    def setUp(self):
        super(BertClassifierLibTest, self).setUp()
        self.model_dir = self.get_temp_dir()
        self.processors = {
            "CB": classifier_data_lib.CBProcessor,
            "SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
            "BOOLQ": classifier_data_lib.BoolQProcessor,
        }

        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing", ","
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            vocab_writer.write(
                "".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
        vocab_file = vocab_writer.name
        self.tokenizer = tokenization.FullTokenizer(vocab_file)
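With the small vocabulary above, FullTokenizer splits words it has not seen into the listed wordpieces. A rough illustration of the expected behaviour (the exact output is an assumption based on standard greedy WordPiece matching):

tokens = self.tokenizer.tokenize("unwanted,running")
# expected (assumption): ["un", "##want", "##ed", ",", "runn", "##ing"]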
Example #27
    def load_pre_trained_bert_tf_hub_from_dir(self, bert_tf_hub_dir, trainable_flag=True):
        """Loads the TF Hub pre-trained BERT model from local disk.

        The TF Hub module can be downloaded on local disk with the following command:
            !wget "https://storage.googleapis.com/tfhub-modules/tensorflow/bert_en_uncased_L-12_H-768_A-12/bert_en_uncased_L-12_H-768_A-12_2.tar.gz"
            !tar -xvf  '/bert_en_uncased_L-12_H-768_A-12_2.tar.gz' -C 'saved_models/pre_trained/bert_en_uncased_L-12_H-768_A-12_2'

        Args:
            bert_tf_hub_dir (str): PATH string of the TF Hub directory
            trainable_flag (bool[optional]): True to also train the BERT layer weights (highly suggested), False otherwise. Default is True
        """
        self.pre_trained_bert_layer = hub.KerasLayer(bert_tf_hub_dir, trainable=trainable_flag)
        self.pre_trained_bert_layer._name = "bert_layer"
        self.info_dict.update([("trained_bert_weights", trainable_flag)])

        self.vocab_file = self.pre_trained_bert_layer.resolved_object.vocab_file.asset_path.numpy()
        self.do_lower_case = self.pre_trained_bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = tokenization.FullTokenizer(self.vocab_file, self.do_lower_case)
        return
Example #28
    def _preprocess_eval_data(self, params):
        eval_examples = self.squad_lib.read_squad_examples(
            input_file=params.input_path,
            is_training=False,
            version_2_with_negative=params.version_2_with_negative)

        temp_file_path = params.input_preprocessed_data_path or '/tmp'
        eval_writer = self.squad_lib.FeatureWriter(
            filename=os.path.join(temp_file_path, 'eval.tf_record'),
            is_training=False)
        eval_features = []

        def _append_feature(feature, is_padding):
            if not is_padding:
                eval_features.append(feature)
            eval_writer.process_feature(feature)

        kwargs = dict(examples=eval_examples,
                      tokenizer=tokenization.FullTokenizer(
                          vocab_file=params.vocab_file,
                          do_lower_case=params.do_lower_case),
                      max_seq_length=params.seq_length,
                      doc_stride=params.doc_stride,
                      max_query_length=params.query_length,
                      is_training=False,
                      output_fn=_append_feature,
                      batch_size=params.global_batch_size)
        if params.tokenization == 'SentencePiece':
            # squad_lib_sp requires one more argument 'do_lower_case'.
            kwargs['do_lower_case'] = params.do_lower_case

        eval_dataset_size = self.squad_lib.convert_examples_to_features(
            **kwargs)
        eval_writer.close()

        logging.info('***** Evaluation input stats *****')
        logging.info('  Num orig examples = %d', len(eval_examples))
        logging.info('  Num split examples = %d', len(eval_features))
        logging.info('  Batch size = %d', params.global_batch_size)
        logging.info('  Dataset size = %d', eval_dataset_size)

        return eval_writer.filename, eval_examples, eval_features
Example #29
 def __init__(self,
              dstc8_data_dir,
              train_file_range,
              dev_file_range,
              test_file_range,
              vocab_file,
              do_lower_case,
              max_seq_length=DEFAULT_MAX_SEQ_LENGTH,
              log_data_warnings=False):
   self.dstc8_data_dir = dstc8_data_dir
   self._log_data_warnings = log_data_warnings
   self._file_ranges = {
       "train": train_file_range,
       "dev": dev_file_range,
       "test": test_file_range,
   }
   # BERT tokenizer
   self._tokenizer = tokenization.FullTokenizer(
       vocab_file=vocab_file, do_lower_case=do_lower_case)
   self._max_seq_length = max_seq_length
Example #30
def load_model(model_directory):
    """Loads the fine-tuned model from directory.

    Args:
        model_directory (str): PATH string of the fine-tuned model's directory on local disk
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join(model_directory, "assets", "vocab.txt"), do_lower_case=True)

    print(type(tokenizer))

    model = keras.models.load_model(model_directory)

    max_seq_length = 256

    extractor = tf.keras.Model(inputs=model.inputs,
                               outputs=[model.get_layer("bert_layer").output])

    model.summary()

    return model, tokenizer, max_seq_length, extractor
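A hedged usage sketch (the directory path and input text are placeholders):

model, tokenizer, max_seq_length, extractor = load_model("saved_models/fine_tuned_bert")
tokens = tokenizer.tokenize("This movie was great!")
print(tokenizer.convert_tokens_to_ids(tokens))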