Code example #1
# Imports assumed by this excerpt; the learner path follows the
# tensorflow_text wordpiece_vocab tools layout.
from absl import flags

from tensorflow_text.tools.wordpiece_vocab import wordpiece_tokenizer_learner_lib as learner

FLAGS = flags.FLAGS


def main(_):
  # Read in wordcount file.
  with open(FLAGS.input_path) as wordcount_file:
    word_counts = [(line.split()[0], int(line.split()[1]))
                   for line in wordcount_file]

  # Add in padding tokens.
  reserved_tokens = FLAGS.reserved_tokens
  if FLAGS.num_pad_tokens:
    padded_tokens = ['<pad>']
    padded_tokens += ['<pad%d>' % i for i in range(1, FLAGS.num_pad_tokens)]
    reserved_tokens = padded_tokens + reserved_tokens

  params = learner.Params(FLAGS.upper_thresh, FLAGS.lower_thresh,
                          FLAGS.num_iterations, FLAGS.max_input_tokens,
                          FLAGS.max_token_length, FLAGS.max_unique_chars,
                          FLAGS.vocab_size, FLAGS.slack_ratio,
                          FLAGS.include_joiner_token, FLAGS.joiner,
                          reserved_tokens)

  vocab = learner.learn(word_counts, params)
  vocab = ''.join([line + '\n' for line in vocab])

  # Write vocab to file.
  with open(FLAGS.output_path, 'w') as vocab_file:
    vocab_file.write(vocab)
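
For this script to run, the flags read in main must be defined at module level and main must be handed to absl's app runner. A minimal sketch of that scaffolding, reusing the flags import above; the flag names come from the FLAGS accesses in the snippet, while the types, defaults, and help strings are illustrative guesses:

from absl import app

flags.DEFINE_string('input_path', None, 'Word-count file: one "word count" pair per line.')
flags.DEFINE_string('output_path', None, 'Where to write the learned vocabulary.')
flags.DEFINE_integer('vocab_size', 8000, 'Target (maximum) vocabulary size.')
flags.DEFINE_list('reserved_tokens', [], 'Tokens that must appear in the vocabulary.')
flags.DEFINE_integer('num_pad_tokens', 100, 'Number of <pad> tokens to prepend.')
# The remaining flags (upper_thresh, lower_thresh, num_iterations, max_input_tokens,
# max_token_length, max_unique_chars, slack_ratio, include_joiner_token, joiner)
# would be defined analogously with their appropriate types.

if __name__ == '__main__':
  app.run(main)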
Code example #2
from typing import List

# Module-level imports assumed by this excerpt; the paths follow the
# tensorflow_text source layout.
from tensorflow_text.python.ops import bert_tokenizer
from tensorflow_text.tools.wordpiece_vocab import wordpiece_tokenizer_learner_lib as learner


def bert_vocab_from_dataset(dataset,
                            vocab_size: int,
                            reserved_tokens: List[str],
                            bert_tokenizer_params=None,
                            learn_params=None) -> List[str]:
  """Generate a Bert wordpiece vocabulary from a `tf.data.Dataset` of texts.

  ```
  import tensorflow_text as text

  vocab = bert_vocab_from_dataset(dataset, vocab_size, reserved_tokens,
                                  bert_tokenizer_params, learn_params)
  bert_tokenizer = text.BertTokenizer(vocab, **bert_tokenizer_params)
  token_ids = bert_tokenizer.tokenize(text)
  ```

  This uses Bert's splitting algorithm to split the text into words before
  generating the subword vocabulary from the resulting words.

  The resulting vocabulary _can_ be used directly with a
  `text.WordpieceTokenizer`, but note that the vocabulary will be sub-optimal or
  **broken** if you split the text into words a different way.

  ```
  wordpiece_tokenizer = text.WordpieceTokenizer(vocab, ...)
  words = split(text)
  token_ids = wordpiece_tokenizer.tokenize(words)
  ```

  Args:
    dataset: A `tf.data.Dataset` containing string-tensor elements.
    vocab_size: The target vocabulary size. This is the maximum size.
    reserved_tokens: A list of tokens that must be included in the vocabulary.
    bert_tokenizer_params: The `text.BertTokenizer` arguments relevant to
      vocabulary generation:
      * `lower_case`
      * `keep_whitespace`
      * `normalization_form`
      * `preserve_unused_token`

      See `BertTokenizer` for details. You should use the same values for
      these to both generate the vocabulary and build the `BertTokenizer`.
    learn_params: A dict of additional keyword arguments for the vocabulary
      learning function. See `wordpiece_tokenizer_learner_lib.learn` for
      details.

  Returns:
    A list of strings containing the vocabulary.

  Raises:
    TypeError: If the dataset contains structured elements instead of single
      tensors.
  """
  if bert_tokenizer_params is None:
    bert_tokenizer_params = {}

  if learn_params is None:
    learn_params = {}

  element_spec = dataset.element_spec

  try:
    element_spec.shape
  except AttributeError:
    raise TypeError("The dataset should contain single-tensor elements.")

  tokenizer = bert_tokenizer.BasicTokenizer(**bert_tokenizer_params)
  words_dataset = dataset.map(tokenizer.tokenize)
  word_counts = learner.count_words(words_dataset)

  vocab = learner.learn(word_counts, vocab_size, reserved_tokens,
                        **learn_params)

  return vocab
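
A small end-to-end usage sketch for the function above; the import path follows the tensorflow_text tools layout, and the toy corpus, vocabulary size, and reserved tokens are illustrative only:

import tensorflow as tf
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

# A toy corpus; in practice this would be a large dataset of sentences.
dataset = tf.data.Dataset.from_tensor_slices(
    ['the quick brown fox', 'jumps over the lazy dog'])

bert_tokenizer_params = dict(lower_case=True)
vocab = bert_vocab.bert_vocab_from_dataset(
    dataset.batch(1000).prefetch(2),
    vocab_size=8000,
    reserved_tokens=['[PAD]', '[UNK]', '[START]', '[END]'],
    bert_tokenizer_params=bert_tokenizer_params)

# Tokenize with the same BertTokenizer parameters. BertTokenizer expects a vocab
# file (or lookup table) rather than a Python list, so write the vocab out first.
with open('vocab.txt', 'w') as f:
  f.write('\n'.join(vocab))
bert_tokenizer = text.BertTokenizer('vocab.txt', **bert_tokenizer_params)
token_ids = bert_tokenizer.tokenize(['the quick brown fox'])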
Code example #3
File: utils.py  Project: samaritanhu/text
def process(self, wordcounts):
  return learner.learn(wordcounts, self._params)
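
This excerpt assumes an enclosing class that stores the learner parameters in self._params. A minimal sketch of one plausible wrapper, written as an Apache Beam DoFn so the learner can run at the end of a word-count pipeline; the class name, constructor, and Beam framing are assumptions, not taken from the original file:

import apache_beam as beam

from tensorflow_text.tools.wordpiece_vocab import wordpiece_tokenizer_learner_lib as learner


class LearnVocab(beam.DoFn):
  """Runs the wordpiece learner over aggregated (word, count) pairs."""

  def __init__(self, params):
    self._params = params

  def process(self, wordcounts):
    # The learned vocabulary (a list of subword strings) becomes the DoFn's output.
    return learner.learn(wordcounts, self._params)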