def __init__(self,
             vocab_lookup_table,
             suffix_indicator="##",
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token="[UNK]",
             split_unknown_characters=False,
             lower_case=False,
             keep_whitespace=False,
             normalization_form=None,
             preserve_unused_token=False):
  """Initializes the BertTokenizer.

  Builds a `BasicTokenizer` (pre-tokenization) and a `WordpieceTokenizer`
  (subword splitting) from the given options. If `vocab_lookup_table` is a
  string or a string `Tensor`, it is treated as a path to a vocab file and
  wrapped in a `StaticVocabularyTableV1` with a single OOV bucket.
  """
  super(BertTokenizer, self).__init__()
  # Usage-tracking counter for this op (analytics only).
  _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)
  # A file path (str or Tensor) is materialized into a lookup table here.
  if isinstance(vocab_lookup_table, (str, ops.Tensor)):
    table_init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        table_init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                         normalization_form,
                                         preserve_unused_token)
  self._wordpiece_tokenizer = WordpieceTokenizer(
      vocab_lookup_table, suffix_indicator, max_bytes_per_word,
      max_chars_per_token, token_out_type, unknown_token,
      split_unknown_characters)
def __init__(self,
             vocab_lookup_table,
             suffix_indicator="##",
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token="[UNK]",
             split_unknown_characters=False,
             lower_case=False,
             keep_whitespace=False,
             normalization_form=None,
             preserve_unused_token=False):
  """Initializes the BertTokenizer.

  Builds a `BasicTokenizer` and a `WordpieceTokenizer` from the given
  options. If `vocab_lookup_table` is a string or a `Tensor`, it is treated
  as a path to a vocab file and wrapped in a `StaticVocabularyTableV1` with
  one OOV bucket.

  Raises:
    RuntimeError: If `lower_case` is a `Tensor` and there is no default
      TF1 session available to evaluate it.
  """
  if isinstance(vocab_lookup_table, (str, ops.Tensor)):
    init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  # BasicTokenizer needs a concrete Python bool; a Tensor-valued lower_case
  # is evaluated eagerly via the default TF1 session.
  # (Fixed: removed stray debug print() calls, and replaced the unguarded
  # get_default_session().run(...) — which raised an opaque AttributeError
  # when no session was active — with an explicit error.)
  if isinstance(lower_case, ops.Tensor):
    session = tf.compat.v1.get_default_session()
    if session is None:
      raise RuntimeError(
          "lower_case was passed as a Tensor, but no default session is "
          "available to evaluate it; pass a Python bool or run inside a "
          "tf.compat.v1.Session.")
    lower_case = session.run(lower_case)
  self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                         normalization_form,
                                         preserve_unused_token)
  self._wordpiece_tokenizer = WordpieceTokenizer(
      vocab_lookup_table, suffix_indicator, max_bytes_per_word,
      max_chars_per_token, token_out_type, unknown_token,
      split_unknown_characters)
def __init__(self,
             vocab_lookup_table,
             suffix_indicator='##',
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token='[UNK]',
             split_unknown_characters=False):
  """Initializes the WordpieceTokenizer.

  Args:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file
      path to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of
      decoding long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` or `tf.int32` IDs, or `tf.string` subwords. The default is
      `tf.int64`.
    unknown_token: (optional) The string value to substitute for an unknown
      token. Default is "[UNK]". If set to `None`, no substitution occurs.
      If `token_out_type` is `tf.int32`/`tf.int64`, the `vocab_lookup_table`
      is used (after substitution) to convert the unknown token to an
      integer.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.

  Raises:
    TypeError: If `vocab_lookup_table` cannot be resolved to a
      `LookupInterface`.
  """
  super(WordpieceTokenizer, self).__init__()
  # Usage-tracking counter for this op (analytics only).
  _tf_text_wordpiece_tokenizer_op_create_counter.get_cell().increase_by(1)
  # A file path — given as a Python string or a string Tensor — is turned
  # into a static vocabulary table with a single OOV bucket.
  is_path = isinstance(vocab_lookup_table, str) or (
      isinstance(vocab_lookup_table, ops.Tensor) and
      vocab_lookup_table.dtype == dtypes.string)
  if is_path:
    table_init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        table_init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  if not isinstance(vocab_lookup_table, lookup_ops.LookupInterface):
    raise TypeError('Unable to build a lookup table from {}'.format(
        vocab_lookup_table))
  self._vocab_lookup_table = vocab_lookup_table
  self._suffix_indicator = suffix_indicator
  self._max_bytes_per_word = max_bytes_per_word
  # The kernel uses 0 to mean "no limit".
  if max_chars_per_token is None:
    self._max_chars_per_token = 0
  else:
    self._max_chars_per_token = max_chars_per_token
  self._token_out_type = token_out_type
  # A falsy unknown_token disables substitution but still records a
  # placeholder string.
  self._unknown_token = unknown_token if unknown_token else '[UNK]'
  self._use_unknown_token = bool(unknown_token)
  self._split_unknown_characters = split_unknown_characters
def _make_model_with_tables(self):
  """Builds a trackable root object holding two hash tables and lookups.

  Returns an `AutoTrackable` with `table1`/`lookup1` (in-memory key/value
  table over "brain"/"salad"/"surgery") and `table2`/`lookup2` (table
  initialized from a generated asset file), where each lookup is a
  `tf.function` with a string-tensor input signature.
  """
  missing_value = -1
  table1 = lookup_ops.HashTable(
      lookup_ops.KeyValueTensorInitializer(
          constant_op.constant(["brain", "salad", "surgery"]),
          constant_op.constant([0, 1, 2], dtypes.int64)),
      missing_value)
  asset_path = self._make_asset("test\nfoo\nbrain\n")
  table2 = lookup_ops.HashTable(
      lookup_ops.TextFileIdTableInitializer(asset_path), missing_value)

  def _as_traced_lookup(table):
    # Wrap the lookup in a tf.function with a fixed string signature; the
    # lambda keeps `table` captured at trace time.
    spec = [tensor_spec.TensorSpec(None, dtypes.string)]
    return def_function.function(input_signature=spec)(
        lambda x: table.lookup(x))  # pylint: disable=unnecessary-lambda

  root = tracking.AutoTrackable()
  root.table1 = table1
  root.lookup1 = _as_traced_lookup(table1)
  root.table2 = table2
  root.lookup2 = _as_traced_lookup(table2)
  return root
def __init__(self,
             vocab_lookup_table,
             suffix_indicator="##",
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token="[UNK]",
             split_unknown_characters=False,
             lower_case=False,
             keep_whitespace=False,
             normalization_form=None):
  """Initializes the BertTokenizer.

  Builds a `BasicTokenizer` and a `WordpieceTokenizer` from the given
  options. A string `vocab_lookup_table` is treated as a path to a vocab
  file and wrapped in a `StaticVocabularyTableV1` with one OOV bucket.
  """
  if isinstance(vocab_lookup_table, str):
    table_init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        table_init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                         normalization_form)
  self._wordpiece_tokenizer = WordpieceTokenizer(
      vocab_lookup_table, suffix_indicator, max_bytes_per_word,
      max_chars_per_token, token_out_type, unknown_token,
      split_unknown_characters)
def _create_table(self, vocab, num_oov=100):
  """Builds a static vocabulary table from a vocab file path.

  Args:
    vocab: Path to the vocabulary file used to initialize the table.
    num_oov: Number of out-of-vocabulary buckets. Defaults to 100.

  Returns:
    A `StaticVocabularyTableV1` backed by the given vocab file.
  """
  initializer = lookup_ops.TextFileIdTableInitializer(vocab)
  return lookup_ops.StaticVocabularyTableV1(initializer, num_oov)