Example #1
    def __init__(self,
                 tfds_name: str = 'glue/sst2',
                 vocab_path: str = 'vocab.txt',
                 tokenizer: text.Tokenizer = text.WhitespaceTokenizer(),
                 split='train'):
        """Initializes the SST2 data source."""
        self.dataset, self.info = tfds.load(tfds_name,
                                            split=split,
                                            with_info=True)

        # Look up the feature name of the text and label in the dataset.
        # We assume there is one text input and one label.
        text_fields = filter(_is_text_field, self.info.features.items())
        label_fields = filter(_is_class_label, self.info.features.items())
        self.text_feature_name, _ = next(text_fields)
        self.label_feature_name, _ = next(label_fields)

        # Load the vocabulary.
        self.vocab = vocabulary.Vocabulary(vocab_path=vocab_path)

        # Convert the sentences to sequences of token IDs and compute length.
        self.tokenizer = tokenizer
        self.tf_vocab = vocab_to_hashtable(self.vocab,
                                           unk_idx=self.vocab.unk_idx)
        self.examples = self.dataset.map(self.prepare_example,
                                         num_parallel_calls=AUTOTUNE).cache()
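# Note: the constructor above relies on helpers defined elsewhere in the module
# (_is_text_field, _is_class_label, vocab_to_hashtable, AUTOTUNE and a
# vocabulary.Vocabulary class). A minimal sketch of what the first three might
# look like, assuming the vocabulary object exposes an ordered `tokens` list;
# the real implementations may differ.
import tensorflow as tf
import tensorflow_datasets as tfds

AUTOTUNE = tf.data.experimental.AUTOTUNE


def _is_text_field(feature_item):
    """Returns True if a (name, feature) pair from info.features is free text."""
    _, feature = feature_item
    return isinstance(feature, tfds.features.Text)


def _is_class_label(feature_item):
    """Returns True if a (name, feature) pair from info.features is a class label."""
    _, feature = feature_item
    return isinstance(feature, tfds.features.ClassLabel)


def vocab_to_hashtable(vocab, unk_idx):
    """Builds a static string-to-id table (assumes vocab.tokens is an ordered list)."""
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=list(vocab.tokens),
        values=tf.range(len(vocab.tokens), dtype=tf.int64))
    return tf.lookup.StaticHashTable(initializer, default_value=unk_idx)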
Example #2
def _parse_tfrecord_function(example, lookup_table):
    example_fmt = {
        'opcodes': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }
    parsed = tf.io.parse_single_example(example, example_fmt)
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(parsed['opcodes'])
    IDs = lookup_table.lookup(tokens)
    return IDs, parsed['label']
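# A hypothetical way to wire this parser into an input pipeline; the vocabulary
# entries, the TFRecord path and the table construction below are placeholders,
# not part of the original example.
import tensorflow as tf
import tensorflow_text as text

keys = tf.constant(['mov', 'push', 'call', 'ret'])   # placeholder opcode vocab
values = tf.range(tf.size(keys, out_type=tf.int64))
lookup_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(keys, values), num_oov_buckets=1)

dataset = tf.data.TFRecordDataset('opcodes.tfrecord')  # placeholder path
dataset = dataset.map(
    lambda example: _parse_tfrecord_function(example, lookup_table))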
Example #3
def get_tokenized_sequences(
        dataset: tf.data.Dataset,
        tokenizer: tftext.Tokenizer = tftext.WhitespaceTokenizer(),
        input_key: str = 'sentence') -> Iterable[Sequence[bytes]]:
    """Returns tokenized sequences for vocabulary building."""
    dataset = dataset.map(
        lambda example: tokenizer.tokenize(example[input_key]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    for sentence in tfds.as_numpy(dataset):
        yield sentence
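# Hypothetical usage: stream whitespace tokens from the SST-2 training split
# (the dataset name and the 'sentence' key are assumptions that match the
# defaults above).
import tensorflow_datasets as tfds

ds = tfds.load('glue/sst2', split='train')
for tokens in get_tokenized_sequences(ds):
    print(tokens)  # a numpy array of byte-string tokens for one sentence
    break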
Example #4
def _parse_tfrecord_function(example, opcodes_lookup_table, bytes_lookup_table):
    example_fmt = {
        'mnemonics': tf.io.FixedLenFeature([], tf.string),
        'bytes': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }
    parsed = tf.io.parse_single_example(example, example_fmt)
    tokenizer = text.WhitespaceTokenizer()
    opcode_tokens = tokenizer.tokenize(parsed['mnemonics'])
    byte_tokens = tokenizer.tokenize(parsed['bytes'])
    opcode_IDs = opcodes_lookup_table.lookup(opcode_tokens)
    byte_IDs = bytes_lookup_table.lookup(byte_tokens)

    return opcode_IDs, byte_IDs, parsed['label']
Example #5
def make_data(sentences, window_size):
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(sentences)
    ngrams = text.ngrams(tokens,
                         window_size + 1,
                         reduction_type=text.Reduction.STRING_JOIN)
    segments = np.array(
        [x[0].decode("UTF-8").split(" ") for x in ngrams.to_list()])
    input_batch = [' '.join(x) for x in segments[:, 0:-1]]
    target_batch = to_categorical(np.vectorize(lambda x: word_index[x] - 1)(
        segments[:, -1]),
                                  n_class,
                                  dtype='float32')
    return input_batch, target_batch
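# make_data depends on a module-level word_index mapping, an n_class count and
# Keras' to_categorical, none of which are shown above. A minimal, hypothetical
# setup just to exercise the function.
import numpy as np
import tensorflow as tf
import tensorflow_text as text
from tensorflow.keras.utils import to_categorical

word_index = {'everything': 1, 'saved': 2, 'lost': 3}  # hypothetical vocabulary
n_class = len(word_index)

input_batch, target_batch = make_data(
    tf.constant(['everything saved lost']), window_size=2)
print(input_batch)   # ['everything saved']
print(target_batch)  # one-hot row for 'lost'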
Example #6
  def call(self, strings: tf.Tensor, training=False, **kwargs) -> tf.Tensor:
    """Convert a tensor of strings into a tensor of token ids.

    Uses the WordpieceTokenizer to convert a tensor of rank N into a tensor of
    rank N+1. The added dimension holds the sequence of token ids produced when
    each string is split into tokens.
    e.g.
    ["What time is it.", "it's 3 o'clock"] ->
    [
      [2, 3, 4, 5, 0, 0],
      [3, 5, 7, 1, 8, 0]
    ]

    Parameters
    ----------
    strings: A tensor of strings to be tokenized.

    Returns
    -------
    The tensor of tokenized strings.

    """
    input_shape = strings.shape
    output_shape = input_shape.concatenate(
      tf.TensorShape([self.max_seq_length]))

    # Define the tokenizers and tokenize the strings.
    self.whitespace_tokenizer = tf_text.WhitespaceTokenizer()
    self.tokenizer = tf_text.WordpieceTokenizer(
      self.vocab_table,
      token_out_type=tf.int64
    )
    tokens = self.whitespace_tokenizer.tokenize('[SEP] ' + strings)
    tokens = self.tokenizer.tokenize(tokens)

    # Collapse one ragged dimension, then convert to a regular tensor.
    tokens = self._merge_dims(tokens, -2)

    tokens = tokens.to_tensor(default_value=0)
    rank = len(tokens.shape)

    # Truncate the sequence dimension if it is too long, or pad it if it is too short.
    tokens = tokens[..., :self.max_seq_length]
    seq_len = tf.shape(tokens)[-1]
    paddings = [[0, 0]] * (rank - 1) + [[0, self.max_seq_length - seq_len]]
    tokens = tf.pad(tokens, paddings, 'CONSTANT', constant_values=0)

    tokens = tf.ensure_shape(tokens, output_shape)
    return tokens
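  # The method above assumes layer state set up elsewhere: self.vocab_table,
  # self.max_seq_length and a self._merge_dims helper. A plausible sketch of
  # that state follows; it is an assumption, not the original implementation.
  def __init__(self, vocab_file: str, max_seq_length: int, **kwargs):
    super().__init__(**kwargs)
    self.max_seq_length = max_seq_length
    # Static wordpiece vocabulary table mapping each line of vocab_file to its
    # line number.
    self.vocab_table = tf.lookup.StaticHashTable(
      tf.lookup.TextFileInitializer(
        vocab_file,
        key_dtype=tf.string,
        key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER),
      default_value=0)

  def _merge_dims(self, ragged, axis):
    # Collapse the given ragged dimension into the one that follows it.
    return ragged.merge_dims(axis, axis + 1)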
Example #7
def _parse_tfrecord_function(example, opcodes_lookup_table,
                             bytes_lookup_table):
    example_fmt = {
        'opcodes': tf.io.FixedLenFeature([], tf.string),
        'bytes': tf.io.FixedLenFeature([], tf.string),
        'APIs': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }
    parsed = tf.io.parse_single_example(example, example_fmt)

    tokenizer = text.WhitespaceTokenizer()

    opcodes_tokens = tokenizer.tokenize(parsed['opcodes'])
    opcodes_IDs = opcodes_lookup_table.lookup(opcodes_tokens)

    bytes_tokens = tokenizer.tokenize(parsed['bytes'])
    bytes_IDs = bytes_lookup_table.lookup(bytes_tokens)

    feature_vector = tf.io.decode_raw(parsed['APIs'], tf.float32)
    return opcodes_IDs, bytes_IDs, feature_vector, parsed['label']
Example #8
def tokenize(ds, dataset_name):
    """Tokenizes a line into words with alphanum characters."""
    def extract_strings(example):
        if dataset_name == 'shakespeare':
            return tf.expand_dims(example['snippets'], 0)
        elif dataset_name == 'stackoverflow':
            return tf.expand_dims(example['tokens'], 0)
        else:
            raise app.UsageError(f'Dataset not supported: {dataset_name}')

    def tokenize_line(line):
        return tf.data.Dataset.from_tensor_slices(tokenizer.tokenize(line)[0])

    def mask_all_symbolic_words(word):
        return tf.math.logical_not(
            tf_text.wordshape(word, tf_text.WordShape.IS_PUNCT_OR_SYMBOL))

    tokenizer = tf_text.WhitespaceTokenizer()
    ds = ds.map(extract_strings)
    ds = ds.flat_map(tokenize_line)
    ds = ds.map(tf_text.case_fold_utf8)
    ds = ds.filter(mask_all_symbolic_words)
    return ds
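# Hypothetical call with an in-memory stand-in for a 'shakespeare' client
# dataset; the real examples come from federated data, so the structure here
# is only an assumption.
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices({'snippets': [b'To be, or not to be']})
words = tokenize(ds, 'shakespeare')
print([w.numpy() for w in words])  # lower-cased tokens, punctuation-only tokens dropped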
Example #9
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking version 1 (token dependent)
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                sentence_start.append(len(sentence_string))
                sentence_string += word + ' '
                sentence_end.append(len(sentence_string))
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentence breaking version 2 (StateBasedSentenceBreaker)
        sbv2_text_input = [['Welcome to the U.S.! Harry'],
                           ['Wu Tang Clan; ain\'t nothing']]
        sentence_breaker_v2 = text.StateBasedSentenceBreaker()
        sbv2_fragment_text, _, _ = (
            sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        sp_model = open(sp_model_file, 'rb').read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Split merge from logits tokenizer
        smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
        split_merge_from_logits = smfl_tokenizer.tokenize(
            b'IloveFlume!',
            # One pair of logits for each Unicode character from the text.  Each
            # pair indicates a "split" action if the first component is greater than
            # the second one, and a "merge" otherwise.
            [
                [2.7, -0.3],  # I: split
                [4.1, 0.82],  # l: split
                [-2.3, 4.3],  # o: merge
                [3.1, 12.2],  # v: merge
                [-3.0, 4.7],  # e: merge
                [2.7, -0.7],  # F: split
                [0.7, 15.0],  # l: merge
                [1.6, 23.0],  # u: merge
                [2.1, 11.0],  # m: merge
                [0.0, 20.0],  # e: merge
                [18.0, 0.7],  # !: split
            ])
        # Confirm TF unicode_script op that requires ICU works
        tf_unicode_script = tf.strings.unicode_script(
            [ord('a'), 0x0411, 0x82b8, ord(',')])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        # Assertion method
        def assert_check(tensor):
            return tf.assert_equal(tensor, tf.identity(tensor))

        # Assertions
        constrained_sequence_assert = assert_check(
            constrained_sequence.to_tensor())
        max_spanning_tree_assert = assert_check(max_spanning_tree)
        normalized_assert = assert_check(normalized)
        regex_split_assert = assert_check(regex_split.to_tensor())
        rouge_l_assert = assert_check(rouge_l)
        sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
        sentence_breaking_v2_assert = assert_check(
            sbv2_fragment_text.to_tensor())
        sentencepiece_assert = assert_check(sentencepiece.to_tensor())
        sentencepiece_id_assert = assert_check(sentencepiece_id)
        sentencepiece_size_assert = assert_check(sentencepiece_size)
        split_merge_assert = assert_check(split_merge)
        split_merge_from_logits_assert = assert_check(split_merge_from_logits)
        tf_unicode_script_assert = assert_check(tf_unicode_script)
        unicode_script_assert = assert_check(unicode_script.to_tensor())
        whitespace_assert = assert_check(whitespace.to_tensor())
        wordpiece_assert = assert_check(wordpiece.to_tensor())
        wordshapes_assert = assert_check(wordshapes)

        with tf.control_dependencies([
                constrained_sequence_assert, max_spanning_tree_assert,
                normalized_assert, regex_split_assert, rouge_l_assert,
                sentence_breaking_assert, sentence_breaking_v2_assert,
                sentencepiece_assert, sentencepiece_id_assert,
                sentencepiece_size_assert, split_merge_assert,
                split_merge_from_logits_assert, tf_unicode_script_assert,
                unicode_script_assert, whitespace_assert, wordpiece_assert,
                wordshapes_assert
        ]):
            y = tf.add(x, [1])
        return {'y': y}
Example #10
def tokenize_fun(tokenizer):
    """Standard text processing function."""
    wsp = text.WhitespaceTokenizer()
    return utils.compose(tokenizer.tokenize, wsp.tokenize, text.case_fold_utf8)
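# utils.compose is not shown here. A plausible stand-in, assuming it composes
# right to left (case folding first, then whitespace splitting, then the
# model-specific tokenizer).
import functools


def compose(*fns):
    """compose(f, g, h)(x) == f(g(h(x))) -- an assumed semantics for utils.compose."""
    return functools.reduce(lambda f, g: lambda x: f(g(x)), fns)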
Example #11
def main():
    # Unicode
    docs = tf.constant([
        u'Everything not saved will be lost.'.encode('UTF-16-BE'),
        u'Sad☹'.encode('UTF-16-BE')
    ])
    _ = tf.strings.unicode_transcode(docs,
                                     input_encoding='UTF-16-BE',
                                     output_encoding='UTF-8')

    # Tokenization
    # UnicodeScriptTokenizer
    tokenizer = text.UnicodeScriptTokenizer()
    tokens = tokenizer.tokenize(
        ['everything not saved will be lost', u'Sad☹'.encode('UTF-8')])
    print(f'Tokens: {tokens.to_list()}')

    # Unicode split
    tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
    print(f'Tokens: {tokens.to_list()}')

    # Offsets
    tokenizer = text.UnicodeScriptTokenizer()
    (tokens, _, end_offsets) = tokenizer.tokenize_with_offsets(
        ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    print(f'Tokens: {tokens.to_list()}')
    print(f'Offsets: {end_offsets.to_list()}')

    # TF.Data Example
    docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                               ["It's a trap!"]])
    tokenizer = text.WhitespaceTokenizer()
    tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
    iterator = iter(tokenized_docs)
    print(f'First sentence tokens: {next(iterator).to_list()}')
    print(f'Second sentence tokens: {next(iterator).to_list()}')

    # Other Text Ops
    # Wordshape
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Is capitalized?
    f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
    # Are all letters uppercased
    f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
    # Does the token contain punctuation?
    f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
    # Is the token a number?
    f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

    print(f'Is capitalized? {f1.to_list()}')
    print(f'Are all letters uppercased? {f2.to_list()}')
    print(f'Does the token contain punctuation? {f3.to_list()}')
    print(f'Is the token a number? {f4.to_list()}')

    # N-grams & Sliding Window
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Ngrams, in this case bi-gram (n = 2)
    bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)

    print(f'Bi-grams: {bigrams.to_list()}')
Example #12
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                sentence_start.append(len(sentence_string))
                sentence_string += word + ' '
                sentence_end.append(len(sentence_string))
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        sp_model = open(sp_model_file, 'rb').read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer - not in this version
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        with tf.control_dependencies([
                constrained_sequence, max_spanning_tree, normalized,
                regex_split, rouge_l, sentence_breaking, sentencepiece,
                sentencepiece_id, sentencepiece_size, split_merge,
                unicode_script, whitespace, wordpiece, wordshapes
        ]):
            y = tf.add(x, [1])
        return {'y': y}
Example #13
def get_datasets(n_devices,
                 task_name,
                 data_dir=None,
                 batch_size=256,
                 max_length=2000):
    """Get algorithmic datasets."""
    if batch_size % n_devices:
        raise ValueError("Batch size %d isn't divided evenly by n_devices %d" %
                         (batch_size, n_devices))

    train_path = data_dir + task_name + '_train.tsv'
    val_path = data_dir + task_name + '_val.tsv'
    test_path = data_dir + task_name + '_test.tsv'

    train_dataset = preprocess_dataset(train_path, batch_size)
    val_dataset = preprocess_dataset(val_path, batch_size)
    test_dataset = preprocess_dataset(test_path, batch_size)

    tf.logging.info('Finished preprocessing')
    tf.logging.info('Building vocab')
    # build vocab
    vocab_set = set()
    tokenizer = text.WhitespaceTokenizer()

    lengths = []
    for i, data in enumerate(val_dataset):
        examples = data['Source']
        examples = tokenizer.tokenize(examples.numpy())
        examples = np.reshape(examples, (-1)).tolist()
        lengths.append(len(examples))
        vocab_set.update(examples)
        if i % 1000 == 0:
            tf.logging.info('Processed {}'.format(i))
        if i > 1000:
            break
    vocab_set = list(vocab_set)
    tf.logging.info('Finished processing vocab size={}'.format(len(vocab_set)))

    encoder = tfds.deprecated.text.TokenTextEncoder(vocab_set)

    def tf_encode(x):
        result = tf.py_function(
            lambda s: tf.constant(encoder.encode(s.numpy())), [
                x,
            ], tf.int32)
        result.set_shape([None])
        return result

    def tokenize(d):
        return {
            'inputs': tf_encode(d['Source'])[:max_length],
            'targets': d['Target']
        }

    train_dataset = train_dataset.map(tokenize, num_parallel_calls=AUTOTUNE)
    val_dataset = val_dataset.map(tokenize, num_parallel_calls=AUTOTUNE)
    test_dataset = test_dataset.map(tokenize, num_parallel_calls=AUTOTUNE)

    max_shape = {'inputs': [max_length], 'targets': []}
    train_dataset = train_dataset.shuffle(
        buffer_size=1024,
        reshuffle_each_iteration=True).padded_batch(batch_size,
                                                    padded_shapes=max_shape)
    val_dataset = val_dataset.padded_batch(batch_size, padded_shapes=max_shape)
    test_dataset = test_dataset.padded_batch(batch_size,
                                             padded_shapes=max_shape)

    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    test_dataset = test_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return train_dataset, val_dataset, test_dataset, encoder
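# Hypothetical invocation; the task name and data directory are placeholders,
# and the *_train.tsv / *_val.tsv / *_test.tsv files are expected to already
# exist under data_dir.
train_ds, val_ds, test_ds, encoder = get_datasets(
    n_devices=1,
    task_name='listops',
    data_dir='/tmp/lra_data/',
    batch_size=8,
    max_length=512)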
Example #14
def _call_whitespace_tokenizer_to_ragged(test_case):
    tokenizer = tf_text.WhitespaceTokenizer()
    return tokenizer.tokenize(test_case)
Example #15
def tokenize(dataset):
    tokenizer = text.WhitespaceTokenizer()
    return tokenizer.tokenize(dataset)
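# Hypothetical call showing the ragged output.
import tensorflow as tf

tokens = tokenize(tf.constant(['a first line', 'a second line']))
print(tokens.to_list())  # [[b'a', b'first', b'line'], [b'a', b'second', b'line']]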
Example #16
    def __init__(self):
        super(WhitespaceTokenizer, self).__init__()
        self._tokenizer = tf_text.WhitespaceTokenizer()
Example #17
# https://www.tensorflow.org/tutorials/tensorflow_text/intro

import tensorflow as tf

import tensorflow_text as text

docs = tf.constant([
    u'Everything not saved will be lost.'.encode('UTF-16-BE'),
    u'Sad☹'.encode('UTF-16-BE')
])
utf8_docs = tf.strings.unicode_transcode(docs,
                                         input_encoding='UTF-16-BE',
                                         output_encoding='UTF-8')

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
(tokens, offset_starts, offset_limits) = tokenizer.tokenize_with_offsets(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())
Example #18
def tokenize_w_punctuation(tokenizer):
    """Text processing function which splits off punctuation."""
    wsp = text.WhitespaceTokenizer()
    return utils.compose(tokenizer.tokenize, wsp.tokenize,
                         tensor_punctuation_separator, text.case_fold_utf8)