def token_to_word_id_mapper(self, keys_to_map, suffix="_wid"):
  """Creates a mapping function to augment a tf.data.Dataset with word ids.

  Suppose we have a `dataset` with outputs `str1` and `str2` of arbitrary
  shape. Here is an example usage of this function:

  embeddings = PretrainedWordEmbeddings("/path/to/emb.txt", 10000)
  dataset = dataset.map(embeddings.token_to_word_id_mapper(['str1', 'str2']))

  Now the dataset will also include outputs `str1_wid` and `str2_wid` that
  can be used as features in a model. The `str1_wid` feature has the same
  shape as the `str1` feature, except the strings are replaced with their
  int32 word IDs.

  Args:
    keys_to_map: List of strings that are keys for tf.string Tensors to map
      to word ids.
    suffix: String to append to the given keys to indicate the mapped
      Tensors.

  Returns:
    _mapper: A mapping function that can be used with the tf.data.Dataset
      API.
  """
  return dataset_utils.string_to_int_mapper(
      keys_to_map=keys_to_map,
      mapping=self._idx2str,
      num_oov_buckets=self._num_oov_buckets,
      suffix=suffix)
def preprocess_mapper(features, lookup_table):
  """Model-specific preprocessing of features from the dataset."""
  features = _num_context_mapper(["context"])(features)
  features = _string_to_tokens_dataset_mapper(["question", "context"])(features)

  # Add the word IDs to the dataset ("question_wid" and "context_wid").
  features = dataset_utils.string_to_int_mapper(
      ["question_tok", "context_tok"],
      mapping=lookup_table,
      suffix="_wid")(features)

  return features
def preprocess_mapper(features, lookup_table):
  """Model-specific preprocessing of features from the dataset."""
  # Truncate contexts that are too long.
  features["context"] = features["context"][:FLAGS.max_context_len]

  # Add the input lengths to the dataset ("question_len" and "context_len").
  features = dataset_utils.length_mapper(["question", "context"])(features)

  # Add the word IDs to the dataset ("question_wid" and "context_wid").
  features = dataset_utils.string_to_int_mapper(
      ["question", "context"], mapping=lookup_table, suffix="_wid")(features)

  # Add the character IDs to the dataset ("question_cid" and "context_cid").
  features = char_utils.token_to_char_ids_mapper(
      ["question", "context"])(features)

  return features
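# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of wiring the preprocess_mapper above into a tf.data
# pipeline. The toy vocabulary and examples are assumptions made for
# illustration; in the real pipeline the vocabulary comes from the embedding
# file and the examples from the training data, and FLAGS.max_context_len,
# dataset_utils, and char_utils must already be defined/importable.
import functools

import tensorflow as tf

vocab = ["what", "is", "a", "short", "context"]
examples = {
    "question": [["what", "is", "this"]],
    "context": [["a", "short", "context"]],
}

dataset = tf.data.Dataset.from_tensor_slices(examples)
dataset = dataset.map(functools.partial(preprocess_mapper, lookup_table=vocab))
dataset = dataset.batch(1)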
def test_string_to_int_mapper(self):
  with tf.Graph().as_default():
    dataset = tf.data.Dataset.from_tensor_slices({
        "s": [["a", "b"], ["c", "d"]]
    })
    dataset = dataset.map(dataset_utils.string_to_int_mapper(["s"], ["a", "c"]))
    dataset = dataset.batch(2)

    self.assertDictEqual(dataset.output_types,
                         {"s": tf.string, "s_id": tf.int32})

    iterator = dataset.make_initializable_iterator()
    features = iterator.get_next()
    with tf.Session() as sess:
      sess.run([tf.tables_initializer(), iterator.initializer])
      tf_s, tf_s_id = sess.run([features["s"], features["s_id"]])
      self.assertAllEqual(tf_s, [["a", "b"], ["c", "d"]])
      self.assertAllEqual(tf_s_id, [[0, 2], [1, 2]])
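# --- Illustrative sketch (not the library's implementation) ---
# To make the behavior checked by the test above concrete, here is one way
# such a string-to-int mapper could be written with TF1-style lookup tables:
# tokens found in `mapping` get their index, and everything else falls into
# one of `num_oov_buckets` extra ids (which is why "b" and "d" both map to 2
# in the test). The function name and body are assumptions for illustration.
import tensorflow as tf

def string_to_int_mapper_sketch(keys_to_map, mapping, num_oov_buckets=1,
                                suffix="_id"):
  """Returns a tf.data map function that adds int32 ids for string features."""
  table = tf.contrib.lookup.index_table_from_tensor(
      mapping=tf.constant(mapping), num_oov_buckets=num_oov_buckets)

  def _mapper(features):
    for key in keys_to_map:
      # Keep the original strings and add a "<key><suffix>" int32 feature.
      features[key + suffix] = tf.to_int32(table.lookup(features[key]))
    return features

  return _mapper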