Example #1
    def token_to_word_id_mapper(self, keys_to_map, suffix="_wid"):
        """Creates a mapping function to augment a tf.data.Dataset with word ids.

    Suppose we have a `dataset` with outputs `str1` and `str2` of arbitrary
    shape. Here is an example usage of this function:

    embeddings = PretrainedWordEmbeddings("/path/to/emb.txt", 10000)
    dataset = dataset.map(embeddings.token_to_word_id_mapper(['str1', 'str2']))

    Now the dataset will also include outputs `str1_wid` and `str2_wid` that
    can be used as features in a model.  The 'str1_wid' feature has the same
    shape as the 'str1' feature, except the string are replaced with their
    int32 word IDs.

    Args:
      keys_to_map: List of strings that are keys for tf.string Tensors to map to
          word ids.
      suffix: String to append to the given keys to indicate the mapped Tensors.

    Returns:
      _mapper: A mapping function that can be used with the tf.data.Dataset API.
    """
        return dataset_utils.string_to_int_mapper(
            keys_to_map=keys_to_map,
            mapping=self._idx2str,
            num_oov_buckets=self._num_oov_buckets,
            suffix=suffix)
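
The mapped `*_wid` outputs are ordinary int32 features, so a typical next step is an embedding lookup. A minimal sketch of that step, where `embed_word_ids` and `embedding_weights` are illustrative names not taken from the source, and `str1_wid` is the feature from the docstring above:

import tensorflow as tf

def embed_word_ids(features, embedding_weights):
    # Illustrative sketch only: "embedding_weights" is assumed to have shape
    # [vocab_size + num_oov_buckets, dim] so every word id has a row.
    return tf.nn.embedding_lookup(embedding_weights, features["str1_wid"])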
Example #2
def preprocess_mapper(features, lookup_table):
    """Model-specific preprocessing of features from the dataset."""
    features = _num_context_mapper(["context"])(features)
    features = _string_to_tokens_dataset_mapper(["question",
                                                 "context"])(features)

    # Add the word IDs to the dataset ("question_wid" and "context_wid").
    features = dataset_utils.string_to_int_mapper(
        ["question_tok", "context_tok"], mapping=lookup_table,
        suffix="_wid")(features)

    return features
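
Since this preprocess_mapper takes the lookup table as a second argument, it has to be bound to the table before being handed to Dataset.map. A minimal sketch of that wiring in TF 1.x; building the table with tf.contrib.lookup.index_table_from_file is an assumption here, not shown in the source:

import functools

import tensorflow as tf

# Assumed setup: vocab file -> lookup table -> mapper bound to the table.
lookup_table = tf.contrib.lookup.index_table_from_file(
    vocabulary_file="/path/to/vocab.txt", num_oov_buckets=1)
dataset = dataset.map(
    functools.partial(preprocess_mapper, lookup_table=lookup_table))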
Example #3
def preprocess_mapper(features, lookup_table):
    """Model-specific preprocessing of features from the dataset."""
    # Truncate contexts that are too long.
    features["context"] = features["context"][:FLAGS.max_context_len]

    # Add the input lengths to the dataset ("question_len" and "context_len").
    features = dataset_utils.length_mapper(["question", "context"])(features)

    # Add the word IDs to the dataset ("question_wid" and "context_wid").
    features = dataset_utils.string_to_int_mapper(["question", "context"],
                                                  mapping=lookup_table,
                                                  suffix="_wid")(features)

    # Add the character IDs to the dataset ("question_cid" and "context_cid").
    features = char_utils.token_to_char_ids_mapper(["question",
                                                    "context"])(features)
    return features
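
The length_mapper call above adds "question_len" and "context_len" features. As a rough illustration of what such a mapper does, here is a hypothetical sketch; the real dataset_utils.length_mapper may differ:

import tensorflow as tf

def length_mapper(keys_to_map, suffix="_len"):
    """Hypothetical sketch: adds the token count of each 1-D string feature."""
    def _mapper(features):
        for key in keys_to_map:
            # tf.size returns an int32 scalar with the number of tokens.
            features[key + suffix] = tf.size(features[key])
        return features
    return _mapper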
Example #4
  def test_string_to_int_mapper(self):
    with tf.Graph().as_default():
      dataset = tf.data.Dataset.from_tensor_slices({
          "s": [["a", "b"], ["c", "d"]]
      })
      dataset = dataset.map(dataset_utils.string_to_int_mapper(
          ["s"], ["a", "c"]))
      dataset = dataset.batch(2)

      self.assertDictEqual(dataset.output_types, {"s": tf.string,
                                                  "s_id": tf.int32})
      iterator = dataset.make_initializable_iterator()
      features = iterator.get_next()

      with tf.Session() as sess:
        sess.run([tf.tables_initializer(), iterator.initializer])
        tf_s, tf_s_id = sess.run([features["s"], features["s_id"]])

      self.assertAllEqual(tf_s, [["a", "b"], ["c", "d"]])
      self.assertAllEqual(tf_s_id, [[0, 2], [1, 2]])
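
The test pins down the id scheme: in-vocabulary strings get their vocab index ("a" -> 0, "c" -> 1) and everything else falls into a single OOV bucket (id 2). A behavior-compatible sketch of string_to_int_mapper using the TF 1.x tf.contrib.lookup API; this is an assumption, not necessarily the real dataset_utils code:

import tensorflow as tf

def string_to_int_mapper(keys_to_map, mapping, num_oov_buckets=1, suffix="_id"):
    # Ids 0..len(mapping)-1 follow vocab order; OOV strings hash into the
    # trailing num_oov_buckets ids, matching the asserted [[0, 2], [1, 2]].
    table = tf.contrib.lookup.index_table_from_tensor(
        mapping=mapping, num_oov_buckets=num_oov_buckets)

    def _mapper(features):
        for key in keys_to_map:
            # Same shape as the string Tensor; lookup returns int64, cast down.
            features[key + suffix] = tf.to_int32(table.lookup(features[key]))
        return features

    return _mapper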