Example #1
0
def load_embeddings():
    """Build and return pretrained fastText word embeddings.

    Reads the embeddings file path, vocabulary cap, and lowercasing
    behavior from command-line FLAGS. The fastText text format carries a
    header line, hence skip_header=True.
    """
    embeddings = embedding_utils.PretrainedWordEmbeddings(
        embeddings_path=FLAGS.fasttext_embeddings,
        lowercase=FLAGS.lowercase,
        max_vocab_size=FLAGS.max_vocab_size,
        skip_header=True)
    return embeddings
Example #2
0
def experiment_functions():
  """Get the necessary functions to run an experiment."""
  # The embeddings are expensive to load, so construct them a single time
  # and share the one instance across every function via closures.
  shared_embeddings = embedding_utils.PretrainedWordEmbeddings(
      embeddings_path=FLAGS.embeddings_path,
      max_vocab_size=FLAGS.max_vocab_size,
      lowercase=True)
  bind = dict(embeddings=shared_embeddings)
  model_fn = partial(model_function, **bind)
  train_input_fn = partial(input_function, is_train=True, **bind)
  eval_input_fn = partial(input_function, is_train=False, **bind)
  serving_input_receiver_fn = partial(serving_input_receiver_function, **bind)
  return model_fn, train_input_fn, eval_input_fn, serving_input_receiver_fn
Example #3
0
    def _test_pretrained_word_embeddings(self, trainable, num_oov_buckets):
        """End-to-end check of PretrainedWordEmbeddings on a tiny 4-word file.

        Writes four 5-dim vectors to a temp file, loads them with
        max_vocab_size=2 (so only "a" and "b" survive), then verifies the
        token->id mapping, the looked-up embedding values, OOV handling,
        and the vocab/size accessors.

        Args:
            trainable: Passed through to get_params(); whether the embedding
                variable is trainable.
            num_oov_buckets: Number of out-of-vocabulary hash buckets; OOV
                tokens map to ids >= max_vocab_size.
        """
        temp_dir = tempfile.mkdtemp(prefix="embedding_utils_test")
        temp_path = os.path.join(temp_dir, "emb.txt")
        # One "word v1 v2 v3 v4 v5" line per vocabulary entry.
        with tf.gfile.Open(temp_path, "w") as temp_file:
            temp_file.write("a 0.5 0.8 -0.1 0.0 1.0\n")
            temp_file.write("b -0.5 -0.8 0.1 -0.0 -1.0\n")
            temp_file.write("c -2.5 10 0.1 0 -0.005\n")
            temp_file.write("d -3.5 -3 1.1 0.5 1.5\n")
        # Only the first two rows ("a", "b") fit under this cap; "c" and "d"
        # become OOV.
        max_vocab_size = 2
        embeddings = embedding_utils.PretrainedWordEmbeddings(
            temp_path,
            max_vocab_size=max_vocab_size,
            num_oov_buckets=num_oov_buckets)
        with tf.Graph().as_default():
            embedding_weights, embedding_scaffold = embeddings.get_params(
                trainable=trainable)
            # "<UNK>" is not in the vocab, so it exercises the OOV path.
            dataset = tf.data.Dataset.from_tensor_slices(
                {"s": ["a", "b", "<UNK>"]})
            dataset = dataset.map(embeddings.token_to_word_id_mapper(["s"]))
            # The mapper should add an int32 "<key>_wid" feature alongside
            # the original string feature.
            self.assertDictEqual(dataset.output_types, {
                "s": tf.string,
                "s_wid": tf.int32
            })
            self.assertDictEqual(dataset.output_shapes, {"s": [], "s_wid": []})

            dataset = dataset.batch(3)
            iterator = dataset.make_initializable_iterator()
            features = iterator.get_next()

            emb = tf.nn.embedding_lookup(embedding_weights, features["s_wid"])

            # oov_metric returns (update_op, value); we only read the value.
            _, oov_metric = embeddings.oov_metric(features["s_wid"])

            with tf.Session() as sess:
                sess.run([
                    tf.global_variables_initializer(),
                    tf.tables_initializer(), iterator.initializer
                ])
                # The scaffold's init_fn loads the pretrained weights into
                # the (already-initialized) embedding variable.
                embedding_scaffold.init_fn(sess)
                # Local variables back the streaming oov_metric accumulators.
                sess.run([tf.local_variables_initializer()])

                tf_embedding_weights = sess.run(embedding_weights)
                # Weight matrix covers the kept vocab plus the OOV buckets.
                self.assertAllEqual(tf_embedding_weights.shape,
                                    [max_vocab_size + num_oov_buckets, 5])

                tf_s, tf_s_wid, tf_emb, tf_oov_metric = (sess.run(
                    [features["s"], features["s_wid"], emb, oov_metric]))

            # In-vocab tokens get ids 0 and 1; the OOV token must land in a
            # bucket past the real vocabulary.
            self.assertAllEqual(tf_s, ["a", "b", "<UNK>"])
            self.assertAllEqual(tf_s_wid[:2], [0, 1])
            self.assertGreater(tf_s_wid[2], 1)

            # Loaded vectors are expected to be L2-normalized on load.
            expected_a_emb = embedding_utils.l2_normalize(
                [0.5, 0.8, -0.1, 0.0, 1.0])
            expected_b_emb = embedding_utils.l2_normalize(
                [-0.5, -0.8, 0.1, 0.0, -1.0])
            expected_emb = np.stack([expected_a_emb, expected_b_emb])
            self.assertAllClose(tf_emb[:2, :], expected_emb)

            # With a single OOV bucket the bucket is deterministic and its
            # embedding is presumably zero-initialized — TODO confirm against
            # get_params().
            if num_oov_buckets == 1:
                expected_unk_emb = np.zeros([5])
                self.assertAllClose(tf_emb[2, :], expected_unk_emb)

            self.assertListEqual(["a", "b"], embeddings.get_vocab())
            self.assertEqual(5, embeddings.get_dims())
            self.assertEqual(max_vocab_size + num_oov_buckets,
                             embeddings.get_vocab_size_with_oov())

            # 1 OOV token out of 3 total tokens.
            self.assertAllClose(tf_oov_metric, 1. / 3)