Example #1
  def testWordEmbeddingInitializerFailIfBothTaskContextAndVocabulary(self):
    with self.test_session():
      with self.assertRaises(Exception):
        gen_parser_ops.word_embedding_initializer(
            vectors='/dev/null',
            task_context='/dev/null',
            vocabulary='/dev/null').eval()
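This test only exercises the failure mode when both sources are supplied; judging from the later examples, the op is given the pretrained vectors plus exactly one of task_context or vocabulary. A minimal sketch of the vocabulary-only form, assuming a records_path and vocabulary_path prepared as in the examples below (not part of the original test):

    # Sketch: specify the pretrained vectors and a vocabulary file only.
    with self.test_session():
      embeddings = gen_parser_ops.word_embedding_initializer(
          vectors=records_path, vocabulary=vocabulary_path).eval()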
Example #2
  def testWordEmbeddingInitializerRepeatability(self):
    records_path = os.path.join(FLAGS.test_tmpdir, 'records2')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('.', [1, 2, 3]))  # 3 dims
    del writer

    # As long as there is one non-zero seed, the result should be repeatable.
    for seed1, seed2 in [(0, 1), (1, 0), (123, 456)]:
      with tf.Graph().as_default(), self.test_session():
        embeddings1 = gen_parser_ops.word_embedding_initializer(
            vectors=records_path,
            task_context=self._task_context,
            seed=seed1,
            seed2=seed2)
        embeddings2 = gen_parser_ops.word_embedding_initializer(
            vectors=records_path,
            task_context=self._task_context,
            seed=seed1,
            seed2=seed2)

        # The number of terms is based on the word map, which may change if the
        # test corpus is updated.  Just assert that there are some terms.
        self.assertGreater(tf.shape(embeddings1)[0].eval(), 0)
        self.assertGreater(tf.shape(embeddings2)[0].eval(), 0)
        self.assertEqual(tf.shape(embeddings1)[1].eval(), 3)
        self.assertEqual(tf.shape(embeddings2)[1].eval(), 3)
        self.assertAllEqual(embeddings1.eval(), embeddings2.eval())
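The repeatability test above, like several later examples, relies on a self._token_embedding helper that is not shown in these snippets. Judging from the _TokenEmbedding function defined inline in the next example, it presumably serializes a dictionary_pb2.TokenEmbedding proto along these lines (a sketch, not the original helper):

  def _token_embedding(self, token, values):
    # Sketch of the assumed helper: wrap one token and its vector in a
    # TokenEmbedding proto and return the serialized bytes.
    embedding = dictionary_pb2.TokenEmbedding()
    embedding.token = token
    embedding.vector.values.extend(values)
    return embedding.SerializeToString()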
Example #3
    def testWordEmbeddingInitializer(self):
        def _TokenEmbedding(token, embedding):
            e = dictionary_pb2.TokenEmbedding()
            e.token = token
            e.vector.values.extend(embedding)
            return e.SerializeToString()

        # Provide embeddings for the first three words in the word map.
        records_path = os.path.join(FLAGS.test_tmpdir, "sstable-00000-of-00001")
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(_TokenEmbedding(".", [1, 2]))
        writer.write(_TokenEmbedding(",", [3, 4]))
        writer.write(_TokenEmbedding("the", [5, 6]))
        del writer

        with self.test_session():
            embeddings = gen_parser_ops.word_embedding_initializer(
                vectors=records_path, task_context=self._task_context
            ).eval()
        self.assertAllClose(
            np.array(
                [
                    [1.0 / (1 + 4) ** 0.5, 2.0 / (1 + 4) ** 0.5],
                    [3.0 / (9 + 16) ** 0.5, 4.0 / (9 + 16) ** 0.5],
                    [5.0 / (25 + 36) ** 0.5, 6.0 / (25 + 36) ** 0.5],
                ]
            ),
            embeddings[:3,],
        )
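The expected values in the assertion above are each pretrained vector divided by its L2 norm, which suggests the op unit-normalizes the pretrained rows. The same arithmetic as a quick NumPy check (a sketch, not part of the test):

import numpy as np

vectors = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
# Divide each row by its L2 norm, as the test expects.
expected = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
# expected[0] is [1 / sqrt(5), 2 / sqrt(5)], matching the first row above.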
Example #4
    def _Initializer(shape, dtype=tf.float32, partition_info=None):
      unused_dtype = dtype
      t = gen_parser_ops.word_embedding_initializer(
          vectors=embeddings_path,
          task_context=task_context,
          embedding_init=self._embedding_init)

      t.set_shape(shape)
      return t
Example #5
  def testWordEmbeddingInitializerVocabularyFileWithDuplicates(self):
    records_path = os.path.join(FLAGS.test_tmpdir, 'records4')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('a', [1, 2, 3]))
    writer.write(self._token_embedding('b', [2, 3, 4]))
    writer.write(self._token_embedding('c', [3, 4, 5]))
    writer.write(self._token_embedding('d', [4, 5, 6]))
    writer.write(self._token_embedding('e', [5, 6, 7]))
    del writer

    vocabulary_path = os.path.join(FLAGS.test_tmpdir, 'vocabulary4')
    with open(vocabulary_path, 'w') as vocabulary_file:
      vocabulary_file.write('a\nc\ne\nx\ny\nx')  # 'x' duplicated

    with self.test_session():
      with self.assertRaises(Exception):
        gen_parser_ops.word_embedding_initializer(
            vectors=records_path, vocabulary=vocabulary_path).eval()
Example #6
        def _Initializer(shape, dtype=tf.float32, partition_info=None):
            """Variable initializer that loads pretrained embeddings."""
            unused_dtype = dtype
            seed1, seed2 = tf.get_seed(self._seed)
            t = gen_parser_ops.word_embedding_initializer(
                vectors=embeddings_path,
                task_context=task_context,
                embedding_init=self._embedding_init,
                seed=seed1,
                seed2=seed2)

            t.set_shape(shape)
            return t
Example #7
    def _Initializer(shape, dtype=tf.float32, partition_info=None):
      """Variable initializer that loads pretrained embeddings."""
      unused_dtype = dtype
      seed1, seed2 = tf.get_seed(self._seed)
      t = gen_parser_ops.word_embedding_initializer(
          vectors=embeddings_path,
          task_context=task_context,
          embedding_init=self._embedding_init,
          seed=seed1,
          seed2=seed2)

      t.set_shape(shape)
      return t
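The _Initializer closures above follow the standard TensorFlow 1.x initializer signature (shape, dtype, partition_info), so presumably they are handed to tf.get_variable roughly as follows (a sketch; the variable name and shape symbols are placeholders, not taken from the original code):

    # Sketch of how such an initializer is typically consumed.
    word_embeddings = tf.get_variable(
        'word_embeddings',                      # hypothetical variable name
        shape=[num_embeddings, embedding_dim],  # hypothetical shape values
        initializer=_Initializer)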
Example #8
    def testWordEmbeddingInitializer(self):
        # Provide embeddings for the first three words in the word map.
        records_path = os.path.join(test_flags.temp_dir(), 'records1')
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(self._token_embedding('.', [1, 2]))
        writer.write(self._token_embedding(',', [3, 4]))
        writer.write(self._token_embedding('the', [5, 6]))
        del writer

        with self.test_session():
            embeddings = gen_parser_ops.word_embedding_initializer(
                vectors=records_path, task_context=self._task_context).eval()
        self.assertAllClose(
            np.array([[1. / (1 + 4)**.5, 2. / (1 + 4)**.5],
                      [3. / (9 + 16)**.5, 4. / (9 + 16)**.5],
                      [5. / (25 + 36)**.5, 6. / (25 + 36)**.5]]),
            embeddings[:3, ])
Example #9
    def testWordEmbeddingInitializerPresetRowNumber(self):
        records_path = os.path.join(test_flags.temp_dir(), 'records3')
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(self._token_embedding('a', [1, 2, 3]))
        writer.write(self._token_embedding('b', [2, 3, 4]))
        writer.write(self._token_embedding('c', [3, 4, 5]))
        writer.write(self._token_embedding('d', [4, 5, 6]))
        writer.write(self._token_embedding('e', [5, 6, 7]))
        del writer

        vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary3')
        with open(vocabulary_path, 'w') as vocabulary_file:
            vocabulary_file.write(
                'a\nc\ne\nx\n')  # 'x' not in pretrained embeddings

        # Enumerate a variety of configurations.
        for cache_vectors_locally in [False, True]:
            # None means use the default of 3 special embeddings.
            for num_special_embeddings in [None, 1, 2, 5]:
                for override_num_embeddings in [-1, 8, 10]:
                    with self.test_session():
                        embeddings = gen_parser_ops.word_embedding_initializer(
                            vectors=records_path,
                            vocabulary=vocabulary_path,
                            override_num_embeddings=override_num_embeddings,
                            cache_vectors_locally=cache_vectors_locally,
                            num_special_embeddings=num_special_embeddings)

                        # Expect 4 embeddings from the vocabulary plus special embeddings.
                        expected_num_embeddings = 4 + (num_special_embeddings
                                                       or 3)
                        if override_num_embeddings > 0:
                            expected_num_embeddings = override_num_embeddings
                        self.assertAllEqual([expected_num_embeddings, 3],
                                            tf.shape(embeddings).eval())

                        # The first 3 embeddings should be pretrained.
                        norm_a = (1.0 + 4.0 + 9.0)**0.5
                        norm_c = (9.0 + 16.0 + 25.0)**0.5
                        norm_e = (25.0 + 36.0 + 49.0)**0.5
                        self.assertAllClose(
                            [[1.0 / norm_a, 2.0 / norm_a, 3.0 / norm_a],
                             [3.0 / norm_c, 4.0 / norm_c, 5.0 / norm_c],
                             [5.0 / norm_e, 6.0 / norm_e, 7.0 / norm_e]],
                            embeddings[:3].eval())
Example #10
  def testWordEmbeddingInitializer(self):
    # Provide embeddings for the first three words in the word map.
    records_path = os.path.join(FLAGS.test_tmpdir, 'records1')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('.', [1, 2]))
    writer.write(self._token_embedding(',', [3, 4]))
    writer.write(self._token_embedding('the', [5, 6]))
    del writer

    with self.test_session():
      embeddings = gen_parser_ops.word_embedding_initializer(
          vectors=records_path,
          task_context=self._task_context).eval()
    self.assertAllClose(
        np.array([[1. / (1 + 4) ** .5, 2. / (1 + 4) ** .5],
                  [3. / (9 + 16) ** .5, 4. / (9 + 16) ** .5],
                  [5. / (25 + 36) ** .5, 6. / (25 + 36) ** .5]]),
        embeddings[:3,])
Example #11
  def testWordEmbeddingInitializerPresetRowNumber(self):
    records_path = os.path.join(test_flags.temp_dir(), 'records3')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('a', [1, 2, 3]))
    writer.write(self._token_embedding('b', [2, 3, 4]))
    writer.write(self._token_embedding('c', [3, 4, 5]))
    writer.write(self._token_embedding('d', [4, 5, 6]))
    writer.write(self._token_embedding('e', [5, 6, 7]))
    del writer

    vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary3')
    with open(vocabulary_path, 'w') as vocabulary_file:
      vocabulary_file.write('a\nc\ne\nx\n')  # 'x' not in pretrained embeddings

    # Enumerate a variety of configurations.
    for cache_vectors_locally in [False, True]:
      for num_special_embeddings in [None, 1, 2, 5]:  # None = use default of 3
        for override_num_embeddings in [-1, 8, 10]:
          with self.test_session():
            embeddings = gen_parser_ops.word_embedding_initializer(
                vectors=records_path,
                vocabulary=vocabulary_path,
                override_num_embeddings=override_num_embeddings,
                cache_vectors_locally=cache_vectors_locally,
                num_special_embeddings=num_special_embeddings)

            # Expect 4 embeddings from the vocabulary plus special embeddings.
            expected_num_embeddings = 4 + (num_special_embeddings or 3)
            if override_num_embeddings > 0:
              expected_num_embeddings = override_num_embeddings
            self.assertAllEqual([expected_num_embeddings, 3],
                                tf.shape(embeddings).eval())

            # The first 3 embeddings should be pretrained.
            norm_a = (1.0 + 4.0 + 9.0)**0.5
            norm_c = (9.0 + 16.0 + 25.0)**0.5
            norm_e = (25.0 + 36.0 + 49.0)**0.5
            self.assertAllClose(
                [[1.0 / norm_a, 2.0 / norm_a, 3.0 / norm_a],
                 [3.0 / norm_c, 4.0 / norm_c, 5.0 / norm_c],
                 [5.0 / norm_e, 6.0 / norm_e, 7.0 / norm_e]],
                embeddings[:3].eval())
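For concreteness: with the four-term vocabulary above ('a', 'c', 'e', 'x') and the default of 3 special embeddings, the expected shape is [4 + 3, 3] = [7, 3], and a positive override_num_embeddings takes precedence. The same expectation as a small standalone sketch (the helper name is hypothetical):

def expected_rows(num_vocabulary_terms, num_special_embeddings,
                  override_num_embeddings):
  # Mirrors the expected_num_embeddings logic in the test above.
  if override_num_embeddings > 0:
    return override_num_embeddings
  return num_vocabulary_terms + (num_special_embeddings or 3)

assert expected_rows(4, None, -1) == 7    # default of 3 special embeddings
assert expected_rows(4, 5, -1) == 9
assert expected_rows(4, None, 10) == 10   # override takes precedence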
Example #12
    def testWordEmbeddingInitializer(self):
        def _TokenEmbedding(token, embedding):
            e = dictionary_pb2.TokenEmbedding()
            e.token = token
            e.vector.values.extend(embedding)
            return e.SerializeToString()

        # Provide embeddings for the first three words in the word map.
        records_path = os.path.join(FLAGS.test_tmpdir,
                                    'sstable-00000-of-00001')
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(_TokenEmbedding('.', [1, 2]))
        writer.write(_TokenEmbedding(',', [3, 4]))
        writer.write(_TokenEmbedding('the', [5, 6]))
        del writer

        with self.test_session():
            embeddings = gen_parser_ops.word_embedding_initializer(
                vectors=records_path, task_context=self._task_context).eval()
        self.assertAllClose(
            np.array([[1. / (1 + 4)**.5, 2. / (1 + 4)**.5],
                      [3. / (9 + 16)**.5, 4. / (9 + 16)**.5],
                      [5. / (25 + 36)**.5, 6. / (25 + 36)**.5]]),
            embeddings[:3, ])