def testWordEmbeddingInitializerFailIfBothTaskContextAndVocabulary(self):
  """Tests that passing both a task context and a vocabulary is rejected."""
  with self.test_session():
    with self.assertRaises(Exception):
      gen_parser_ops.word_embedding_initializer(
          vectors='/dev/null',
          task_context='/dev/null',
          vocabulary='/dev/null').eval()
def testWordEmbeddingInitializerRepeatability(self):
  """Tests that the initializer is repeatable given fixed seeds."""
  records_path = os.path.join(FLAGS.test_tmpdir, 'records2')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('.', [1, 2, 3]))  # 3 dims
  del writer

  # As long as there is one non-zero seed, the result should be repeatable.
  for seed1, seed2 in [(0, 1), (1, 0), (123, 456)]:
    with tf.Graph().as_default(), self.test_session():
      embeddings1 = gen_parser_ops.word_embedding_initializer(
          vectors=records_path,
          task_context=self._task_context,
          seed=seed1,
          seed2=seed2)
      embeddings2 = gen_parser_ops.word_embedding_initializer(
          vectors=records_path,
          task_context=self._task_context,
          seed=seed1,
          seed2=seed2)

      # The number of terms is based on the word map, which may change if the
      # test corpus is updated.  Just assert that there are some terms.
      self.assertGreater(tf.shape(embeddings1)[0].eval(), 0)
      self.assertGreater(tf.shape(embeddings2)[0].eval(), 0)
      self.assertEqual(tf.shape(embeddings1)[1].eval(), 3)
      self.assertEqual(tf.shape(embeddings2)[1].eval(), 3)
      self.assertAllEqual(embeddings1.eval(), embeddings2.eval())
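# Several of these tests call self._token_embedding(), a helper defined
# elsewhere on the test class.  A minimal sketch of that helper, assuming it
# mirrors the local _TokenEmbedding() function in the next test:
def _token_embedding(self, token, values):
  """Serializes a TokenEmbedding proto for `token` with vector `values`."""
  embedding = dictionary_pb2.TokenEmbedding()
  embedding.token = token
  embedding.vector.values.extend(values)
  return embedding.SerializeToString()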
def testWordEmbeddingInitializer(self):

  def _TokenEmbedding(token, embedding):
    e = dictionary_pb2.TokenEmbedding()
    e.token = token
    e.vector.values.extend(embedding)
    return e.SerializeToString()

  # Provide embeddings for the first three words in the word map.
  records_path = os.path.join(FLAGS.test_tmpdir, 'sstable-00000-of-00001')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(_TokenEmbedding('.', [1, 2]))
  writer.write(_TokenEmbedding(',', [3, 4]))
  writer.write(_TokenEmbedding('the', [5, 6]))
  del writer

  with self.test_session():
    embeddings = gen_parser_ops.word_embedding_initializer(
        vectors=records_path, task_context=self._task_context).eval()
    self.assertAllClose(
        np.array([[1. / (1 + 4)**.5, 2. / (1 + 4)**.5],
                  [3. / (9 + 16)**.5, 4. / (9 + 16)**.5],
                  [5. / (25 + 36)**.5, 6. / (25 + 36)**.5]]),
        embeddings[:3,])
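# The expected values above are each pretrained vector divided by its L2 norm.
# A hypothetical standalone check of that arithmetic (numpy only; the op is
# not involved, and this method is not part of the original tests):
def testL2NormalizationArithmeticSketch(self):
  vector = np.array([1.0, 2.0])
  normalized = vector / np.linalg.norm(vector)  # == vector / (1 + 4)**0.5
  self.assertAllClose([1. / 5**.5, 2. / 5**.5], normalized)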
def _Initializer(shape, dtype=tf.float32, partition_info=None):
  """Variable initializer that loads pretrained embeddings."""
  unused_dtype = dtype
  t = gen_parser_ops.word_embedding_initializer(
      vectors=embeddings_path,
      task_context=task_context,
      embedding_init=self._embedding_init)
  t.set_shape(shape)
  return t
def testWordEmbeddingInitializerVocabularyFileWithDuplicates(self):
  """Tests that duplicate terms in the vocabulary file are rejected."""
  records_path = os.path.join(FLAGS.test_tmpdir, 'records4')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('a', [1, 2, 3]))
  writer.write(self._token_embedding('b', [2, 3, 4]))
  writer.write(self._token_embedding('c', [3, 4, 5]))
  writer.write(self._token_embedding('d', [4, 5, 6]))
  writer.write(self._token_embedding('e', [5, 6, 7]))
  del writer

  vocabulary_path = os.path.join(FLAGS.test_tmpdir, 'vocabulary4')
  with open(vocabulary_path, 'w') as vocabulary_file:
    vocabulary_file.write('a\nc\ne\nx\ny\nx')  # 'x' is duplicated

  with self.test_session():
    with self.assertRaises(Exception):
      gen_parser_ops.word_embedding_initializer(
          vectors=records_path, vocabulary=vocabulary_path).eval()
def _Initializer(shape, dtype=tf.float32, partition_info=None):
  """Variable initializer that loads pretrained embeddings."""
  unused_dtype = dtype
  seed1, seed2 = tf.get_seed(self._seed)
  t = gen_parser_ops.word_embedding_initializer(
      vectors=embeddings_path,
      task_context=task_context,
      embedding_init=self._embedding_init,
      seed=seed1,
      seed2=seed2)
  t.set_shape(shape)
  return t
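# A hedged usage sketch, not from the original code: _Initializer follows the
# TF1 initializer signature (shape, dtype, partition_info), so inside the
# enclosing method it could be handed directly to tf.get_variable.  The
# variable name and dimensions below are illustrative assumptions.
embedding_matrix = tf.get_variable(
    'word_embeddings',
    shape=[64, 3],  # hypothetical [num_embeddings, embedding_dim]
    initializer=_Initializer)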
def testWordEmbeddingInitializer(self):
  # Provide embeddings for the first three words in the word map.
  records_path = os.path.join(test_flags.temp_dir(), 'records1')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('.', [1, 2]))
  writer.write(self._token_embedding(',', [3, 4]))
  writer.write(self._token_embedding('the', [5, 6]))
  del writer

  with self.test_session():
    embeddings = gen_parser_ops.word_embedding_initializer(
        vectors=records_path, task_context=self._task_context).eval()
    self.assertAllClose(
        np.array([[1. / (1 + 4)**.5, 2. / (1 + 4)**.5],
                  [3. / (9 + 16)**.5, 4. / (9 + 16)**.5],
                  [5. / (25 + 36)**.5, 6. / (25 + 36)**.5]]),
        embeddings[:3,])
def testWordEmbeddingInitializerPresetRowNumber(self):
  """Tests overriding the number of rows in the embedding matrix."""
  records_path = os.path.join(test_flags.temp_dir(), 'records3')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('a', [1, 2, 3]))
  writer.write(self._token_embedding('b', [2, 3, 4]))
  writer.write(self._token_embedding('c', [3, 4, 5]))
  writer.write(self._token_embedding('d', [4, 5, 6]))
  writer.write(self._token_embedding('e', [5, 6, 7]))
  del writer

  vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary3')
  with open(vocabulary_path, 'w') as vocabulary_file:
    vocabulary_file.write('a\nc\ne\nx\n')  # 'x' not in pretrained embeddings

  # Enumerate a variety of configurations.
  for cache_vectors_locally in [False, True]:
    for num_special_embeddings in [None, 1, 2, 5]:  # None = use default of 3
      for override_num_embeddings in [-1, 8, 10]:
        with self.test_session():
          embeddings = gen_parser_ops.word_embedding_initializer(
              vectors=records_path,
              vocabulary=vocabulary_path,
              override_num_embeddings=override_num_embeddings,
              cache_vectors_locally=cache_vectors_locally,
              num_special_embeddings=num_special_embeddings)

          # Expect 4 embeddings from the vocabulary plus special embeddings,
          # unless a positive override is given.
          expected_num_embeddings = 4 + (num_special_embeddings or 3)
          if override_num_embeddings > 0:
            expected_num_embeddings = override_num_embeddings
          self.assertAllEqual([expected_num_embeddings, 3],
                              tf.shape(embeddings).eval())

          # The first 3 embeddings should be pretrained.
          norm_a = (1.0 + 4.0 + 9.0)**0.5
          norm_c = (9.0 + 16.0 + 25.0)**0.5
          norm_e = (25.0 + 36.0 + 49.0)**0.5
          self.assertAllClose(
              [[1.0 / norm_a, 2.0 / norm_a, 3.0 / norm_a],
               [3.0 / norm_c, 4.0 / norm_c, 5.0 / norm_c],
               [5.0 / norm_e, 6.0 / norm_e, 7.0 / norm_e]],
              embeddings[:3].eval())