def test_load_embedding_initializer(self):
  """Tests for the load_embedding_initializer wrapper.

  Builds an initializer that remaps a checkpointed embedding matrix
  ('some_scope/embeddings') from the old feature vocab to the new one,
  then materializes it through a partitioned variable and checks the
  resulting values row by row.
  """
  embedding_loading_initializer = (
      checkpoint_ops._load_embedding_initializer(
          new_vocab_file=self.new_feature_vocab_file,
          old_vocab_file=self.old_feature_vocab_file,
          new_vocab_size=5,
          embedding_dim=16,
          embedding_tensor_name='some_scope/embeddings',
          ckpt_path=[self.checkpoint_file],
          num_oov_buckets=1,
          initializer=self.initializer))

  # The new weight matrix is of size
  # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the
  # last vocab row (2nd last row) is newly initialized (wasn't found in
  # previous vocab) and the actual last row is OOV and also newly initialized.
  # Use a partitioned variable to confirm that the offset logic works.
  expected_remapped_embeddings = np.concatenate(
      [
          np.reshape(range(64), [4, 16]),
          np.reshape([self.init_val] * 32, [2, 16]),
      ],
      axis=0)
  remapped_embeddings = variable_scope.get_variable(
      name='embedding/obtained_embedding_matrix',
      shape=[6, 16],
      initializer=embedding_loading_initializer,
      partitioner=partitioned_variables.fixed_size_partitioner(2))

  with self.cached_session():
    self.evaluate(variables.global_variables_initializer())
    self.assertAllClose(expected_remapped_embeddings,
                        remapped_embeddings.as_tensor())
def test_load_embedding_initializer_session_run(self):
  """Session-based variant of the load_embedding_initializer test.

  NOTE(review): the file originally defined two methods both named
  `test_load_embedding_initializer`; the later definition silently
  shadowed the earlier one, so only one copy ever ran. This copy is
  renamed so both execute. It differs from the canonical test by
  evaluating through an explicit session (`.run()` / `.eval()`) rather
  than `self.evaluate`; the deprecated `test_session()` is replaced by
  its drop-in successor `cached_session()`.
  """
  embedding_loading_initializer = (
      checkpoint_ops._load_embedding_initializer(
          new_vocab_file=self.new_feature_vocab_file,
          old_vocab_file=self.old_feature_vocab_file,
          new_vocab_size=5,
          embedding_dim=16,
          embedding_tensor_name='some_scope/embeddings',
          ckpt_path=[self.checkpoint_file],
          num_oov_buckets=1,
          initializer=self.initializer))

  # The new weight matrix is of size
  # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the
  # last vocab row (2nd last row) is newly initialized (wasn't found in
  # previous vocab) and the actual last row is OOV and also newly initialized.
  # Use a partitioned variable to confirm that the offset logic works.
  expected_remapped_embeddings = np.concatenate(
      [
          np.reshape(range(64), [4, 16]),
          np.reshape([self.init_val] * 32, [2, 16]),
      ],
      axis=0)
  remapped_embeddings = variable_scope.get_variable(
      name='embedding/obtained_embedding_matrix',
      shape=[6, 16],
      initializer=embedding_loading_initializer,
      partitioner=partitioned_variables.fixed_size_partitioner(2))

  with self.cached_session():
    variables.global_variables_initializer().run()
    self.assertAllClose(expected_remapped_embeddings,
                        remapped_embeddings.as_tensor().eval())
def test_load_embedding_initializer_large_oov(self):
  """Tests for the large OOV case for load_embedding_initializer wrapper.

  Writes fresh new/old vocab files so that several new-vocab rows have no
  old-vocab counterpart, then remaps with num_oov_buckets=5 and verifies
  which rows come from the checkpoint versus the initializer.
  """
  self.new_feature_vocab_file = os.path.join(self.get_temp_dir(),
                                             'new_feature_vocab.txt')
  with open(self.new_feature_vocab_file, 'w') as f:
    f.write('\n'.join(['one', 'zero', 'two', 'four']) + '\n')

  # Checkpoint has 5 entries, 3 of which correspond to OOV.
  self.old_feature_vocab_file = os.path.join(self.get_temp_dir(),
                                             'old_feature_vocab.txt')
  with open(self.old_feature_vocab_file, 'w') as f:
    f.write('\n'.join(['zero', 'one']) + '\n')

  embedding_loading_initializer = (
      checkpoint_ops._load_embedding_initializer(
          new_vocab_file=self.new_feature_vocab_file,
          old_vocab_file=self.old_feature_vocab_file,
          new_vocab_size=4,
          embedding_dim=16,
          embedding_tensor_name='some_scope/embeddings',
          ckpt_path=[self.checkpoint_file],
          num_oov_buckets=5,
          initializer=self.initializer))

  # The new weight matrix is of size
  # [4 feature vocab + 5 feature OOV, 16 (embedding dimension)], where the
  # 3rd and 4th rows are not found in the old vocabulary and therefore newly
  # initialized. The last five rows are OOV and also newly initialized.
  # Use a partitioned variable to confirm that the offset logic works.
  expected_remapped_embeddings = np.concatenate(
      [
          np.reshape(range(16, 32), [1, 16]),
          np.reshape(range(16), [1, 16]),
          np.reshape([self.init_val] * 112, [7, 16]),
      ],
      axis=0)
  remapped_embeddings = variable_scope.get_variable(
      name='embedding/obtained_embedding_matrix',
      shape=[9, 16],
      initializer=embedding_loading_initializer,
      partitioner=partitioned_variables.fixed_size_partitioner(2))

  with self.cached_session():
    self.evaluate(variables.global_variables_initializer())
    self.assertAllClose(expected_remapped_embeddings,
                        remapped_embeddings.as_tensor())
def test_load_embedding_initializer_large_oov_session_run(self):
  """Session-based variant of the large-OOV load_embedding_initializer test.

  NOTE(review): the file originally defined two methods both named
  `test_load_embedding_initializer_large_oov`; the later definition
  silently shadowed the earlier one, so only one copy ever ran. This copy
  is renamed so both execute. It evaluates through an explicit session
  (`.run()` / `.eval()`); the deprecated `test_session()` is replaced by
  its drop-in successor `cached_session()`.
  """
  self.new_feature_vocab_file = os.path.join(self.get_temp_dir(),
                                             'new_feature_vocab.txt')
  with open(self.new_feature_vocab_file, 'w') as f:
    f.write('\n'.join(['one', 'zero', 'two', 'four']) + '\n')

  # Checkpoint has 5 entries, 3 of which correspond to OOV.
  self.old_feature_vocab_file = os.path.join(self.get_temp_dir(),
                                             'old_feature_vocab.txt')
  with open(self.old_feature_vocab_file, 'w') as f:
    f.write('\n'.join(['zero', 'one']) + '\n')

  embedding_loading_initializer = (
      checkpoint_ops._load_embedding_initializer(
          new_vocab_file=self.new_feature_vocab_file,
          old_vocab_file=self.old_feature_vocab_file,
          new_vocab_size=4,
          embedding_dim=16,
          embedding_tensor_name='some_scope/embeddings',
          ckpt_path=[self.checkpoint_file],
          num_oov_buckets=5,
          initializer=self.initializer))

  # The new weight matrix is of size
  # [4 feature vocab + 5 feature OOV, 16 (embedding dimension)], where the
  # 3rd and 4th rows are not found in the old vocabulary and therefore newly
  # initialized. The last five rows are OOV and also newly initialized.
  # Use a partitioned variable to confirm that the offset logic works.
  expected_remapped_embeddings = np.concatenate(
      [
          np.reshape(range(16, 32), [1, 16]),
          np.reshape(range(16), [1, 16]),
          np.reshape([self.init_val] * 112, [7, 16]),
      ],
      axis=0)
  remapped_embeddings = variable_scope.get_variable(
      name='embedding/obtained_embedding_matrix',
      shape=[9, 16],
      initializer=embedding_loading_initializer,
      partitioner=partitioned_variables.fixed_size_partitioner(2))

  with self.cached_session():
    variables.global_variables_initializer().run()
    self.assertAllClose(expected_remapped_embeddings,
                        remapped_embeddings.as_tensor().eval())
def test_load_embedding_initializer_old_row_vocab(self):
  """Tests for load_embedding_initializer where we constrain old vocab.

  Passes old_vocab_size=3 so only a prefix of the old vocabulary is
  considered during remapping; rows beyond that prefix must fall back to
  the initializer even if their tokens exist later in the old vocab file.
  """
  embedding_loading_initializer = (
      checkpoint_ops._load_embedding_initializer(
          new_vocab_file=self.new_feature_vocab_file,
          old_vocab_file=self.old_feature_vocab_file,
          # Considered old vocabulary becomes ['zero', 'one', 'two']. This
          # means 'three' in the new vocabulary is newly initialized.
          old_vocab_size=3,
          new_vocab_size=5,
          embedding_dim=16,
          embedding_tensor_name='some_scope/embeddings',
          ckpt_path=[self.checkpoint_file],
          num_oov_buckets=1,
          initializer=self.initializer))

  # The new weight matrix is of size
  # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the
  # last vocab row (2nd last row) is newly initialized (wasn't found in
  # previous vocab) and the actual last row is OOV and also newly initialized.
  # Use a partitioned variable to confirm that the offset logic works.
  expected_remapped_embeddings = np.concatenate(
      [
          np.reshape(range(48), [3, 16]),
          np.reshape([self.init_val] * 48, [3, 16]),
      ],
      axis=0)
  remapped_embeddings = variable_scope.get_variable(
      name='embedding/obtained_embedding_matrix',
      shape=[6, 16],
      initializer=embedding_loading_initializer,
      partitioner=partitioned_variables.fixed_size_partitioner(2))

  with self.cached_session():
    self.evaluate(variables.global_variables_initializer())
    self.assertAllClose(expected_remapped_embeddings,
                        remapped_embeddings.as_tensor())