def test_embedding_to_matrix(self):
    embedding = {'a': np.array(2), 'b': np.array(3), 'c': np.array(4)}
    # 'a' and 'b' appear in both mappings, 'c' only in the embedding,
    # 'd' only in the token index.
    token_index = {'a': 1, 'b': 2, 'd': 3}
    matrix = embedding_to_matrix(embedding, token_index, 1)
    np.testing.assert_array_equal(matrix[1], np.array(2))
    np.testing.assert_array_equal(matrix[2], np.array(3))
    # random values for the zero index and for tokens not in the embedding
    self.assertTrue(-1 < float(matrix[0]) < 1)
    self.assertTrue(-1 < float(matrix[3]) < 1)
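# For reference, a minimal sketch of the behavior the test above assumes of
# embedding_to_matrix: rows for tokens found in the embedding are copied in,
# while row 0 and tokens missing from the embedding keep small random values
# in (-1, 1). The name _embedding_to_matrix_sketch is hypothetical and this is
# an illustrative assumption, not the library's actual implementation.
def _embedding_to_matrix_sketch(embedding, token_index, embedding_dim):
    # Row 0 is reserved (e.g. for padding), hence the +1.
    matrix = np.random.uniform(-1, 1, (len(token_index) + 1, embedding_dim))
    for token, index in token_index.items():
        if token in embedding:
            matrix[index] = embedding[token]
    return matrix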
def _init_model(self, summarizer: Summarizer,
                train_data: Iterable[Tuple[str, str]]) -> None:
    # Fit encoder/decoder tokenizers on the training pairs and log their
    # resulting vocabulary sizes.
    tokenizer_encoder, tokenizer_decoder = self._create_tokenizers(train_data)
    self.logger.info(
        'vocab encoder: {vocab_enc}, vocab decoder: {vocab_dec}'.format(
            vocab_enc=tokenizer_encoder.vocab_size,
            vocab_dec=tokenizer_decoder.vocab_size))
    vectorizer = Vectorizer(tokenizer_encoder,
                            tokenizer_decoder,
                            max_input_len=self.max_input_len,
                            max_output_len=self.max_output_len)
    # Pre-trained embedding weights are optional; when no path is given, the
    # summarizer falls back to randomly initialized embeddings.
    embedding_weights_encoder, embedding_weights_decoder = None, None
    if self.embedding_path_encoder is not None:
        self.logger.info('loading encoder embedding from {}'.format(
            self.embedding_path_encoder))
        embedding = read_embedding(self.embedding_path_encoder,
                                   summarizer.embedding_size)
        embedding_weights_encoder = embedding_to_matrix(
            embedding=embedding,
            token_index=tokenizer_encoder.token_index,
            embedding_dim=summarizer.embedding_size)
    if self.embedding_path_decoder is not None:
        self.logger.info('loading decoder embedding from {}'.format(
            self.embedding_path_decoder))
        embedding = read_embedding(self.embedding_path_decoder,
                                   summarizer.embedding_size)
        embedding_weights_decoder = embedding_to_matrix(
            embedding=embedding,
            token_index=tokenizer_decoder.token_index,
            embedding_dim=summarizer.embedding_size)
    summarizer.init_model(
        preprocessor=self.preprocessor,
        vectorizer=vectorizer,
        embedding_weights_encoder=embedding_weights_encoder,
        embedding_weights_decoder=embedding_weights_decoder)
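# A hedged sketch of the read_embedding helper used above, assuming a
# GloVe-style text file (token followed by whitespace-separated floats) and
# that numpy is imported as np; the library's actual loader may differ. The
# name _read_embedding_sketch is hypothetical. Lines whose vector length does
# not match embedding_dim (e.g. malformed rows) are skipped.
def _read_embedding_sketch(path: str, embedding_dim: int) -> dict:
    embedding = {}
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            parts = line.rstrip().split(' ')
            vector = np.asarray(parts[1:], dtype='float32')
            if vector.shape[0] == embedding_dim:
                embedding[parts[0]] = vector
    return embedding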