def load_model_by_config(
        checkpoint: int,
        config: lmp.config.BaseConfig,
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> lmp.model.BaseRNNModel:
    r"""Helper function for constructing language model.

    Load model from pre-trained checkpoint when `checkpoint != -1`.

    Args:
        checkpoint:
            Pre-trained model's checkpoint.
        config:
            Configuration object with attributes `d_emb`, `d_hid`, `dropout`,
            `device`, `experiment`, `model_class`, `num_linear_layers` and
            `num_rnn_layers`.
        tokenizer:
            Tokenizer object with attributes `pad_token_id` and `vocab_size`.

    Returns:
        Same as `load_model`.
    """
    return load_model(
        checkpoint=checkpoint,
        d_emb=config.d_emb,
        d_hid=config.d_hid,
        device=config.device,
        dropout=config.dropout,
        experiment=config.experiment,
        model_class=config.model_class,
        num_linear_layers=config.num_linear_layers,
        num_rnn_layers=config.num_rnn_layers,
        pad_token_id=tokenizer.convert_token_to_id(tokenizer.pad_token),
        vocab_size=tokenizer.vocab_size
    )
def test_return_value(self):
    r"""Perplexity is greater than or equal to zero."""
    msg = 'Perplexity must be greater than or equal to zero.'

    for (
            d_emb,
            d_hid,
            dropout,
            is_uncased,
            model_cstr,
            num_linear_layers,
            num_rnn_layers,
            sequence,
            tokenizer_cstr,
    ) in product(*self.__class__.model_parameters.values()):
        tokenizer = tokenizer_cstr(is_uncased=is_uncased)
        pad_token_id = tokenizer.convert_token_to_id(tokenizer.pad_token)
        vocab_size = tokenizer.vocab_size
        model = model_cstr(
            d_emb=d_emb,
            d_hid=d_hid,
            dropout=dropout,
            num_linear_layers=num_linear_layers,
            num_rnn_layers=num_rnn_layers,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size
        )

        self.assertGreaterEqual(
            lmp.util.perplexity_eval(
                device=torch.device('cpu'),
                model=model,
                sequence=sequence,
                tokenizer=tokenizer
            ),
            0,
            msg=msg
        )
def test_return_type(self):
    r"""Return `float`."""
    msg = 'Must return `float`.'

    for (
            d_emb,
            d_hid,
            dropout,
            is_uncased,
            model_cstr,
            num_linear_layers,
            num_rnn_layers,
            sequence,
            tokenizer_cstr,
    ) in product(*self.__class__.model_parameters.values()):
        tokenizer = tokenizer_cstr(is_uncased=is_uncased)
        pad_token_id = tokenizer.convert_token_to_id(tokenizer.pad_token)
        vocab_size = tokenizer.vocab_size
        model = model_cstr(
            d_emb=d_emb,
            d_hid=d_hid,
            dropout=dropout,
            num_linear_layers=num_linear_layers,
            num_rnn_layers=num_rnn_layers,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size
        )

        self.assertIsInstance(
            lmp.util.perplexity_eval(
                device=torch.device('cpu'),
                model=model,
                sequence=sequence,
                tokenizer=tokenizer
            ),
            float,
            msg=msg
        )
def test_return_type(self):
    r"""Return `str`."""
    msg = 'Must return `str`.'

    for (
            d_emb,
            d_hid,
            dropout,
            is_uncased,
            model_cstr,
            num_linear_layers,
            num_rnn_layers,
            tokenizer_cstr,
    ) in product(*self.__class__.model_parameters.values()):
        tokenizer = tokenizer_cstr(is_uncased=is_uncased)
        pad_token_id = tokenizer.convert_token_to_id(tokenizer.pad_token)
        vocab_size = tokenizer.vocab_size
        model = model_cstr(
            d_emb=d_emb,
            d_hid=d_hid,
            dropout=dropout,
            num_linear_layers=num_linear_layers,
            num_rnn_layers=num_rnn_layers,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size
        )

        pred_word = analogy_inference(
            device=self.device,
            model=model,
            tokenizer=tokenizer,
            word_a=self.word_a,
            word_b=self.word_b,
            word_c=self.word_c
        )

        self.assertIsInstance(pred_word, str, msg=msg)
def load_model_by_config(
        checkpoint: int,
        config: lmp.config.BaseConfig,
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]:
    r"""Helper function for constructing language model.

    Load model from pre-trained checkpoint when `checkpoint != -1`.

    Args:
        checkpoint:
            Pre-trained model's checkpoint. Must be greater than or equal to
            `-1`.
        config:
            Configuration object with attributes `d_emb`, `d_hid`, `dropout`,
            `device`, `experiment`, `model_class`, `num_linear_layers` and
            `num_rnn_layers`.
        tokenizer:
            Tokenizer object with attributes `pad_token_id` and `vocab_size`.

    Raises:
        TypeError:
            When `config` is not an instance of `lmp.config.BaseConfig` or
            `tokenizer` is not an instance of `lmp.tokenizer.BaseTokenizer`.
        ValueError:
            When `checkpoint < -1` or `config.model_class` is not supported.

    Returns:
        Same as `load_model`.
    """
    # Type check.
    if not isinstance(config, lmp.config.BaseConfig):
        raise TypeError(
            '`config` must be an instance of `lmp.config.BaseConfig`.'
        )

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    return load_model(
        checkpoint=checkpoint,
        d_emb=config.d_emb,
        d_hid=config.d_hid,
        device=config.device,
        dropout=config.dropout,
        experiment=config.experiment,
        model_class=config.model_class,
        num_linear_layers=config.num_linear_layers,
        num_rnn_layers=config.num_rnn_layers,
        pad_token_id=tokenizer.convert_token_to_id(tokenizer.pad_token),
        vocab_size=tokenizer.vocab_size
    )
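# A minimal usage sketch for `load_model_by_config`, assuming the same module
# context as above (`lmp`, `typing.Union` imported). The configuration values,
# dataset name, experiment name, `model_class` and `tokenizer_class` strings
# below are illustrative assumptions, not values from the original source; any
# concrete `lmp.tokenizer.BaseTokenizer` subclass may be passed in.
def example_load_model(
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]:
    config = lmp.config.BaseConfig(
        d_emb=100,                    # Assumed embedding dimension.
        d_hid=300,                    # Assumed hidden dimension.
        dataset='example_dataset',    # Assumed dataset name.
        dropout=0.1,
        experiment='example',         # Assumed experiment name.
        model_class='rnn',            # Assumed model class; must be supported.
        num_linear_layers=1,
        num_rnn_layers=1,
        tokenizer_class='char_dict'   # Assumed tokenizer class name.
    )
    # `checkpoint=-1` constructs a freshly initialized model; any other
    # checkpoint loads `model-{checkpoint}.pt` as in the test below.
    return load_model_by_config(
        checkpoint=-1,
        config=config,
        tokenizer=tokenizer
    )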
def test_pure_function(self):
    r"""Perplexity must be the same when given the same input."""
    msg = 'Perplexity must be the same when given the same input.'

    for (
            d_emb,
            d_hid,
            dropout,
            is_uncased,
            model_cstr,
            num_linear_layers,
            num_rnn_layers,
            sequence,
            tokenizer_cstr,
    ) in product(*self.__class__.model_parameters.values()):
        tokenizer = tokenizer_cstr(is_uncased=is_uncased)
        pad_token_id = tokenizer.convert_token_to_id(tokenizer.pad_token)
        vocab_size = tokenizer.vocab_size
        model = model_cstr(
            d_emb=d_emb,
            d_hid=d_hid,
            dropout=dropout,
            num_linear_layers=num_linear_layers,
            num_rnn_layers=num_rnn_layers,
            pad_token_id=pad_token_id,
            vocab_size=vocab_size
        )

        self.assertEqual(
            lmp.util.perplexity_eval(
                device=torch.device('cpu'),
                model=model,
                sequence=sequence,
                tokenizer=tokenizer
            ),
            lmp.util.perplexity_eval(
                device=torch.device('cpu'),
                model=model,
                sequence=sequence,
                tokenizer=tokenizer
            ),
            msg=msg
        )
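# A minimal sketch of calling `lmp.util.perplexity_eval` directly, outside of
# the test loops above. The sample sequence is an assumption; `model` and
# `tokenizer` can be any pair constructed the same way as in the tests.
def example_perplexity(
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> float:
    # Lower perplexity means the model assigns higher likelihood to the
    # sequence; the return value is always a non-negative `float`.
    return lmp.util.perplexity_eval(
        device=torch.device('cpu'),
        model=model,
        sequence='hello world',  # Assumed sample sequence.
        tokenizer=tokenizer
    )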
def test_load_result(self):
    r"""Load result must be consistent."""
    msg = 'Inconsistent load result.'
    test_path = os.path.join(
        self.__class__.test_dir,
        f'model-{self.__class__.checkpoint}.pt'
    )

    for (
            d_emb,
            d_hid,
            dropout,
            is_uncased,
            (model_class, model_cstr),
            num_linear_layers,
            num_rnn_layers,
            (tokenizer_class, tokenizer_cstr),
    ) in product(*self.__class__.model_parameters.values()):
        config = lmp.config.BaseConfig(
            d_emb=d_emb,
            d_hid=d_hid,
            dataset=self.__class__.dataset,
            dropout=dropout,
            experiment=self.__class__.experiment,
            model_class=model_class,
            num_linear_layers=num_linear_layers,
            num_rnn_layers=num_rnn_layers,
            tokenizer_class=tokenizer_class
        )
        tokenizer = tokenizer_cstr(is_uncased=is_uncased)
        pad_token_id = tokenizer.convert_token_to_id(tokenizer.pad_token)
        vocab_size = tokenizer.vocab_size

        try:
            # Create test file.
            ans_model = model_cstr(
                d_emb=d_emb,
                d_hid=d_hid,
                dropout=dropout,
                num_linear_layers=num_linear_layers,
                num_rnn_layers=num_rnn_layers,
                pad_token_id=pad_token_id,
                vocab_size=vocab_size
            )
            torch.save(ans_model.state_dict(), test_path)
            self.assertTrue(os.path.exists(test_path), msg=msg)

            model_1 = lmp.util.load_model_by_config(
                checkpoint=self.__class__.checkpoint,
                config=config,
                tokenizer=tokenizer
            )
            model_1 = model_1.to('cpu')
            model_2 = lmp.util.load_model_by_config(
                checkpoint=self.__class__.checkpoint,
                config=config,
                tokenizer=tokenizer
            )
            model_2 = model_2.to('cpu')

            self.assertEqual(
                len(list(ans_model.parameters())),
                len(list(model_1.parameters())),
                msg=msg
            )
            self.assertEqual(
                len(list(ans_model.parameters())),
                len(list(model_2.parameters())),
                msg=msg
            )

            # Both loaded models must match the saved parameters.
            for p1, p2 in zip(ans_model.parameters(), model_1.parameters()):
                self.assertTrue((p1 == p2).all().item(), msg=msg)
            for p1, p2 in zip(ans_model.parameters(), model_2.parameters()):
                self.assertTrue((p1 == p2).all().item(), msg=msg)
        finally:
            # Clean up test file.
            os.remove(test_path)
def analogy_inference(
        device: torch.device,
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        tokenizer: lmp.tokenizer.BaseTokenizer,
        word_a: str,
        word_b: str,
        word_c: str
) -> str:
    r"""Generate analog word based on `word_a`, `word_b` and `word_c`.

    This function performs word analogy based on the following rule:
        `word_a` : `word_b` = `word_c` : `word_d`
    where `word_d` is the prediction target.

    Args:
        device:
            Model running device.
        model:
            Language model.
        tokenizer:
            Converts tokens (including `word_a`, `word_b` and `word_c`) into
            token ids and converts the predicted token id back into a token
            (`word_d`). This is needed since we use a word embedding layer in
            our language model.
        word_a:
        word_b:
        word_c:
            Query words for word analogy.

    Raises:
        TypeError:
            When one of the arguments is not an instance of its type
            annotation.

    Returns:
        Predicted word following the word analogy rule.
    """
    # Type check.
    if not isinstance(device, torch.device):
        raise TypeError('`device` must be an instance of `torch.device`.')

    if not isinstance(
            model,
            (lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel)
    ):
        raise TypeError(
            '`model` must be an instance of '
            '`Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel]`.'
        )

    if not isinstance(tokenizer, lmp.tokenizer.BaseTokenizer):
        raise TypeError(
            '`tokenizer` must be an instance of `lmp.tokenizer.BaseTokenizer`.'
        )

    if not isinstance(word_a, str):
        raise TypeError('`word_a` must be an instance of `str`.')

    if not isinstance(word_b, str):
        raise TypeError('`word_b` must be an instance of `str`.')

    if not isinstance(word_c, str):
        raise TypeError('`word_c` must be an instance of `str`.')

    # Evaluation mode.
    model.eval()
    model = model.to(device)

    # Convert tokens (query words) into token ids.
    word_a_id = torch.LongTensor([tokenizer.convert_token_to_id(word_a)])
    word_b_id = torch.LongTensor([tokenizer.convert_token_to_id(word_b)])
    word_c_id = torch.LongTensor([tokenizer.convert_token_to_id(word_c)])

    # Perform analogy calculation.
    # Shape: `(1, E)`.
    out = (
        model.emb_layer(word_b_id.to(device))
        - model.emb_layer(word_a_id.to(device))
        + model.emb_layer(word_c_id.to(device))
    )

    # Calculate cosine similarity against every embedding vector.
    # Shape: `(V)`.
    pred = torch.nn.functional.cosine_similarity(out, model.emb_layer.weight)

    # Get the token id with maximum cosine similarity.
    # Shape: `(1)`.
    word_d_id = pred.argmax(dim=0).to('cpu').item()

    # Convert back to token.
    return tokenizer.convert_id_to_token(word_d_id)
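# A minimal usage sketch for `analogy_inference`. The query words below are
# the classic "king : queen = man : ?" analogy and are assumptions, not taken
# from the original source; the prediction quality depends entirely on the
# trained embedding layer of the supplied model.
def example_analogy(
        model: Union[lmp.model.BaseRNNModel, lmp.model.BaseResRNNModel],
        tokenizer: lmp.tokenizer.BaseTokenizer
) -> str:
    # For a well-trained model this is expected to return a token close to
    # 'woman', but any token in the vocabulary may be returned.
    return analogy_inference(
        device=torch.device('cpu'),
        model=model,
        tokenizer=tokenizer,
        word_a='king',
        word_b='queen',
        word_c='man'
    )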