def __init__(self, training_data_ids_path, validation_data_ids_path,
             language_model_model_dir, data_itos_path, cuda_device_id=0,
             batch_size=32, dropout_multiplier=0.7):
    """Build an AWD-LSTM language-model learner over pre-tokenized data.

    Args:
        training_data_ids_path: ``.npy`` file holding per-document arrays of
            training token ids.
        validation_data_ids_path: ``.npy`` file holding per-document arrays of
            validation token ids.
        language_model_model_dir: directory passed to ``LanguageModelData``
            for model artifacts.
        data_itos_path: pickled int-to-string vocabulary list.
        cuda_device_id: CUDA device index; a negative value disables CUDA.
        batch_size: mini-batch size for both data loaders.
        dropout_multiplier: uniform scale applied to the five dropout rates.
    """
    self.use_cuda = cuda_device_id >= 0
    if self.use_cuda:
        # Only select a device when CUDA was actually requested; the
        # original called set_device unconditionally.
        torch.cuda.set_device(cuda_device_id)

    # Fix: close the vocabulary file instead of leaking the handle
    # (the original passed a bare open() straight into pickle.load).
    with open(data_itos_path, 'rb') as itos_file:
        self.inspire_data_itos = pickle.load(itos_file)
    self.vocabulary_size = len(self.inspire_data_itos)

    # AWD-LSTM hyper-parameters (ULMFiT-style defaults).
    number_of_backpropagation_through_time_steps = 70
    number_of_hidden_units = 1150
    number_of_layers = 3
    self.embedding_size = 400
    optimization_function = partial(optim.Adam, betas=(0.8, 0.99))

    # Each id file holds one array per document; flatten to a single stream.
    training_token_ids = np.concatenate(np.load(training_data_ids_path))
    validation_token_ids = np.concatenate(np.load(validation_data_ids_path))

    training_dataloader = LanguageModelLoader(
        nums=training_token_ids, bs=batch_size,
        bptt=number_of_backpropagation_through_time_steps)
    validation_dataloader = LanguageModelLoader(
        nums=validation_token_ids, bs=batch_size,
        bptt=number_of_backpropagation_through_time_steps)
    model = LanguageModelData(
        path=language_model_model_dir, pad_idx=1, n_tok=self.vocabulary_size,
        trn_dl=training_dataloader, val_dl=validation_dataloader,
        bs=batch_size, bptt=number_of_backpropagation_through_time_steps)

    # Base rates for (input, output, weight, embedding, hidden) dropout,
    # scaled together by dropout_multiplier.
    dropouts = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * dropout_multiplier
    self.learner = model.get_model(
        opt_fn=optimization_function, emb_sz=self.embedding_size,
        n_hid=number_of_hidden_units, n_layers=number_of_layers,
        dropouti=dropouts[0], dropout=dropouts[1], wdrop=dropouts[2],
        dropoute=dropouts[3], dropouth=dropouts[4])
    self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
    self.learner.clip = 0.3  # gradient clipping threshold
    self.learner.metrics = [accuracy]
def _train_lm(self, train_ids, batch_size=4, val_ids=None):
    """Fine-tune the language model on token-id arrays and save its encoder.

    Runs two learning-rate-finder passes interleaved with a one-cycle
    warm-up fit and a twenty-cycle main fit, then saves the encoder
    weights under the name ``"enc_weights"``.

    Args:
        train_ids: sequence of numpy arrays of training token ids.
        batch_size: mini-batch size for both loaders.
        val_ids: sequence of numpy arrays of validation token ids. Required
            in practice; the ``None`` default exists only for signature
            compatibility.

    Raises:
        ValueError: if ``val_ids`` is None. (Fix: the original fell through
            to an opaque TypeError inside ``np.concatenate``.)
    """
    if val_ids is None:
        raise ValueError("val_ids is required: pass validation token ids")
    train_dataloader = LanguageModelLoader(
        np.concatenate(train_ids), batch_size, self._bptt)
    val_dataloader = LanguageModelLoader(
        np.concatenate(val_ids), batch_size, self._bptt)
    md = LanguageModelData("tmp", 1, self._vocab.size, train_dataloader,
                           val_dataloader, bs=batch_size, bptt=self._bptt)
    self._language_model = md.get_model(
        self.OPT_FN, self._embedding_size, self._n_hidden_activations,
        self._n_layers,
        dropouti=self._dropouts_lm[0], dropout=self._dropouts_lm[1],
        wdrop=self._dropouts_lm[2], dropoute=self._dropouts_lm[3],
        dropouth=self._dropouts_lm[4])
    self._language_model.metrics = [accuracy]
    self._language_model.unfreeze()

    lr = 1e-3
    # Warm-up: probe the LR range, then one short cycle at half the base lr.
    self._language_model.lr_find(start_lr=lr / 10, end_lr=lr * 50, linear=True)
    self._language_model.fit(
        lr / 2, 1, wds=self._wd, use_clr=(32, 2), cycle_len=1,
        callbacks=[LoggingCallback(save_path="./tmp/log")])
    # Main run: re-probe a narrower LR range, then 20 cycles at the full lr.
    self._language_model.lr_find(start_lr=lr / 10, end_lr=lr * 10, linear=True)
    self._language_model.fit(
        lr, 1, wds=self._wd, use_clr=(32, 2), cycle_len=20,
        callbacks=[LoggingCallback(save_path="./tmp/log")])
    self._language_model.save_encoder("enc_weights")
def get_voc_stats(tokens):
    """Print the total token count and the percentage of unknown tokens.

    Args:
        tokens: sequence of 1-D numpy arrays of token ids; id 1 is treated
            as the unknown-token id (assumption — TODO confirm against the
            vocabulary used upstream).
    """
    total_tokens = np.sum([x.shape[0] for x in tokens])
    unks = np.sum([np.sum(x == 1) for x in tokens])
    # Fix: guard against an empty corpus instead of dividing by zero.
    unk_percentage = unks * 100 / total_tokens if total_tokens else 0.0
    print("Total tokens: %d\nUnknown Percentage: %.2f %%"
          % (total_tokens, unk_percentage))


get_voc_stats(tokens)


# In[7]:

bptt = 50
batch_size = 64
# Vocabulary size = largest token id present, plus one.
n_tok = int(np.max([np.max(x) for x in tokens]) + 1)

trn_loader = LanguageModelLoader(np.concatenate(trn_tokens), batch_size, bptt)
val_loader = LanguageModelLoader(np.concatenate(val_tokens), batch_size, bptt)
tst_loader = LanguageModelLoader(np.concatenate(tst_tokens), batch_size, bptt)


# In[8]:

from collections import Counter

# Collect the token immediately preceding each unknown (id 1) in the first
# 10,000 training documents, for later frequency analysis.
tmp = []
for i in range(10000):
    for j in range(1, trn_tokens[i].shape[0]):
        if trn_tokens[i][j] == 1:
            tmp.append(trn_tokens[i][j - 1])
# Zero out the embedding row reserved for the beginning-of-sequence token.
new_matrix[BEG, :] = 0
# hits was computed upstream (vocabulary entries that received pretrained
# vectors); show the count and its share of the non-special vocabulary
# (itos[3:] skips the first three special tokens).
hits, hits * 100 / len(itos[3:])


# In[19]:

# Install the adapted embedding matrix into the pretrained weights. The
# encoder, its dropout wrapper, and the tied decoder must carry the same
# values, hence the copies.
weights['0.encoder.weight'] = T(new_matrix)
weights['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_matrix))
weights['1.decoder.weight'] = T(np.copy(new_matrix))


# ## Language Model

# In[20]:

bs = 64
bptt = 50
trn_dl = LanguageModelLoader(np.concatenate(tokens_train), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(tokens_val), bs, bptt)


# In[21]:

# Sanity check: the largest token id present in the training corpus.
np.max(np.array(list(itertools.chain.from_iterable(tokens_train))))


# In[23]:

# NOTE(review): pad_idx is 2 here while other code in this project passes 1 —
# confirm which padding id this vocabulary actually uses.
model_data = LanguageModelData(path, 2, n_toks, trn_dl, val_dl, bs=bs, bptt=bptt)