Example #1
    def __init__(self,
                 training_data_ids_path,
                 validation_data_ids_path,
                 language_model_model_dir,
                 data_itos_path,
                 cuda_device_id=0,
                 batch_size=32,
                 dropout_multiplier=0.7):
        if cuda_device_id >= 0:
            torch.cuda.set_device(cuda_device_id)
        self.use_cuda = cuda_device_id >= 0

        with open(data_itos_path, 'rb') as itos_file:
            self.inspire_data_itos = pickle.load(itos_file)
        self.vocabulary_size = len(self.inspire_data_itos)

        # AWD-LSTM hyperparameters (the ULMFiT defaults)
        number_of_backpropagation_through_time_steps = 70
        number_of_hidden_units = 1150
        number_of_layers = 3
        self.embedding_size = 400
        optimization_function = partial(optim.Adam, betas=(0.8, 0.99))

        # the id files hold ragged object arrays of per-document token ids;
        # recent numpy versions need np.load(..., allow_pickle=True) here
        training_token_ids = np.load(training_data_ids_path)
        training_token_ids = np.concatenate(training_token_ids)
        validation_token_ids = np.load(validation_data_ids_path)
        validation_token_ids = np.concatenate(validation_token_ids)

        training_dataloader = LanguageModelLoader(
            nums=training_token_ids,
            bs=batch_size,
            bptt=number_of_backpropagation_through_time_steps)
        validation_dataloader = LanguageModelLoader(
            nums=validation_token_ids,
            bs=batch_size,
            bptt=number_of_backpropagation_through_time_steps)
        model = LanguageModelData(
            path=language_model_model_dir,
            pad_idx=1,
            n_tok=self.vocabulary_size,
            trn_dl=training_dataloader,
            val_dl=validation_dataloader,
            bs=batch_size,
            bptt=number_of_backpropagation_through_time_steps)

        # order matches get_model below: [dropouti, dropout, wdrop, dropoute, dropouth]
        dropouts = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * dropout_multiplier

        self.learner = model.get_model(opt_fn=optimization_function,
                                       emb_sz=self.embedding_size,
                                       n_hid=number_of_hidden_units,
                                       n_layers=number_of_layers,
                                       dropouti=dropouts[0],
                                       dropout=dropouts[1],
                                       wdrop=dropouts[2],
                                       dropoute=dropouts[3],
                                       dropouth=dropouts[4])
        # AR (alpha) and TAR (beta) regularization on the RNN activations
        self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self.learner.clip = 0.3  # gradient norm clipping
        self.learner.metrics = [accuracy]
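
The constructor above only wires up the learner; it never calls fit. A minimal usage sketch, assuming the enclosing class is named LanguageModelTrainer and using placeholder paths (both the class name and the paths are hypothetical), against the fastai 0.7 fit API:

# sketch only: class name and paths are hypothetical
trainer = LanguageModelTrainer(
    training_data_ids_path='data/train_ids.npy',
    validation_data_ids_path='data/val_ids.npy',
    language_model_model_dir='models',
    data_itos_path='data/itos.pkl')
trainer.learner.fit(1e-3, n_cycle=1, wds=1e-7, use_clr=(32, 2), cycle_len=10)
trainer.learner.save_encoder('lm_encoder')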
Example #2
    def _train_lm(self, train_ids, batch_size=4, val_ids=None):
        # val_ids is required despite the keyword default: np.concatenate(None) fails
        if val_ids is None:
            raise ValueError("val_ids must be provided")
        train_dataloader = LanguageModelLoader(np.concatenate(train_ids),
                                               batch_size, self._bptt)
        val_dataloader = LanguageModelLoader(np.concatenate(val_ids),
                                             batch_size, self._bptt)

        md = LanguageModelData("tmp",              # working directory
                               1,                  # pad token index
                               self._vocab.size,   # vocabulary size
                               train_dataloader,
                               val_dataloader,
                               bs=batch_size,
                               bptt=self._bptt)

        self._language_model = md.get_model(self.OPT_FN,
                                            self._embedding_size,
                                            self._n_hidden_activations,
                                            self._n_layers,
                                            dropouti=self._dropouts_lm[0],
                                            dropout=self._dropouts_lm[1],
                                            wdrop=self._dropouts_lm[2],
                                            dropoute=self._dropouts_lm[3],
                                            dropouth=self._dropouts_lm[4])

        self._language_model.metrics = [accuracy]
        self._language_model.unfreeze()

        # run an LR range test before each fit; lr_find restores the weights afterwards
        lr = 1e-3
        self._language_model.lr_find(start_lr=lr / 10,
                                     end_lr=lr * 50,
                                     linear=True)
        self._language_model.fit(
            lr / 2,
            1,
            wds=self._wd,
            use_clr=(32, 2),
            cycle_len=1,
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._language_model.lr_find(start_lr=lr / 10,
                                     end_lr=lr * 10,
                                     linear=True)

        self._language_model.fit(
            lr,
            1,
            wds=self._wd,
            use_clr=(32, 2),
            cycle_len=20,
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._language_model.save_encoder("enc_weights")
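
LoggingCallback is not part of fastai 0.7 and is not defined in this snippet. A minimal sketch of what it might look like against fastai 0.7's Callback interface (the log format and file handling here are assumptions):

from fastai.sgdr import Callback

class LoggingCallback(Callback):
    # hypothetical reconstruction: append per-epoch metrics to a log file
    def __init__(self, save_path):
        self.save_path = save_path

    def on_train_begin(self):
        self.epoch = 0
        self.log_file = open(self.save_path, 'a')

    def on_epoch_end(self, metrics):
        self.log_file.write('epoch %d: %s\n' % (self.epoch, str(metrics)))
        self.epoch += 1

    def on_train_end(self):
        self.log_file.close()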
Example #3

def get_voc_stats(tokens):
    # token id 1 is treated as the unknown-word marker in this vocabulary
    total_tokens = np.sum([x.shape[0] for x in tokens])
    unks = np.sum([np.sum(x == 1) for x in tokens])
    print("Total tokens: %d\nUnknown percentage: %.2f %%" % (total_tokens, unks * 100 / total_tokens))

get_voc_stats(tokens)


# In[7]:


bptt = 50
batch_size = 64
n_tok = int(np.max([np.max(x) for x in tokens]) + 1)  # vocab size = largest id + 1
trn_loader = LanguageModelLoader(
    np.concatenate(trn_tokens), batch_size, bptt)
val_loader = LanguageModelLoader(
    np.concatenate(val_tokens), batch_size, bptt)
tst_loader = LanguageModelLoader(
    np.concatenate(tst_tokens), batch_size, bptt)
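
A quick way to sanity-check the loaders is to pull a single batch; in fastai 0.7 a LanguageModelLoader yields (x, y) pairs where x is a (sequence_length, batch_size) tensor with the sequence length jittered around bptt, and y holds the flattened next-token targets:

# sketch: inspect one batch from the training loader
x, y = next(iter(trn_loader))
print(x.size())  # roughly (bptt, batch_size); length varies around bptt
print(y.size())  # sequence_length * batch_size flattened targets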


# In[8]:


from collections import Counter
# collect the tokens that immediately precede an <unk> (id 1) in the first
# 10000 training documents, presumably for inspection with Counter(tmp)
tmp = []
for i in range(10000):
    for j in range(1, trn_tokens[i].shape[0]):
        if trn_tokens[i][j] == 1:
            tmp.append(trn_tokens[i][j - 1])

# cells In[9]-In[18] are missing from the source listing; they define
# new_matrix, BEG, hits and itos (see the sketch below)
new_matrix[BEG, :] = 0
hits, hits * 100 / len(itos[3:])
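
In the standard ULMFiT notebooks, the missing cells build the adapted embedding matrix by mapping the new vocabulary onto the pretrained one and backing off to the mean embedding row. A hedged reconstruction, where wgts (the pretrained state dict), itos (the new vocabulary) and stoi_pretrained (the pretrained word-to-id map) are assumed names:

# hypothetical reconstruction of the missing vocab-transfer cells
enc_wgts = to_np(wgts['0.encoder.weight'])
row_mean = enc_wgts.mean(0)
new_matrix = np.zeros((len(itos), enc_wgts.shape[1]), dtype=np.float32)
hits = 0
for i, w in enumerate(itos):
    r = stoi_pretrained.get(w, -1)
    if r >= 0:
        new_matrix[i] = enc_wgts[r]  # reuse the pretrained embedding
        hits += 1
    else:
        new_matrix[i] = row_mean     # back off to the mean row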

# In[19]:

# transplant the adapted embedding matrix into the encoder, the
# embedding-dropout wrapper, and the weight-tied decoder
weights['0.encoder.weight'] = T(new_matrix)
weights['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_matrix))
weights['1.decoder.weight'] = T(np.copy(new_matrix))

# ## Language Model

# In[20]:

bs = 64
bptt = 50
trn_dl = LanguageModelLoader(np.concatenate(tokens_train), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(tokens_val), bs, bptt)

# In[21]:

import itertools
# largest token id in the training set (must be smaller than n_toks)
np.max(np.array(list(itertools.chain.from_iterable(tokens_train))))

# In[23]:

model_data = LanguageModelData(path,
                               2,        # pad token index
                               n_toks,
                               trn_dl,
                               val_dl,
                               bs=bs,
                               bptt=bptt)
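
Example #3 stops after constructing the LanguageModelData. The usual next step in the fastai 0.7 workflow is to build the learner and load the transplanted weights from In[19]; a sketch, where the sizes (400/1150/3) and dropouts are the ULMFiT defaults assumed for illustration, not values taken from this notebook:

# sketch: hyperparameters below are ULMFiT defaults, not from this listing
from functools import partial
import torch.optim as optim

drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.7
learner = model_data.get_model(partial(optim.Adam, betas=(0.8, 0.99)),
                               400, 1150, 3,
                               dropouti=drops[0], dropout=drops[1],
                               wdrop=drops[2], dropoute=drops[3],
                               dropouth=drops[4])
learner.model.load_state_dict(weights)  # load the transplanted embeddings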