Example #1
    def initialize_learner(self):
        optimization_function = partial(optim.Adam, betas=(0.8, 0.99))

        self.learner = RNN_Learner(data=self.model_data, models=TextModel(to_gpu(self.model)),
                                   opt_fn=optimization_function)
        self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self.learner.clip = 25.
        self.learner.metrics = [accuracy]
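
These snippets target the pre-1.0 fastai library (the ULMFiT-era API). As a rough sketch of the imports they assume (in old fastai 0.7 the wildcard import pulls in RNN_Learner, TextModel, to_gpu, seq2seq_reg, accuracy and the text data utilities; treat the exact module paths as an assumption, not part of the original code):

from functools import partial

import numpy as np
import torch
from torch import optim

from fastai.text import *  # assumed source of RNN_Learner, TextModel, to_gpu, seq2seq_reg, ...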
Example #2
class Classifier(object):
    def __init__(self,
                 data_itos_path,
                 cuda_device_id=0,
                 dropout_multiplier=0.5,
                 number_of_classes=3):
        torch.cuda.set_device(cuda_device_id)

        inspire_data_itos = pickle.load(open(data_itos_path, 'rb'))
        self.vocabulary_size = len(inspire_data_itos)
        self.inspire_data_stoi = collections.defaultdict(
            lambda: 0,
            {str(v): int(k)
             for k, v in enumerate(inspire_data_itos)})

        dropouts = np.array([0.4, 0.5, 0.05, 0.3, 0.4]) * dropout_multiplier

        number_of_back_propagation_through_time_steps = 70
        number_of_hidden_units = 1150
        number_of_layers = 3
        embedding_size = 400

        self.model = get_rnn_classifer(
            bptt=number_of_back_propagation_through_time_steps,
            max_seq=20 * number_of_back_propagation_through_time_steps,
            n_class=number_of_classes,
            n_tok=self.vocabulary_size,
            emb_sz=embedding_size,
            n_hid=number_of_hidden_units,
            n_layers=number_of_layers,
            pad_token=1,
            layers=[embedding_size * 3, 50, number_of_classes],
            drops=[dropouts[4], 0.1],
            dropouti=dropouts[0],
            wdrop=dropouts[1],
            dropoute=dropouts[2],
            dropouth=dropouts[3])

    def load_training_and_validation_data(self,
                                          training_data_ids_path,
                                          training_data_labels_path,
                                          validation_data_ids_path,
                                          validation_data_labels_path,
                                          classifier_data_dir,
                                          batch_size=10):
        training_token_ids = np.load(training_data_ids_path)
        validation_token_ids = np.load(validation_data_ids_path)
        training_labels = np.load(training_data_labels_path)
        validation_labels = np.load(validation_data_labels_path)

        training_labels = training_labels.flatten()
        validation_labels = validation_labels.flatten()
        training_labels -= training_labels.min()
        validation_labels -= validation_labels.min()

        training_dataset = TextDataset(training_token_ids, training_labels)
        validation_dataset = TextDataset(validation_token_ids,
                                         validation_labels)
        training_data_sampler = SortishSampler(
            data_source=training_token_ids,
            key=lambda x: len(training_token_ids[x]),
            bs=batch_size // 2)
        validation_data_sampler = SortSampler(
            data_source=validation_token_ids,
            key=lambda x: len(validation_token_ids[x]))
        training_dataloader = DataLoader(dataset=training_dataset,
                                         batch_size=batch_size // 2,
                                         transpose=True,
                                         num_workers=1,
                                         pad_idx=1,
                                         sampler=training_data_sampler)
        validation_dataloader = DataLoader(dataset=validation_dataset,
                                           batch_size=batch_size,
                                           transpose=True,
                                           num_workers=1,
                                           pad_idx=1,
                                           sampler=validation_data_sampler)
        self.model_data = ModelData(path=classifier_data_dir,
                                    trn_dl=training_dataloader,
                                    val_dl=validation_dataloader)

    def initialize_learner(self):
        optimization_function = partial(optim.Adam, betas=(0.8, 0.99))

        self.learner = RNN_Learner(data=self.model_data,
                                   models=TextModel(to_gpu(self.model)),
                                   opt_fn=optimization_function)
        self.learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self.learner.clip = 25.
        self.learner.metrics = [accuracy]

    def load_finetuned_language_model_weights(
            self, finetuned_language_model_encoder_path):
        load_model(self.learner.model[0],
                   finetuned_language_model_encoder_path)

    def train(self,
              trained_classifier_save_path,
              learning_rates=np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2]),
              weight_decay=1e-6,
              cycle_length=14):
        # Gradual unfreezing: first train only the last layer group,
        self.learner.freeze_to(-1)
        self.learner.fit(learning_rates,
                         n_cycle=1,
                         wds=weight_decay,
                         cycle_len=1,
                         use_clr=(8, 3))
        # then the last two layer groups,
        self.learner.freeze_to(-2)
        self.learner.fit(learning_rates,
                         n_cycle=1,
                         wds=weight_decay,
                         cycle_len=1,
                         use_clr=(8, 3))

        # and finally fine-tune the whole network for a longer cycle.
        self.learner.unfreeze()
        self.learner.fit(learning_rates,
                         n_cycle=1,
                         wds=weight_decay,
                         cycle_len=cycle_length,
                         use_clr=(32, 10))
        save_model(self.learner.model, trained_classifier_save_path)

    def load_trained_classifier_weights(self, trained_classifier_path):
        self.model.load_state_dict(
            torch.load(trained_classifier_path,
                       map_location=lambda storage, loc: storage))

    def predict(self, text):
        self.model.reset()
        self.model.eval()

        input_string = 'xbos xfld 1 ' + text
        texts = [input_string]
        tokens = Tokenizer(lang='en_core_web_sm').proc_all_mp(
            partition_by_cores(texts), lang='en_core_web_sm')
        encoded_tokens = [self.inspire_data_stoi[p] for p in tokens[0]]
        token_array = np.reshape(np.array(encoded_tokens), (-1, 1))
        token_array = Variable(torch.from_numpy(token_array))
        prediction_scores = self.model(token_array)
        prediction_scores_numpy = prediction_scores[0].data.cpu().numpy()

        return numpy_softmax(prediction_scores_numpy[0])[0]
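
A hedged usage sketch for the Classifier above; every path and the input text are placeholders, not from the original source:

# Hypothetical end-to-end usage; all file paths below are made up for illustration.
classifier = Classifier(data_itos_path='itos.pkl', number_of_classes=3)
classifier.load_training_and_validation_data(
    training_data_ids_path='trn_ids.npy',
    training_data_labels_path='trn_labels.npy',
    validation_data_ids_path='val_ids.npy',
    validation_data_labels_path='val_labels.npy',
    classifier_data_dir='data/classifier/')
classifier.initialize_learner()
classifier.load_finetuned_language_model_weights('lm_encoder.h5')
classifier.train(trained_classifier_save_path='classifier.h5')

# For inference only, load previously saved weights instead of training:
# classifier.load_trained_classifier_weights('classifier.h5')
probabilities = classifier.predict('some abstract text to classify')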
Example #3
    def _train_classifier(self,
                          train_ids,
                          train_labels,
                          batch_size=4,
                          val_ids=None,
                          val_labels=None):
        # encode each sample's list of label indices as a multi-hot vector,
        # matching the sigmoid + binary cross-entropy setup below:

        def one_hot_idxs(idxs, n_classes):
            res = np.zeros(n_classes)
            res[idxs] = 1.
            return res

        onehot_train_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in train_labels])
        onehot_val_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in val_labels])

        train_ds = TextDataset(train_ids, onehot_train_labels)
        val_ds = TextDataset(val_ids, onehot_val_labels)

        train_sampler = SortishSampler(train_ids,
                                       key=lambda x: len(train_ids[x]),
                                       bs=batch_size)
        val_sampler = SortSampler(val_ids, key=lambda x: len(val_ids[x]))

        train_dl = DataLoader(train_ds,
                              batch_size,
                              num_workers=1,
                              transpose=True,
                              pad_idx=1,
                              sampler=train_sampler)
        val_dl = DataLoader(val_ds,
                            batch_size,
                            num_workers=1,
                            transpose=True,
                            pad_idx=1,
                            sampler=val_sampler)

        md = ModelData("tmp", train_dl, val_dl)

        m = get_rnn_classifier(
            self._bptt,
            20 * 70,
            self._n_classes,
            self._vocab.size,
            emb_sz=self._embedding_size,
            n_hid=self._n_hidden_activations,
            n_layers=self._n_layers,
            pad_token=1,
            layers=[self._embedding_size * 3, 128, self._n_classes],
            drops=[self._dropouts_classifier[4], 0.1],
            dropouti=self._dropouts_classifier[0],
            wdrop=self._dropouts_classifier[1],
            dropoute=self._dropouts_classifier[2],
            dropouth=self._dropouts_classifier[3])

        self._classifier_model = RNN_Learner(md,
                                             TextModel(to_gpu(m)),
                                             opt_fn=self.OPT_FN)
        self._classifier_model.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self._classifier_model.clip = 25.  # or 0.3 ?!

        def binary_ce_wrapper(predicted, gt):
            out = F.sigmoid(predicted)
            return binary_cross_entropy(out, gt)

        self._classifier_model.crit = binary_ce_wrapper
        jaccard_0_5 = partial(self.func_metric, func=jaccard_index)
        jaccard_0_5.__name__ = "jaccard_0_5"
        precision_0_5 = partial(self.func_metric, func=precision)
        precision_0_5.__name__ = "precision_0_5"
        recall_0_5 = partial(self.func_metric, func=recall)
        recall_0_5.__name__ = "recall_0_5"
        f1_0_5 = partial(self.func_metric, func=f1)
        f1_0_5.__name__ = "f1_0_5"

        self._classifier_model.metrics = [
            jaccard_0_5, precision_0_5, recall_0_5, f1_0_5
        ]

        lr = 3e-3
        lrm = 2.6
        lrs = np.array(
            [lr / (lrm**4), lr / (lrm**3), lr / (lrm**2), lr / lrm, lr])

        self._classifier_model.load_encoder('enc_weights')

        self._classifier_model.freeze_to(-1)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.freeze_to(-2)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.unfreeze()
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=24,
            use_clr=(32, 10),
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._classifier_model.save('classifier_weights')
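
Two small worked examples for the snippet above, just spelling out what the code computes:

# one_hot_idxs turns a list of label indices into a multi-hot target vector:
#   one_hot_idxs([0, 2], 4)  ->  array([1., 0., 1., 0.])

# The discriminative learning rates with lr = 3e-3 and lrm = 2.6 work out to roughly:
#   lrs ~ [6.6e-05, 1.7e-04, 4.4e-04, 1.2e-03, 3.0e-03]
# i.e. each earlier layer group trains about 2.6x slower than the one after it.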
Example #4
class RNNGenreClassifier(GenreClassifier):
    THRESH = 0.5
    OPT_FN = partial(
        optim.Adam, betas=(0.75, 0.99)
    )  # Adam's default betas don't work well for NLP, so we override them here

    def __init__(self,
                 embedding_size=250,
                 n_hidden_activations=640,
                 n_layers=3,
                 drop_mul_lm=0.8,
                 drop_mul_classifier=0.6,
                 bptt=70,
                 wd=1e-7,
                 n_classes=31,
                 vocab=None,
                 batch_size=128):
        super(RNNGenreClassifier, self).__init__(n_classes)

        self._vocab = vocab
        self._dropouts_lm = np.array([0.25, 0.1, 0.2, 0.02, 0.15
                                      ]) * drop_mul_lm
        self._dropouts_classifier = np.array([0.4, 0.5, 0.05, 0.3, 0.4
                                              ]) * drop_mul_classifier

        self._embedding_size = embedding_size
        self._n_hidden_activations = n_hidden_activations
        self._n_layers = n_layers
        self._bptt = bptt
        self._wd = wd
        self._batch_size = batch_size

    @staticmethod
    def func_metric(preds, gts, thresh=0.5, func=jaccard_index):
        if len(preds) != len(gts):
            raise RuntimeError(
                "predicted and gt lists must have same size! predicted = %d, gt = %d"
                % (len(preds), len(gts)))

        def _process(p, g):
            g = np.where(g > thresh)[0].tolist()
            p = np.where(sigmoid(p) > thresh)[0].tolist()
            return func(p, g)

        return np.array([_process(p, g) for p, g in zip(preds, gts)]).mean()

    def _train_lm(self, train_ids, batch_size=4, val_ids=None):
        train_dataloader = LanguageModelLoader(np.concatenate(train_ids),
                                               batch_size, self._bptt)
        val_dataloader = LanguageModelLoader(np.concatenate(val_ids),
                                             batch_size, self._bptt)

        md = LanguageModelData("tmp",
                               1,
                               self._vocab.size,
                               train_dataloader,
                               val_dataloader,
                               bs=batch_size,
                               bptt=self._bptt)

        self._language_model = md.get_model(self.OPT_FN,
                                            self._embedding_size,
                                            self._n_hidden_activations,
                                            self._n_layers,
                                            dropouti=self._dropouts_lm[0],
                                            dropout=self._dropouts_lm[1],
                                            wdrop=self._dropouts_lm[2],
                                            dropoute=self._dropouts_lm[3],
                                            dropouth=self._dropouts_lm[4])

        self._language_model.metrics = [accuracy]
        self._language_model.unfreeze()

        lr = 1e-3
        self._language_model.lr_find(start_lr=lr / 10,
                                     end_lr=lr * 50,
                                     linear=True)
        self._language_model.fit(
            lr / 2,
            1,
            wds=self._wd,
            use_clr=(32, 2),
            cycle_len=1,
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._language_model.lr_find(start_lr=lr / 10,
                                     end_lr=lr * 10,
                                     linear=True)

        self._language_model.fit(
            lr,
            1,
            wds=self._wd,
            use_clr=(32, 2),
            cycle_len=20,
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._language_model.save_encoder("enc_weights")

    def _train_classifier(self,
                          train_ids,
                          train_labels,
                          batch_size=4,
                          val_ids=None,
                          val_labels=None):
        # encode each sample's list of label indices as a multi-hot vector,
        # matching the sigmoid + binary cross-entropy setup below:

        def one_hot_idxs(idxs, n_classes):
            res = np.zeros(n_classes)
            res[idxs] = 1.
            return res

        onehot_train_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in train_labels])
        onehot_val_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in val_labels])

        train_ds = TextDataset(train_ids, onehot_train_labels)
        val_ds = TextDataset(val_ids, onehot_val_labels)

        train_sampler = SortishSampler(train_ids,
                                       key=lambda x: len(train_ids[x]),
                                       bs=batch_size)
        val_sampler = SortSampler(val_ids, key=lambda x: len(val_ids[x]))

        train_dl = DataLoader(train_ds,
                              batch_size,
                              num_workers=1,
                              transpose=True,
                              pad_idx=1,
                              sampler=train_sampler)
        val_dl = DataLoader(val_ds,
                            batch_size,
                            num_workers=1,
                            transpose=True,
                            pad_idx=1,
                            sampler=val_sampler)

        md = ModelData("tmp", train_dl, val_dl)

        m = get_rnn_classifier(
            self._bptt,
            20 * 70,
            self._n_classes,
            self._vocab.size,
            emb_sz=self._embedding_size,
            n_hid=self._n_hidden_activations,
            n_layers=self._n_layers,
            pad_token=1,
            layers=[self._embedding_size * 3, 128, self._n_classes],
            drops=[self._dropouts_classifier[4], 0.1],
            dropouti=self._dropouts_classifier[0],
            wdrop=self._dropouts_classifier[1],
            dropoute=self._dropouts_classifier[2],
            dropouth=self._dropouts_classifier[3])

        self._classifier_model = RNN_Learner(md,
                                             TextModel(to_gpu(m)),
                                             opt_fn=self.OPT_FN)
        self._classifier_model.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self._classifier_model.clip = 25.  # or 0.3 ?!

        def binary_ce_wrapper(predicted, gt):
            out = F.sigmoid(predicted)
            return binary_cross_entropy(out, gt)

        self._classifier_model.crit = binary_ce_wrapper
        jaccard_0_5 = partial(self.func_metric, func=jaccard_index)
        jaccard_0_5.__name__ = "jaccard_0_5"
        precision_0_5 = partial(self.func_metric, func=precision)
        precision_0_5.__name__ = "precision_0_5"
        recall_0_5 = partial(self.func_metric, func=recall)
        recall_0_5.__name__ = "recall_0_5"
        f1_0_5 = partial(self.func_metric, func=f1)
        f1_0_5.__name__ = "f1_0_5"

        self._classifier_model.metrics = [
            jaccard_0_5, precision_0_5, recall_0_5, f1_0_5
        ]

        lr = 3e-3
        lrm = 2.6
        lrs = np.array(
            [lr / (lrm**4), lr / (lrm**3), lr / (lrm**2), lr / lrm, lr])

        self._classifier_model.load_encoder('enc_weights')

        self._classifier_model.freeze_to(-1)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.freeze_to(-2)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.unfreeze()
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=24,
            use_clr=(32, 10),
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._classifier_model.save('classifier_weights')

    def train(self, train_data, train_labels, val_data=None, val_labels=None):
        train_ids = self._vocab.numericalize(train_data)
        val_ids = self._vocab.numericalize(val_data)
        self._train_lm(train_ids, batch_size=self._batch_size, val_ids=val_ids)

        self._train_classifier(train_ids,
                               train_labels,
                               batch_size=self._batch_size,
                               val_ids=val_ids,
                               val_labels=val_labels)

    def predict_lm(self, tokens):
        ids = self._vocab.numericalize(tokens)
        # return the language model's prediction for the numericalized tokens
        return self._language_model.predict_array(ids)

    def predict(self, summaries_tokens):
        summaries_ids = self._vocab.numericalize(summaries_tokens)
        pp = []

        for x in summaries_ids:
            x = np.array(x)
            x = np.expand_dims(x, 1)
            res = self._classifier_model.predict_array(x)[0]
            p = np.apply_along_axis(np_sigmoid, 0, res)
            p = np.where(p > self.THRESH)[0].tolist()
            pp.append(p)
        return pp
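
The helpers sigmoid, np_sigmoid and numpy_softmax used in these examples are not defined in the snippets; a minimal NumPy sketch of what they presumably do (an assumption, not the original implementations):

def np_sigmoid(x):
    # element-wise logistic sigmoid on a NumPy array
    return 1.0 / (1.0 + np.exp(-x))

sigmoid = np_sigmoid  # assumed to be the same function under another name

def numpy_softmax(x, axis=-1):
    # numerically stable softmax over the given axis
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)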
# NOTE: the snippet below is truncated in the source; it opens mid-way through a
# call (apparently the tail of get_rnn_classifier(...)) whose result is the `model`
# used further down.
                           3,
                           n_toks,
                           emb_sz=EMB_DIM,
                           n_hid=500,
                           n_layers=3,
                           pad_token=2,
                           layers=[EMB_DIM * 3, 50, 3],
                           drops=[dps[4], 0.1],
                           dropouti=dps[0],
                           wdrop=dps[1],
                           dropoute=dps[2],
                           dropouth=dps[3])

# In[42]:

learn = RNN_Learner(model_data, TextModel(to_gpu(model)), opt_fn=opt_fn)
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learn.clip = 25.
learn.metrics = [accuracy]
learn.load_encoder('lm1_enc')

# In[43]:

learn.freeze_to(-1)
learn.lr_find(lrs / 1000)
learn.sched.plot()

# In[44]:

lr = 2e-4
lrm = 2.6