Example #1
0
    def create(cls, train_ds, valid_ds, test_ds=None, path='.',
               bs=64, equal_lengths=True, length_col=None, tfms=None, **kwargs):
        """Build a data bunch from the given datasets.

        With `equal_lengths` the parent `create` is used unchanged. Otherwise
        every loader gets a length-sorted sampler (shuffled-ish for training,
        fully sorted for validation/fix) and `pad_collate1d` pads each batch.
        """
        if equal_lengths:
            return super().create(train_ds, valid_ds, test_ds=test_ds, path=path,
                                  bs=bs, dl_tfms=tfms, **kwargs)

        ds_list = super()._init_ds(train_ds, valid_ds, test_ds)
        train_ds, valid_ds, fix_ds = ds_list[:3]
        # A fourth dataset (when present) replaces the raw test_ds argument.
        test_ds = ds_list[3] if len(ds_list) == 4 else test_ds

        def _sorted_dl(ds):
            # Lengths are materialized up front in case they aren't in xtra;
            # each call binds its own `lens`, so the lazy key is safe.
            lens = ds.lengths(length_col)
            smp = SortSampler(ds.x, key=lambda i: lens[i])
            return DataLoader(ds, batch_size=bs, sampler=smp, **kwargs)

        train_lens = train_ds.lengths(length_col)
        train_smp = SortishSampler(train_ds.x, key=lambda i: train_lens[i], bs=bs//2)
        train_dl = DataLoader(train_ds, batch_size=bs, sampler=train_smp, **kwargs)

        loaders = [train_dl, _sorted_dl(valid_ds), _sorted_dl(fix_ds)]
        if test_ds is not None:
            loaders.append(DataLoader(test_ds, batch_size=1, **kwargs))

        return cls(*loaders, path=path, collate_fn=pad_collate1d, tfms=tfms)
Example #2
0
def test_sort_sampler_sorts_all_descending():
    """SortSampler yields every index exactly once, ordered so the keyed
    values come out strictly descending."""
    batch = 4
    total = batch * 100
    values = 2 * np.arange(total)
    order = list(SortSampler(values, lambda idx: values[idx]))

    # It is a permutation of all indices...
    assert sorted(order) == list(range(total))
    # ...and since values grow with the index, descending values mean
    # strictly descending indices.
    assert all(left > right for left, right in zip(order, order[1:]))
Example #3
0
    def load_training_and_validation_data(self,
                                          training_data_ids_path,
                                          training_data_labels_path,
                                          validation_data_ids_path,
                                          validation_data_labels_path,
                                          classifier_data_dir,
                                          batch_size=10):
        """Load token-id and label arrays from .npy files and wire them into
        length-sorted DataLoaders stored on `self.model_data`."""
        trn_ids = np.load(training_data_ids_path)
        trn_lbls = np.load(training_data_labels_path).flatten()
        val_ids = np.load(validation_data_ids_path)
        val_lbls = np.load(validation_data_labels_path).flatten()

        # Shift labels so the smallest class id becomes zero.
        # NOTE(review): each split is shifted by its OWN minimum — if the
        # validation split happens to lack the globally smallest class, the
        # two splits end up with inconsistent label ids; confirm every class
        # occurs in both splits.
        trn_lbls = trn_lbls - trn_lbls.min()
        val_lbls = val_lbls - val_lbls.min()

        trn_dataset = TextDataset(trn_ids, trn_lbls)
        val_dataset = TextDataset(val_ids, val_lbls)

        # Length-aware sampling: roughly sorted (with shuffling) for training,
        # fully sorted for validation, so batches pad to similar lengths.
        trn_sampler = SortishSampler(
            data_source=trn_ids,
            key=lambda x: len(trn_ids[x]),
            bs=batch_size // 2)
        val_sampler = SortSampler(
            data_source=val_ids,
            key=lambda x: len(val_ids[x]))

        trn_loader = DataLoader(dataset=trn_dataset,
                                batch_size=batch_size // 2,
                                transpose=True,
                                num_workers=1,
                                pad_idx=1,
                                sampler=trn_sampler)
        val_loader = DataLoader(dataset=val_dataset,
                                batch_size=batch_size,
                                transpose=True,
                                num_workers=1,
                                pad_idx=1,
                                sampler=val_sampler)

        self.model_data = ModelData(path=classifier_data_dir,
                                    trn_dl=trn_loader,
                                    val_dl=val_loader)
Example #4
0
    def create(cls,
               train_ds,
               valid_ds,
               test_ds=None,
               path='.',
               bs=64,
               equal_lengths=True,
               tfms=None,
               **kwargs):
        """Create a data bunch from the given datasets.

        When `equal_lengths` is True this simply defers to the parent
        `create`. Otherwise each loader gets a length-sorted sampler
        (SortishSampler for training, SortSampler for validation/test) so
        batches group sequences of similar length, and `pad_collate` pads
        within each batch.

        Bug fixed: the per-dataset sampler key previously closed over the
        loop variable `ds`. The sampler calls its key lazily — at iteration
        time, after the loop has finished — so with a test set present the
        validation sampler would have measured lengths from the LAST dataset
        (classic late-binding-closure bug). `ds` is now bound per-iteration
        via a lambda default argument.
        """
        if equal_lengths:
            return super().create(train_ds,
                                  valid_ds,
                                  test_ds=test_ds,
                                  path=path,
                                  bs=bs,
                                  tfms=tfms,
                                  **kwargs)
        else:
            datasets = [train_ds, valid_ds]
            if test_ds is not None:
                datasets.append(test_ds)

            # Training lengths are looked up on datasets[0] directly, which
            # is stable — no closure-over-loop-variable issue here.
            train_sampler = SortishSampler(
                datasets[0].x,
                key=lambda i: datasets[0][i][0].data.shape[0],
                bs=bs)
            train_dl = DataLoader(datasets[0],
                                  batch_size=bs,
                                  sampler=train_sampler,
                                  **kwargs)
            dataloaders = [train_dl]
            for ds in datasets[1:]:
                # `ds=ds` freezes the current dataset into the lambda; a bare
                # closure would see only the loop's final value when the
                # sampler evaluates the key during iteration.
                sampler = SortSampler(
                    ds.x,
                    key=lambda i, ds=ds: ds[i][0].data.shape[0])
                dataloaders.append(
                    DataLoader(ds, batch_size=bs, sampler=sampler, **kwargs))
            return cls(*dataloaders,
                       path=path,
                       collate_fn=pad_collate,
                       tfms=tfms)
Example #5
0
    def _train_classifier(self,
                          train_ids,
                          train_labels,
                          batch_size=4,
                          val_ids=None,
                          val_labels=None):
        """Fine-tune the RNN text classifier on multi-label data.

        Converts index-list labels to one-hot vectors, builds length-sorted
        train/validation DataLoaders, stacks a classifier head on the saved
        encoder ('enc_weights'), trains with gradual unfreezing, and saves
        the result as 'classifier_weights'.

        NOTE(review): `val_ids`/`val_labels` default to None but are used
        unconditionally below — calling with the defaults will fail. Confirm
        callers always pass validation data.
        """
        # change from multi-label to multi-class:

        def one_hot_idxs(idxs, n_classes):
            # Dense 0/1 vector with ones at the positions listed in `idxs`.
            res = np.zeros(n_classes)
            res[idxs] = 1.
            return res

        onehot_train_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in train_labels])
        onehot_val_labels = np.array(
            [one_hot_idxs(l, self._n_classes) for l in val_labels])

        train_ds = TextDataset(train_ids, onehot_train_labels)
        val_ds = TextDataset(val_ids, onehot_val_labels)

        # Length-aware sampling: roughly sorted (with shuffling) for
        # training, fully sorted for validation, to minimize padding.
        train_sampler = SortishSampler(train_ids,
                                       key=lambda x: len(train_ids[x]),
                                       bs=batch_size)
        val_sampler = SortSampler(val_ids, key=lambda x: len(val_ids[x]))

        train_dl = DataLoader(train_ds,
                              batch_size,
                              num_workers=1,
                              transpose=True,
                              pad_idx=1,
                              sampler=train_sampler)
        val_dl = DataLoader(val_ds,
                            batch_size,
                            num_workers=1,
                            transpose=True,
                            pad_idx=1,
                            sampler=val_sampler)

        md = ModelData("tmp", train_dl, val_dl)

        # Classifier head layers: [3 * emb, 128, n_classes] — the factor 3
        # matches concat pooling (last hidden + max pool + mean pool).
        # 20 * 70: presumably the max tokens considered per document
        # (20 chunks of 70) — TODO confirm against get_rnn_classifier.
        m = get_rnn_classifier(
            self._bptt,
            20 * 70,
            self._n_classes,
            self._vocab.size,
            emb_sz=self._embedding_size,
            n_hid=self._n_hidden_activations,
            n_layers=self._n_layers,
            pad_token=1,
            layers=[self._embedding_size * 3, 128, self._n_classes],
            drops=[self._dropouts_classifier[4], 0.1],
            dropouti=self._dropouts_classifier[0],
            wdrop=self._dropouts_classifier[1],
            dropoute=self._dropouts_classifier[2],
            dropouth=self._dropouts_classifier[3])

        self._classifier_model = RNN_Learner(md,
                                             TextModel(to_gpu(m)),
                                             opt_fn=self.OPT_FN)
        self._classifier_model.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
        self._classifier_model.clip = 25.  # or 0.3 ?!

        # Multi-label loss: sigmoid per class, then binary cross-entropy.
        # NOTE(review): F.sigmoid is deprecated in newer torch versions in
        # favour of torch.sigmoid — depends on the pinned torch version.
        def binary_ce_wrapper(predicted, gt):
            out = F.sigmoid(predicted)
            return binary_cross_entropy(out, gt)

        self._classifier_model.crit = binary_ce_wrapper
        # Metrics evaluated at the (implied) 0.5 decision threshold; names
        # are set explicitly so they appear readably in training logs.
        jaccard_0_5 = partial(self.func_metric, func=jaccard_index)
        jaccard_0_5.__name__ = "jaccard_0_5"
        precision_0_5 = partial(self.func_metric, func=precision)
        precision_0_5.__name__ = "precision_0_5"
        recall_0_5 = partial(self.func_metric, func=recall)
        recall_0_5.__name__ = "recall_0_5"
        f1_0_5 = partial(self.func_metric, func=f1)
        f1_0_5.__name__ = "f1_0_5"

        self._classifier_model.metrics = [
            jaccard_0_5, precision_0_5, recall_0_5, f1_0_5
        ]

        # Discriminative learning rates: each earlier layer group gets its
        # rate divided by a further factor of lrm (= 2.6).
        lr = 3e-3
        lrm = 2.6
        lrs = np.array(
            [lr / (lrm**4), lr / (lrm**3), lr / (lrm**2), lr / lrm, lr])

        # Start from the language-model encoder saved during pretraining.
        self._classifier_model.load_encoder('enc_weights')

        # Gradual unfreezing: head only, then last two groups, then the
        # whole network for a longer final cycle.
        self._classifier_model.freeze_to(-1)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.freeze_to(-2)
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=1,
            use_clr=(8, 3),
            callbacks=[LoggingCallback(save_path="./tmp/log")])
        self._classifier_model.unfreeze()
        self._classifier_model.fit(
            lrs,
            1,
            cycle_len=24,
            use_clr=(32, 10),
            callbacks=[LoggingCallback(save_path="./tmp/log")])

        self._classifier_model.save('classifier_weights')

# In[18]:


# Inspect the class distribution of the training labels.
df_train.label.value_counts()


# In[19]:


# Build length-sorted dataloaders. SortishSampler presumably yields the
# training set in roughly length-ordered, shuffled batches while SortSampler
# fully sorts validation — both reduce padding per batch (confirm against
# the sampler implementations). pad_idx=2 must match the vocab's pad token.
bs = 64
trn_ds = TextDataset(tokens_train, df_train.label.values)
val_ds = TextDataset(tokens_val, df_val.label.values)
trn_samp = SortishSampler(tokens_train, key=lambda x: len(tokens_train[x]), bs=bs//2)
val_samp = SortSampler(tokens_val, key=lambda x: len(tokens_val[x]))
# Training runs at half the batch size (bs//2); validation uses the full bs.
trn_dl = DataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=2, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=2, sampler=val_samp)
model_data = ModelData(path, trn_dl, val_dl)


# In[20]:


model= get_transformer_classifier(
    n_tok=n_toks, 
    emb_sz=EMB_DIM, 
    n_head=12, 
    n_layer=3, 
    n_ctx=200,
    max_seq_len=100,