@classmethod
def create(cls, train_ds, valid_ds, test_ds=None, path='.', bs=64,
           equal_lengths=True, length_col=None, tfms=None, **kwargs):
    if equal_lengths:
        return super().create(train_ds, valid_ds, test_ds=test_ds, path=path,
                              bs=bs, dl_tfms=tfms, **kwargs)
    else:
        datasets = super()._init_ds(train_ds, valid_ds, test_ds)
        train_ds, valid_ds, fix_ds = datasets[:3]
        if len(datasets) == 4:
            test_ds = datasets[3]
        # precalculate lengths ahead of time if they aren't included in xtra
        train_lengths = train_ds.lengths(length_col)
        train_sampler = SortishSampler(train_ds.x,
                                       key=lambda i: train_lengths[i],
                                       bs=bs // 2)
        train_dl = DataLoader(train_ds, batch_size=bs,
                              sampler=train_sampler, **kwargs)
        valid_lengths = valid_ds.lengths(length_col)
        valid_sampler = SortSampler(valid_ds.x, key=lambda i: valid_lengths[i])
        valid_dl = DataLoader(valid_ds, batch_size=bs,
                              sampler=valid_sampler, **kwargs)
        fix_lengths = fix_ds.lengths(length_col)
        fix_sampler = SortSampler(fix_ds.x, key=lambda i: fix_lengths[i])
        fix_dl = DataLoader(fix_ds, batch_size=bs,
                            sampler=fix_sampler, **kwargs)
        dataloaders = [train_dl, valid_dl, fix_dl]
        if test_ds is not None:
            test_dl = DataLoader(test_ds, batch_size=1, **kwargs)
            dataloaders.append(test_dl)
        return cls(*dataloaders, path=path,
                   collate_fn=pad_collate1d, tfms=tfms)
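# The factory above relies on fastai's SortishSampler (train) and SortSampler
# (valid/fix) without showing what they do. As a rough mental model, here is a
# minimal sketch of the two samplers; the class names are made up and this is
# the idea, not the library's actual implementation:
import numpy as np
from torch.utils.data import Sampler


class SketchSortSampler(Sampler):
    # Deterministic: yield indices sorted by key, largest first. Suited to
    # validation, where batch composition should be stable across epochs.
    def __init__(self, data_source, key):
        self.data_source, self.key = data_source, key

    def __len__(self):
        return len(self.data_source)

    def __iter__(self):
        return iter(sorted(range(len(self.data_source)),
                           key=self.key, reverse=True))


class SketchSortishSampler(Sampler):
    # Randomized: shuffle, sort within large chunks, then shuffle batch-sized
    # blocks. Each batch holds similar-length sequences (so little padding is
    # wasted) while the batch order still varies between epochs.
    def __init__(self, data_source, key, bs):
        self.data_source, self.key, self.bs = data_source, key, bs

    def __len__(self):
        return len(self.data_source)

    def __iter__(self):
        idxs = np.random.permutation(len(self.data_source))
        sz = self.bs * 50  # "megabatch": sort locally rather than globally
        chunks = [sorted(idxs[i:i + sz], key=self.key, reverse=True)
                  for i in range(0, len(idxs), sz)]
        sort_idx = np.concatenate(chunks)
        blocks = [sort_idx[i:i + self.bs]
                  for i in range(0, len(sort_idx), self.bs)]
        np.random.shuffle(blocks)  # randomize which batch comes when
        return iter(np.concatenate(blocks).tolist())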
def test_sort_sampler_sorts_all_descending():
    bs = 4
    n = bs * 100
    data = 2 * np.arange(n)
    samp = list(SortSampler(data, lambda i: data[i]))
    # The sample is a permutation of the indices.
    assert sorted(samp) == list(range(n))
    # And that permutation puts the data in descending order.
    assert all(s1 > s2 for s1, s2 in zip(samp, samp[1:]))
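# A companion test one could add for SortishSampler, assuming the fastai
# signature used throughout this file (SortishSampler(data_source, key, bs)).
# It is a sketch that checks only the hard guarantees: the output is a
# permutation, and each batch-sized block comes out longest-first.
def test_sortish_sampler_is_permutation_with_sorted_blocks():
    bs = 4
    n = bs * 100
    data = 2 * np.arange(n)
    samp = list(SortishSampler(data, lambda i: data[i], bs=bs))
    # Still a permutation of all the indices...
    assert sorted(samp) == list(range(n))
    # ...and within every block of bs indices, keys never increase.
    for start in range(0, n, bs):
        block = samp[start:start + bs]
        assert all(data[a] >= data[b] for a, b in zip(block, block[1:]))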
def load_training_and_validation_data(self,
                                      training_data_ids_path,
                                      training_data_labels_path,
                                      validation_data_ids_path,
                                      validation_data_labels_path,
                                      classifier_data_dir,
                                      batch_size=10):
    training_token_ids = np.load(training_data_ids_path)
    validation_token_ids = np.load(validation_data_ids_path)
    training_labels = np.load(training_data_labels_path)
    validation_labels = np.load(validation_data_labels_path)

    # Flatten the label arrays and re-base them so classes start at 0.
    training_labels = training_labels.flatten()
    validation_labels = validation_labels.flatten()
    training_labels -= training_labels.min()
    validation_labels -= validation_labels.min()

    training_dataset = TextDataset(training_token_ids, training_labels)
    validation_dataset = TextDataset(validation_token_ids, validation_labels)
    training_data_sampler = SortishSampler(
        data_source=training_token_ids,
        key=lambda x: len(training_token_ids[x]),
        bs=batch_size // 2)
    validation_data_sampler = SortSampler(
        data_source=validation_token_ids,
        key=lambda x: len(validation_token_ids[x]))
    training_dataloader = DataLoader(dataset=training_dataset,
                                     batch_size=batch_size // 2,
                                     transpose=True,
                                     num_workers=1,
                                     pad_idx=1,
                                     sampler=training_data_sampler)
    validation_dataloader = DataLoader(dataset=validation_dataset,
                                       batch_size=batch_size,
                                       transpose=True,
                                       num_workers=1,
                                       pad_idx=1,
                                       sampler=validation_data_sampler)
    self.model_data = ModelData(path=classifier_data_dir,
                                trn_dl=training_dataloader,
                                val_dl=validation_dataloader)
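# The flatten-and-shift above only re-bases whatever label coding is on disk
# so that classes start at 0, as the loss function expects. A self-contained
# illustration (the stored values 1/2 are assumed, not taken from real data):
import numpy as np

labels = np.array([[1], [2], [1]]).flatten()  # e.g. labels saved as 1 and 2
labels -= labels.min()                        # re-based to 0 and 1
assert labels.tolist() == [0, 1, 0]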
@classmethod
def create(cls, train_ds, valid_ds, test_ds=None, path='.', bs=64,
           equal_lengths=True, tfms=None, **kwargs):
    if equal_lengths:
        return super().create(train_ds, valid_ds, test_ds=test_ds, path=path,
                              bs=bs, tfms=tfms, **kwargs)
    else:
        datasets = [train_ds, valid_ds]
        if test_ds is not None:
            datasets.append(test_ds)
        # Sortish (randomized) batches for training ...
        train_sampler = SortishSampler(
            datasets[0].x,
            key=lambda i: datasets[0][i][0].data.shape[0],
            bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs,
                              sampler=train_sampler, **kwargs)
        dataloaders = [train_dl]
        # ... and fully sorted batches for validation and test.
        for ds in datasets[1:]:
            sampler = SortSampler(ds.x, key=lambda i: ds[i][0].data.shape[0])
            dataloaders.append(
                DataLoader(ds, batch_size=bs, sampler=sampler, **kwargs))
        return cls(*dataloaders, path=path, collate_fn=pad_collate, tfms=tfms)
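# Both create() variants pass a pad_collate function to the DataBunch without
# defining it here. A minimal stand-in (hypothetical name and signature; the
# real fastai pad_collate also supports padding in front and a configurable
# pad index):
import torch


def pad_collate_sketch(batch, pad_idx=1):
    # Right-pad every sequence in the batch to the longest one's length,
    # assuming integer class labels as targets.
    xs, ys = zip(*batch)
    max_len = max(len(x) for x in xs)
    padded = torch.full((len(xs), max_len), pad_idx, dtype=torch.long)
    for row, x in enumerate(xs):
        padded[row, :len(x)] = torch.as_tensor(x, dtype=torch.long)
    return padded, torch.tensor(ys)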
def _train_classifier(self, train_ids, train_labels, batch_size=4,
                      val_ids=None, val_labels=None):
    # Encode each example's list of label indices as a multi-hot vector
    # (multi-label targets for the binary cross-entropy loss below):
    def one_hot_idxs(idxs, n_classes):
        res = np.zeros(n_classes)
        res[idxs] = 1.
        return res

    onehot_train_labels = np.array(
        [one_hot_idxs(l, self._n_classes) for l in train_labels])
    onehot_val_labels = np.array(
        [one_hot_idxs(l, self._n_classes) for l in val_labels])

    train_ds = TextDataset(train_ids, onehot_train_labels)
    val_ds = TextDataset(val_ids, onehot_val_labels)
    train_sampler = SortishSampler(train_ids,
                                   key=lambda x: len(train_ids[x]),
                                   bs=batch_size)
    val_sampler = SortSampler(val_ids, key=lambda x: len(val_ids[x]))
    train_dl = DataLoader(train_ds, batch_size, num_workers=1,
                          transpose=True, pad_idx=1, sampler=train_sampler)
    val_dl = DataLoader(val_ds, batch_size, num_workers=1,
                        transpose=True, pad_idx=1, sampler=val_sampler)
    md = ModelData("tmp", train_dl, val_dl)

    m = get_rnn_classifier(
        self._bptt, 20 * 70, self._n_classes, self._vocab.size,
        emb_sz=self._embedding_size,
        n_hid=self._n_hidden_activations,
        n_layers=self._n_layers,
        pad_token=1,
        layers=[self._embedding_size * 3, 128, self._n_classes],
        drops=[self._dropouts_classifier[4], 0.1],
        dropouti=self._dropouts_classifier[0],
        wdrop=self._dropouts_classifier[1],
        dropoute=self._dropouts_classifier[2],
        dropouth=self._dropouts_classifier[3])

    self._classifier_model = RNN_Learner(md, TextModel(to_gpu(m)),
                                         opt_fn=self.OPT_FN)
    self._classifier_model.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
    self._classifier_model.clip = 25.  # or 0.3 ?!

    def binary_ce_wrapper(predicted, gt):
        out = F.sigmoid(predicted)
        return binary_cross_entropy(out, gt)

    self._classifier_model.crit = binary_ce_wrapper

    jaccard_0_5 = partial(self.func_metric, func=jaccard_index)
    jaccard_0_5.__name__ = "jaccard_0_5"
    precision_0_5 = partial(self.func_metric, func=precision)
    precision_0_5.__name__ = "precision_0_5"
    recall_0_5 = partial(self.func_metric, func=recall)
    recall_0_5.__name__ = "recall_0_5"
    f1_0_5 = partial(self.func_metric, func=f1)
    f1_0_5.__name__ = "f1_0_5"
    self._classifier_model.metrics = [
        jaccard_0_5, precision_0_5, recall_0_5, f1_0_5
    ]

    # Discriminative learning rates: each earlier layer group trains with a
    # rate 2.6x smaller than the one after it.
    lr = 3e-3
    lrm = 2.6
    lrs = np.array(
        [lr / (lrm**4), lr / (lrm**3), lr / (lrm**2), lr / lrm, lr])

    # Gradual unfreezing: last layer group, then last two, then everything.
    self._classifier_model.load_encoder('enc_weights')
    self._classifier_model.freeze_to(-1)
    self._classifier_model.fit(
        lrs, 1, cycle_len=1, use_clr=(8, 3),
        callbacks=[LoggingCallback(save_path="./tmp/log")])
    self._classifier_model.freeze_to(-2)
    self._classifier_model.fit(
        lrs, 1, cycle_len=1, use_clr=(8, 3),
        callbacks=[LoggingCallback(save_path="./tmp/log")])
    self._classifier_model.unfreeze()
    self._classifier_model.fit(
        lrs, 1, cycle_len=24, use_clr=(32, 10),
        callbacks=[LoggingCallback(save_path="./tmp/log")])
    self._classifier_model.save('classifier_weights')
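# For concreteness, the multi-hot encoding used above maps a list of label
# indices to a fixed-size binary vector, which is exactly the target shape
# binary_cross_entropy expects. The helper is repeated here so the example
# runs on its own:
import numpy as np


def one_hot_idxs(idxs, n_classes):
    res = np.zeros(n_classes)
    res[idxs] = 1.
    return res


assert one_hot_idxs([0, 3], 5).tolist() == [1., 0., 0., 1., 0.]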
# In[18]:

df_train.label.value_counts()

# In[19]:

bs = 64
trn_ds = TextDataset(tokens_train, df_train.label.values)
val_ds = TextDataset(tokens_val, df_val.label.values)
trn_samp = SortishSampler(tokens_train,
                          key=lambda x: len(tokens_train[x]), bs=bs // 2)
val_samp = SortSampler(tokens_val, key=lambda x: len(tokens_val[x]))
trn_dl = DataLoader(trn_ds, bs // 2, transpose=False, num_workers=1,
                    pad_idx=2, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=False, num_workers=1,
                    pad_idx=2, sampler=val_samp)
model_data = ModelData(path, trn_dl, val_dl)

# In[20]:

model = get_transformer_classifier(
    n_tok=n_toks,
    emb_sz=EMB_DIM,
    n_head=12,
    n_layer=3,
    n_ctx=200,
    max_seq_len=100,