def train(self, libact_dataset, indexes=None):
    """Fit the underlying sklearn-style model on the dataset.

    Args:
        libact_dataset: libact ``Dataset`` whose ``data`` entries are
            ``(sentence, label)`` pairs.
        indexes: optional iterable of positions; when given, training is
            restricted to those entries only.

    Returns:
        The result of ``self.model.fit`` (typically the fitted estimator).
    """
    if indexes is not None:
        chosen = [libact_dataset.data[i] for i in indexes]
        libact_dataset = Dataset([pair[0] for pair in chosen],
                                 [pair[1] for pair in chosen])
    sentences, labels = libact_dataset.format_sklearn()
    # Turn raw sentences into the feature representation the model expects.
    features = [sentence_features(sentence) for sentence in sentences]
    return self.model.fit(features, labels)
def train(self, libact_dataset, indexes=None):
    """Retrain the flair tagger on the (optionally subset) dataset.

    Args:
        libact_dataset: libact ``Dataset`` of ``(tokens, tags)`` pairs.
        indexes: optional iterable of positions; honored only when
            ``self._index_subset`` is truthy.

    Returns:
        The result of ``ModelTrainerFlair.train``.
    """
    if self._index_subset and indexes is not None:
        subset = [libact_dataset.data[i] for i in indexes]
        libact_dataset = Dataset([entry[0] for entry in subset],
                                 [entry[1] for entry in subset])
    X, y = libact_dataset.format_sklearn()
    sentences = make_flair_sentences(X, y, self._tagger.tag_type)
    corpus = train_dev_split(sentences)
    # Reset the tagger to its recorded initial weights, if any, so every
    # retraining run starts from the same state.
    if self._start_state:
        self._tagger.load_state_dict(self._start_state)
    # Either keep every trained model in a fresh subfolder, or overwrite
    # the single base path in place.
    if self._save_all_models:
        out_path = next_available_subfolder(self._base_path)
    else:
        out_path = self._base_path
    trainer = ModelTrainerFlair(self._tagger, corpus)
    return trainer.train(base_path=out_path, **self._train_args)
def train(self, libact_dataset, new_indexes=None):
    """Train (or incrementally update) the torch tagger.

    Between full retrains (every ``self._iter_retrain`` iterations), fit
    only on the newly labeled examples for a single epoch; otherwise
    retrain on the whole dataset for ``self._retrain_epochs`` epochs.

    Bug fixed: ``n_epochs`` was computed but ignored —
    ``self._trainer.train(self._retrain_epochs)`` was always called, so
    the one-epoch incremental path still ran the full number of epochs.
    Also removed the unused local ``collate_fn`` lambda.

    Args:
        libact_dataset: libact ``Dataset`` of ``(tokens, tags)`` pairs.
        new_indexes: positions of the examples labeled since the last call.
    """
    torch.cuda.empty_cache()

    # Incremental step: restrict to the new examples and plan one epoch.
    if (new_indexes is not None) and (self._iter % self._iter_retrain) != 0:
        libact_dataset = Dataset(
            [libact_dataset.data[i][0] for i in new_indexes],
            [libact_dataset.data[i][1] for i in new_indexes])
        n_epochs = 1
    else:
        n_epochs = self._retrain_epochs

    X, y = libact_dataset.format_sklearn()
    if self._string_input:
        # Raw strings: derive BIO tags first, then tokenize on spaces.
        y = convert_y_to_bio_format(X, y)
        X = [s.split(' ') for s in X]

    if self._valid_ratio > 0.:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=self._valid_ratio)
        valid_data = list(zip(X_valid, y_valid))
    else:
        X_train, y_train = X, y
        valid_data = None
    train_data = list(zip(X_train, y_train))

    # Rebuild the model from scratch when required (first call or the
    # train-from-scratch policy is on).
    if (self._model is None) or self._train_from_scratch:
        self._model = self._model_ctor()

    self._trainer = self._trainer_ctor(self._model, len(X_train),
                                       train_data, valid_data)
    gc.collect()
    torch.cuda.empty_cache()
    # Use the planned epoch count: 1 for incremental updates,
    # self._retrain_epochs for full retrains.
    self._trainer.train(n_epochs)
    self._iter += 1
def train(self, libact_dataset, new_indexes=None):
    """Retrain the AllenNLP tagger on the dataset's labeled entries.

    Optionally autofills labels of duplicate objects, upsamples positive
    (non-all-'O') sentences, and adds self-training examples predicted by
    the current model, then builds AllenNLP data loaders and trains.

    Args:
        libact_dataset: libact ``Dataset``; entries are ``(object, label)``
            pairs, label ``None`` meaning unlabeled.
        new_indexes: positions of examples labeled since the last call.
    """
    # print('New indexes', new_indexes)
    # Propagate the label of each newly annotated example to identical,
    # still-unlabeled objects so exact duplicates need not be queried.
    if new_indexes is not None and self._autofill_similar_objects:
        n_updated = 0
        for new_ind in new_indexes:
            new_example = libact_dataset.data[new_ind]
            for i in range(len(libact_dataset.data)):
                if libact_dataset.data[i][1] is not None:
                    continue
                else:
                    train_object = libact_dataset.data[i][0]
                    if train_object == new_example[0]:
                        libact_dataset.data[i] = (train_object,
                                                  new_example[1])
                        n_updated += 1
        print('Number of updated examples', n_updated)
    gc.collect()
    torch.cuda.empty_cache()
    # NOTE(review): collate_fn is never used below — the loaders use
    # allennlp_collate instead; looks like dead code, confirm and remove.
    collate_fn = lambda inpt: tuple(zip(*inpt))
    # Between full retrains, restrict to the newly labeled examples and
    # plan a single epoch; otherwise plan a full retrain.
    if (new_indexes is not None) and (self._iter % self._iter_retrain) != 0:
        libact_dataset = Dataset(
            [libact_dataset.data[i][0] for i in new_indexes],
            [libact_dataset.data[i][1] for i in new_indexes])
        n_epochs = 1
    else:
        n_epochs = self._retrain_epochs
    # NOTE(review): n_epochs is never passed on — self._trainer.train()
    # is called with no arguments at the bottom; confirm intent.
    if libact_dataset.get_labeled_entries():
        X, y = libact_dataset.format_sklearn()
        X = X.tolist()
        y = y.tolist()
    else:
        X = []
        y = []
    # Mix in any externally supplied extra training examples.
    X += self._additional_X
    y += self._additional_y
    if self._string_input:
        X, y = convert_to_bio_format(X, y)
    # Nothing to train on.
    if not X:
        return
    if self._valid_ratio > 0.:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=self._valid_ratio)
        valid_data = list(zip(X_valid, y_valid))
    else:
        X_train, y_train = X, y
        # NOTE(review): with valid_data = None the calls below
        # (from_list_to_dataset(valid_data), len(valid_data)) will raise —
        # this method appears to assume _valid_ratio > 0; confirm.
        valid_data = None
    train_data = list(zip(X_train, y_train))
    # Upsample sentences containing at least one non-'O' tag.
    if self._n_upsample_positive:
        n_upsample = self._n_upsample_positive
        positive_examples = [(x, py) for x, py in zip(X_train, y_train)
                             if not all((tag == 'O' for tag in py))]
        # A float is treated as a target fraction of positives; convert it
        # into an absolute number of extra samples (never negative).
        if type(n_upsample) is float:
            n_upsample = int(
                math.ceil(
                    max(
                        0,
                        n_upsample -
                        (len(positive_examples) / len(X_train))) *
                    len(X_train)))
        if n_upsample > 0:
            upsampled_examples = random.choices(positive_examples,
                                                k=n_upsample)
            train_data += upsampled_examples
    # Self-training: predict on a random sample of unlabeled entries and
    # keep sentences where the model found something other than 'O'.
    if self._self_training_samples and self._model is not None:
        unlabeled = libact_dataset.get_unlabeled_entries()
        unlabeled = random.sample(
            unlabeled, min(self._self_training_samples, len(unlabeled)))
        X = [e[1] for e in unlabeled]
        if self._string_input:
            X = [sent.split(' ') for sent in X]
        pred_y = self._model.predict(X)[0]
        self_training_examples = [(x, py) for x, py in zip(X, pred_y)
                                  if not all((tag == 'O' for tag in py))]
        train_data += self_training_examples
    # Build AllenNLP datasets, index them with the vocab, and wrap in
    # batched data loaders.
    self.train_data_for_allenlp = self.reader.from_list_to_dataset(
        train_data)
    self.val_data_for_allenlp = self.reader.from_list_to_dataset(
        valid_data)
    self.train_data_for_allenlp.index_with(self.vocab)
    self.val_data_for_allenlp.index_with(self.vocab)
    self.train_data_loader = DataLoader(
        dataset=self.train_data_for_allenlp,
        batch_size=self._batch_size,
        collate_fn=allennlp_collate)
    self.val_data_loader = DataLoader(dataset=self.val_data_for_allenlp,
                                      batch_size=self._batch_size,
                                      collate_fn=allennlp_collate)
    print('Number of all valid examples: ', len(valid_data))
    print('Number of all training examples: ', len(train_data))
    # Rebuild the model when required (first call or train-from-scratch).
    if (self._model is None) or self._train_from_scratch:
        self._model = self._model_ctor()
    self._trainer = self._trainer_ctor(self._model, len(X_train),
                                       self.train_data_loader,
                                       self.val_data_loader)
    gc.collect()
    torch.cuda.empty_cache()
    self._trainer.train()
    self._iter += 1