Esempio n. 1
0
# Number of samples to query.
quota = 20


# EXECUTE FROM HERE FOR ITERATIONS

# Query strategy built on the training dataset, with a logistic-regression
# binary-relevance model and an SVM binary-relevance model.
qs1 = MultilabelWithAuxiliaryLearner(
    trn_ds,
    BinaryRelevance(LogisticRegression()),
    BinaryRelevance(SVM()),
    criterion='hlr',
)

run(data_CV_train, trn_ds, qs1, quota)

# Fit the model on the training dataset, then predict the labeled test
# entries.
model.train(trn_ds)

X, y = zip(*tst_ds.get_labeled_entries())

pred = model.predict(X)

# One column per predicted label position (0, 1, 2).
output = pd.DataFrame()
output['UE_pred'] = [row[0] for row in pred]
output['BR_pred'] = [row[1] for row in pred]
output['FR_pred'] = [row[2] for row in pred]

# Ground-truth rows for the test split, re-indexed from 0 so they line up
# with the prediction rows in the column-wise concat below.
true = Y.iloc[test_index].reset_index(drop=True)

output['reviewText'] = np.array(
    data_CV_concat.iloc[test_index]['reviewText'])

output = pd.concat([output, true], axis=1)

output.to_csv('output_test.csv')
Esempio n. 2
0
    def train(self, libact_dataset, new_indexes=None):
        """Train the wrapped model on the labeled part of ``libact_dataset``.

        Steps, in order:

        1. If ``new_indexes`` is given and ``self._autofill_similar_objects``
           is set, copy each newly assigned label onto every still-unlabeled
           entry whose object compares equal to the newly labeled one.
        2. On iterations where ``self._iter % self._iter_retrain != 0`` (and
           ``new_indexes`` is given) restrict the dataset to the entries at
           ``new_indexes``.
        3. Collect the labeled examples, append ``self._additional_X`` /
           ``self._additional_y`` and, for string input, convert them with
           ``convert_to_bio_format``.
        4. Optionally hold out a validation split, upsample examples that
           contain a non-``'O'`` tag and add self-training examples predicted
           by the current model.
        5. Build the datasets and data loaders, create the model/trainer when
           needed, and call ``self._trainer.train()``.

        Args:
            libact_dataset: Dataset object exposing ``data`` (indexable
                ``(object, label)`` pairs where a ``None`` label marks an
                unlabeled entry), ``get_labeled_entries()``,
                ``get_unlabeled_entries()`` and ``format_sklearn()``.
            new_indexes: Optional positions in ``libact_dataset.data`` of the
                newly labeled entries. It is iterated more than once, so it
                must be a re-iterable collection rather than a one-shot
                iterator.

        Returns:
            None. When there is nothing to train on the method returns early,
            before any training happens and without incrementing
            ``self._iter``.
        """
        if new_indexes is not None and self._autofill_similar_objects:
            n_updated = 0
            for new_ind in new_indexes:
                new_example = libact_dataset.data[new_ind]

                for i, entry in enumerate(libact_dataset.data):
                    # Entries that already carry a label are left alone.
                    if entry[1] is not None:
                        continue
                    if entry[0] == new_example[0]:
                        libact_dataset.data[i] = (entry[0], new_example[1])
                        n_updated += 1

            print('Number of updated examples', n_updated)

        # Release memory held by previous iterations before building new data.
        gc.collect()
        torch.cuda.empty_cache()

        # Between full retrains, train only on the newly labeled entries.
        # NOTE(review): the original code also computed an epoch count here
        # (1 for this incremental case, self._retrain_epochs otherwise) but
        # never passed it to anything, so it had no effect and was dropped.
        if (new_indexes is not None
                and (self._iter % self._iter_retrain) != 0):
            libact_dataset = Dataset(
                [libact_dataset.data[i][0] for i in new_indexes],
                [libact_dataset.data[i][1] for i in new_indexes])

        if libact_dataset.get_labeled_entries():
            X, y = libact_dataset.format_sklearn()
            X = X.tolist()
            y = y.tolist()
        else:
            X = []
            y = []

        X += self._additional_X
        y += self._additional_y

        if self._string_input:
            X, y = convert_to_bio_format(X, y)

        if not X:
            return

        if self._valid_ratio > 0.:
            X_train, X_valid, y_train, y_valid = train_test_split(
                X, y, test_size=self._valid_ratio)
            valid_data = list(zip(X_valid, y_valid))
        else:
            # No validation split requested; None marks "no validation data".
            X_train, y_train = X, y
            valid_data = None

        train_data = list(zip(X_train, y_train))

        if self._n_upsample_positive:
            n_upsample = self._n_upsample_positive

            # "Positive" means the tag sequence has at least one non-'O' tag.
            positive_examples = [(x, py) for x, py in zip(X_train, y_train)
                                 if any(tag != 'O' for tag in py)]

            if isinstance(n_upsample, float):
                # A float is a target share of positive examples: convert the
                # missing share (never negative) into a number of examples.
                missing_share = max(
                    0, n_upsample - (len(positive_examples) / len(X_train)))
                n_upsample = int(math.ceil(missing_share * len(X_train)))

            # random.choices raises IndexError on an empty population, so
            # skip upsampling when there is nothing to sample from.
            if n_upsample > 0 and positive_examples:
                train_data += random.choices(positive_examples, k=n_upsample)

        if self._self_training_samples and self._model is not None:
            unlabeled = libact_dataset.get_unlabeled_entries()
            unlabeled = random.sample(
                unlabeled, min(self._self_training_samples, len(unlabeled)))

            # Nothing unlabeled means nothing to predict; skip the model call.
            if unlabeled:
                # Presumably each entry is an (index, object) pair, so the
                # object is at position 1 -- verify against the dataset class.
                unlabeled_X = [e[1] for e in unlabeled]

                if self._string_input:
                    unlabeled_X = [sent.split(' ') for sent in unlabeled_X]

                pred_y = self._model.predict(unlabeled_X)[0]
                # Keep only predictions that contain at least one non-'O' tag.
                train_data += [(x, py) for x, py in zip(unlabeled_X, pred_y)
                               if any(tag != 'O' for tag in py)]

        self.train_data_for_allenlp = self.reader.from_list_to_dataset(
            train_data)
        self.train_data_for_allenlp.index_with(self.vocab)
        self.train_data_loader = DataLoader(
            dataset=self.train_data_for_allenlp,
            batch_size=self._batch_size,
            collate_fn=allennlp_collate)

        if valid_data is not None:
            self.val_data_for_allenlp = self.reader.from_list_to_dataset(
                valid_data)
            self.val_data_for_allenlp.index_with(self.vocab)
            self.val_data_loader = DataLoader(
                dataset=self.val_data_for_allenlp,
                batch_size=self._batch_size,
                collate_fn=allennlp_collate)
            n_valid = len(valid_data)
        else:
            # Previously this path crashed on len(None). Without a split there
            # is no validation dataset or loader.
            # NOTE(review): assumes self._trainer_ctor accepts None as the
            # validation loader -- confirm.
            self.val_data_for_allenlp = None
            self.val_data_loader = None
            n_valid = 0

        print('Number of all valid examples: ', n_valid)
        print('Number of all training examples: ', len(train_data))

        if (self._model is None) or self._train_from_scratch:
            self._model = self._model_ctor()
            self._trainer = self._trainer_ctor(self._model, len(X_train),
                                               self.train_data_loader,
                                               self.val_data_loader)

            gc.collect()
            torch.cuda.empty_cache()

        # NOTE(review): when an existing trainer is reused, the loaders built
        # above are not handed to it here -- confirm that the trainer actually
        # sees the new data in that case.
        self._trainer.train()

        self._iter += 1