Beispiel #1
0
    def test_accuracy(self):
        """Run a self-partitioned hold-out experiment scored with Accuracy.

        Builds a pool-based experiment on the classification data held by the
        test case (``test_ratio=0.3``, ``initial_label_rate=0.05``,
        ``MaxIteration(value=10)``), times ``evaluate`` and plots the
        learning curves of the result.
        """
        metrics = [Accuracy()]
        oracle = SimulatedOracle(labels=self.__y_class)
        stop_rule = MaxIteration(value=10)

        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X_class,
            Y=self.__y_class,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self.__ml_technique_class,
            performance_metrics=metrics,
            query_strategy=self.__query_strategy,
            oracle=oracle,
            stopping_criteria=stop_rule,
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True,
        )

        # Time only the evaluation itself.
        started = time.time()
        outcome = experiment.evaluate(client=self.__client, verbose=True)
        print()
        elapsed = time.time() - started
        print("---Active Learning experiment %s seconds ---" % elapsed)

        metric_names = [m.metric_name for m in metrics]
        analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=metric_names,
            method_name=self.__query_strategy.query_function_name,
            method_results=outcome,
            type="queries",
        )

        # Visual summary of the run.
        analyser.plot_learning_curves(
            title='Active Learning experiment results')
    def test_fifteen_iteration(self):
        """Run 15 iterations of QueryRegressionStd and plot the regressor.

        The experiment is built without a client and with the train/test/
        label/unlabel index splits held by the test case
        (``self_partition=False``).  Afterwards the ``ml_technique`` of the
        first result is evaluated on a 1000-point grid over [0, 20] and drawn
        together with a band of +/- one predicted standard deviation.
        """
        experiment = HoldOutExperiment(
            client=None,
            X=self.__X,
            Y=self.__y,
            scenario_type=PoolBasedSamplingScenario,
            train_idx=self.__train_idx,
            test_idx=self.__test_idx,
            label_idx=self.__label_idx,
            unlabel_idx=self.__unlabel_idx,
            ml_technique=self.__ml_technique,
            performance_metrics=[Mse(squared=True)],
            query_strategy=QueryRegressionStd(),
            oracle=SimulatedOracle(labels=self.__y),
            stopping_criteria=MaxIteration(15),
            self_partition=False
        )

        result = experiment.evaluate(verbose=False)
        regressor = result[0].ml_technique

        # Matplotlib 3.6 renamed the bundled seaborn styles to
        # 'seaborn-v0_8-*' and 3.8 removed the old names; fall back to the
        # legacy name only when the new one is not available.
        style_name = 'seaborn-v0_8-white'
        if style_name not in plt.style.available:
            style_name = 'seaborn-white'

        # Plot the regressor obtained from the experiment.
        with plt.style.context(style_name):
            plt.figure(figsize=(14, 7))
            x = np.linspace(0, 20, 1000)
            pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
            mean = pred.reshape(-1, )  # flatten so it lines up with `std`
            plt.plot(x, pred)
            plt.fill_between(x, mean - std, mean + std, alpha=0.2)
            plt.scatter(self.__X, self.__y, c='k')
            plt.title('Initial estimation')
            plt.show()
    def test_kullback_leibler_divergence(self):
        """Run a hold-out experiment driven by QueryKullbackLeiblerDivergence.

        The strategy is created with ``n_jobs=5``; the experiment partitions
        the data itself (``test_ratio=0.3``, ``initial_label_rate=0.05``) and
        uses ``MaxIteration(5)`` as stopping criterion.  The evaluation is
        timed and its learning curves are plotted.
        """
        kl_strategy = QueryKullbackLeiblerDivergence(n_jobs=5)
        oracle = SimulatedOracle(labels=self.__y)
        stop_rule = MaxIteration(5)

        # Assemble the active-learning experiment.
        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X,
            Y=self.__y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self.__ml_technique,
            performance_metrics=self.__performance_metrics,
            query_strategy=kl_strategy,
            oracle=oracle,
            stopping_criteria=stop_rule,
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True,
        )

        started = time.time()
        outcome = experiment.evaluate(client=self.__client, verbose=True)
        print()
        elapsed = time.time() - started
        print("---Active Learning experiment %s seconds ---" % elapsed)

        metric_names = [m.metric_name for m in self.__performance_metrics]
        analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=metric_names,
            method_name=kl_strategy.query_function_name,
            method_results=outcome,
            type="queries",
        )

        # Visual summary of the run.
        analyser.plot_learning_curves(
            title='Active Learning experiment results')
    def test_query_regression_std_batch_size(self):
        """Run QueryRegressionStd with the test case's batch size and plot it.

        Samples 100 distinct points from a 1000-point grid over [0, 20], uses
        ``sin(X)`` plus Gaussian noise (scale 0.3) as target, splits the data
        once with ``split`` and runs a hold-out experiment with a
        ``PercentOfUnlabel(value=70)`` stopping criterion and
        ``batch_size=self.__batch_size``.  The ``ml_technique`` of the first
        result is then plotted with a +/- one standard deviation band.
        """
        # Get the data
        X = np.random.choice(np.linspace(0, 20, 1000), size=100, replace=False).reshape(-1, 1)
        y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

        # assembling initial training set
        train_idx, test_idx, label_idx, unlabel_idx = split(
            X=X,
            y=y,
            test_ratio=0.3,
            initial_label_rate=0.05,
            split_count=1,
            all_class=True)

        # defining the kernel for the Gaussian process
        # (the enclosing parentheses already continue the expression, so no
        # backslash is needed)
        ml_technique = GaussianProcessRegressor(
            kernel=RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
            + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1)))

        experiment = HoldOutExperiment(
            client=self.__client,
            X=X,
            Y=y,
            scenario_type=PoolBasedSamplingScenario,
            train_idx=train_idx,
            test_idx=test_idx,
            label_idx=label_idx,
            unlabel_idx=unlabel_idx,
            ml_technique=ml_technique,
            performance_metrics=[Mse(squared=True)],
            query_strategy=QueryRegressionStd(),
            oracle=SimulatedOracle(labels=y),
            stopping_criteria=PercentOfUnlabel(value=70),
            self_partition=False,
            batch_size=self.__batch_size
        )

        result = experiment.evaluate(verbose=True)
        regressor = result[0].ml_technique

        # Matplotlib 3.6 renamed the bundled seaborn styles to
        # 'seaborn-v0_8-*' and 3.8 removed the old names; fall back to the
        # legacy name only when the new one is not available.
        style_name = 'seaborn-v0_8-white'
        if style_name not in plt.style.available:
            style_name = 'seaborn-white'

        # Plot the regressor obtained from the experiment.
        with plt.style.context(style_name):
            plt.figure(figsize=(14, 7))
            x = np.linspace(0, 20, 1000)
            pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
            mean = pred.reshape(-1, )  # flatten so it lines up with `std`
            plt.plot(x, pred)
            plt.fill_between(x, mean - std, mean + std, alpha=0.2)
            plt.scatter(X, y, c='k')
            plt.title('Initial estimation')
            plt.show()
Beispiel #5
0
    def test_hold_out_self_partitioning(self):
        """Check the index partitions built by a self-partitioning experiment.

        Generates a 100-sample, two-class data set, lets HoldOutExperiment
        partition it (``test_ratio=0.3``, ``initial_label_rate=0.05``) and
        asserts that each partition list holds one split, that the test and
        labelled sets have the expected sizes, and that train/test and
        labelled/unlabelled are complete covers of their parent sets.
        """
        n_splits = 1
        n_instances = 100

        features, targets = make_classification(n_samples=n_instances,
                                                n_features=4,
                                                n_informative=2,
                                                n_redundant=2,
                                                n_repeated=0,
                                                n_classes=2,
                                                n_clusters_per_class=2,
                                                weights=None,
                                                flip_y=0.01,
                                                class_sep=1.0,
                                                hypercube=True,
                                                shift=0.0,
                                                scale=1.0,
                                                shuffle=True,
                                                random_state=None)
        self.__X, self.__y = features, targets

        # Build the experiment.
        # NOTE(review): other call sites in this file pass `client` as the
        # first argument; confirm these two positionals bind to X and Y.
        experiment = HoldOutExperiment(self.__X,
                                       self.__y,
                                       self_partition=True,
                                       stopping_criteria=UnlabelSetEmpty(),
                                       test_ratio=0.3,
                                       initial_label_rate=0.05,
                                       all_class=True)

        # One entry per split in every partition list.
        for attr in ("_train_idx", "_test_idx", "_label_idx", "_unlabel_idx"):
            assert len(getattr(experiment, attr)) == n_splits

        for fold in range(n_splits):
            train_set = set(experiment._train_idx[fold])
            test_set = set(experiment._test_idx[fold])
            labelled = set(experiment._label_idx[fold])
            unlabelled = set(experiment._unlabel_idx[fold])

            # sizes follow the requested ratios
            assert len(test_set) == round(0.3 * n_instances)
            assert len(labelled) == round(0.05 * len(train_set))

            # train/test cover every instance; labelled/unlabelled cover train
            assert train_set | test_set == set(range(n_instances))
            assert labelled | unlabelled == train_set
Beispiel #6
0
    def test_mse(self):
        """Run a self-partitioned regression experiment scored with Mse.

        Builds a pool-based hold-out experiment on the regression data held
        by the test case (``Mse(squared=False)``, ``MaxIteration(value=20)``),
        times ``evaluate``, plots the learning curves, then evaluates again
        and plots the resulting regressor with a +/- one standard deviation
        band.
        """
        performance_metrics = [Mse(squared=False)]

        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X_reg,
            Y=self.__y_reg,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self.__ml_technique_reg,
            performance_metrics=performance_metrics,
            query_strategy=self.__query_strategy,
            oracle=SimulatedOracle(labels=self.__y_reg),
            stopping_criteria=MaxIteration(value=20),
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True
        )

        start_time = time.time()
        result = experiment.evaluate(client=self.__client, verbose=True)
        print()
        print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[metric.metric_name for metric in performance_metrics],
            method_name=self.__query_strategy.query_function_name,
            method_results=result,
            type="queries"
        )

        # get a brief description of the experiment
        query_analyser.plot_learning_curves(title='Active Learning experiment results')

        # NOTE(review): this evaluates the experiment a second time, now
        # without passing a client; confirm the re-run is intended rather
        # than reusing `result` from the first evaluation.
        result = experiment.evaluate(verbose=True)
        regressor = result[0].ml_technique

        # Matplotlib 3.6 renamed the bundled seaborn styles to
        # 'seaborn-v0_8-*' and 3.8 removed the old names; fall back to the
        # legacy name only when the new one is not available.
        style_name = 'seaborn-v0_8-white'
        if style_name not in plt.style.available:
            style_name = 'seaborn-white'

        # Plot the regressor obtained from the second evaluation.
        with plt.style.context(style_name):
            plt.figure(figsize=(14, 7))
            x = np.linspace(0, 20, 1000)
            pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
            mean = pred.reshape(-1, )  # flatten so it lines up with `std`
            plt.plot(x, pred)
            plt.fill_between(x, mean - std, mean + std, alpha=0.2)
            plt.scatter(self.__X_reg, self.__y_reg, c='k')
            plt.title('Initial estimation')
            plt.show()
Beispiel #7
0
    def test_hold_out_marginSamplingQuery_unlabelSetEmpty(self):
        """Run margin sampling with an UnlabelSetEmpty stopping criterion.

        Uses a default LogisticRegression scored with Accuracy, weighted F1
        and HammingLoss on a self-partitioned hold-out experiment, plots the
        learning curves and finally prints the predictions of the first
        result's model for ten rows picked through a seeded permutation.
        """
        learner = LogisticRegression()
        stop_rule = UnlabelSetEmpty()
        margin_strategy = QueryMarginSampling()
        metrics = [Accuracy(), F1(average='weighted'), HammingLoss()]

        # Assemble the active-learning experiment.
        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X,
            Y=self.__y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=learner,
            performance_metrics=metrics,
            query_strategy=margin_strategy,
            oracle=SimulatedOracle(labels=self.__y),
            stopping_criteria=stop_rule,
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=False,
        )

        outcome = experiment.evaluate(client=self.__client, verbose=True)

        metric_names = [m.metric_name for m in metrics]
        analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=metric_names,
            method_name=margin_strategy.query_function_name,
            method_results=outcome,
            type="queries",
        )
        # Visual summary of the run.
        analyser.plot_learning_curves(
            title='Active Learning experiment results')

        # Seeding the global NumPy generator makes the permutation, and so
        # the ten rows sent to the model, reproducible.
        np.random.seed(0)
        shuffled = np.random.permutation(len(self.__X))
        last_ten_rows = self.__X[shuffled[-10:]]
        fitted_model = outcome[0].ml_technique
        print(fitted_model.predict(last_ten_rows))
Beispiel #8
0
    def test_hold_out_randomQuery_unlabelSetEmpty_ConsoleHumanOracle(self):
        """Run random querying answered by a ConsoleHumanOracle.

        Despite the method name, the stopping criterion used here is
        ``MaxIteration(5)``.  The learner is ``LogisticRegression`` with the
        'sag' solver, scored with Accuracy, weighted F1 and HammingLoss; the
        evaluation is timed and its learning curves are plotted.
        """
        learner = LogisticRegression(solver='sag')
        stop_rule = MaxIteration(5)
        random_strategy = QueryInstanceRandom()
        metrics = [Accuracy(), F1(average='weighted'), HammingLoss()]
        # NOTE(review): presumably this oracle asks for labels on the
        # console, which would make the test interactive; confirm.
        oracle = ConsoleHumanOracle(labels=self.__y)

        # Assemble the active-learning experiment.
        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X,
            Y=self.__y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=learner,
            performance_metrics=metrics,
            query_strategy=random_strategy,
            oracle=oracle,
            stopping_criteria=stop_rule,
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True,
        )

        started = time.time()
        outcome = experiment.evaluate(client=self.__client, verbose=True)
        print()
        elapsed = time.time() - started
        print("---Active Learning experiment %s seconds ---" % elapsed)

        metric_names = [m.metric_name for m in metrics]
        analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=metric_names,
            method_name=random_strategy.query_function_name,
            method_results=outcome,
            type="queries",
        )

        # Visual summary of the run.
        analyser.plot_learning_curves(
            title='Active Learning experiment results')
    def execute_experiment(self, num_iters, file_name):
        """Time ``num_iters`` hold-out experiments and record each duration.

        Every round generates a fresh classification data set from the
        instance/feature/label counts held by the object, evaluates a
        self-partitioned pool-based experiment on it (``MaxIteration(25)``,
        ``batch_size=100``, ``rebalance=True``) and hands
        ``{"iter": <1-based round>, "time": <seconds>}`` to
        ``self.dump_iteration`` together with ``file_name``.
        """
        for round_number in range(1, num_iters + 1):
            # NOTE(review): `chunks` receives a float here; confirm the
            # data generator accepts a non-integer chunk size.
            features, targets = make_classification(
                n_samples=self._instance_num,
                n_features=self._feature_num,
                n_informative=2 * self._label_num,
                n_redundant=self._label_num,
                n_repeated=0,
                n_classes=self._label_num,
                n_clusters_per_class=self._label_num,
                weights=None,
                flip_y=0.01,
                class_sep=1.0,
                hypercube=True,
                shift=0.0,
                scale=1.0,
                shuffle=True,
                random_state=None,
                chunks=self._instance_num * 0.10,
            )

            experiment = HoldOutExperiment(
                self.__client,
                features,
                targets,
                scenario_type=PoolBasedSamplingScenario,
                ml_technique=self._ml_technique,
                performance_metrics=self._performance_metrics,
                query_strategy=self._query_strategy,
                oracle=SimulatedOracle(labels=targets),
                stopping_criteria=MaxIteration(25),
                self_partition=True,
                test_ratio=0.3,
                initial_label_rate=0.05,
                all_class=True,
                batch_size=100,
                rebalance=True,
            )

            # Only the evaluation is timed, not data generation or setup.
            started = time.time()
            experiment.evaluate(
                client=self.__client, multithread=False, verbose=True)
            elapsed = time.time() - started

            record = {"iter": round_number, "time": elapsed}
            self.dump_iteration(file_name, record)
Beispiel #10
0
    def test_hold_out_marginSamplingQuery_unlabelSetEmpty(self):
        """Run margin sampling on the test case's pandas data, without client.

        Despite the method name, the stopping criterion used here is
        ``MaxIteration(50)``.  The learner is ``LogisticRegression`` with the
        'liblinear' solver, scored with Accuracy, HammingLoss and
        macro-averaged F1, Precision and Recall.  The learning curves of the
        result are plotted.
        """
        ml_technique = LogisticRegression(solver='liblinear')
        stopping_criteria = MaxIteration(50)
        query_strategy = QueryMarginSampling()
        performance_metrics = [
            Accuracy(),
            F1(average='macro'),
            HammingLoss(),
            Precision(average='macro'),
            Recall(average='macro')
        ]

        # The experiment works on NumPy arrays, so the oracle is given the
        # same positional array; handing it the pandas object would make
        # integer lookups depend on the pandas index instead of row position.
        features = self.__X.to_numpy()
        labels = self.__y.to_numpy()

        # init the ALExperiment
        experiment = HoldOutExperiment(client=None,
                                       X=features,
                                       Y=labels,
                                       scenario_type=PoolBasedSamplingScenario,
                                       ml_technique=ml_technique,
                                       performance_metrics=performance_metrics,
                                       query_strategy=query_strategy,
                                       oracle=SimulatedOracle(labels=labels),
                                       stopping_criteria=stopping_criteria,
                                       self_partition=True,
                                       test_ratio=0.3,
                                       initial_label_rate=0.05,
                                       all_class=False)

        result = experiment.evaluate(verbose=False)

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[
                metric.metric_name for metric in performance_metrics
            ],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")
        # get a brief description of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')
Beispiel #11
0
    def test_ActiveLearning_HoldHout(self):
        """Compare an active-learning run with a passively trained baseline.

        Active part: a hold-out experiment with margin sampling on the
        pre-computed index splits of the test case (``MaxIteration(10)``,
        ``batch_size=50``, ``rebalance=True``), trained on the 'BAD' label
        column.  Its learning curves are plotted and the model of the first
        result is scored on the test rows of fold 0.

        Passive part: a ``LogisticRegression`` ('liblinear') fitted on all
        training rows of fold 0 and scored on the same test rows.

        Both parts use the 'BAD' column as target so that the scores refer
        to the same labels.
        """

        def report_scores(prefix, y_true, y_pred):
            # Print the same five sklearn scores for either learner.
            print(prefix + " Accuracy score : ",
                  accuracy_score(y_true, y_pred))
            print(prefix + " F1 score: ",
                  f1_score(y_true, y_pred, average='macro', zero_division=0))
            print(prefix + " Hamming Loss", hamming_loss(y_true, y_pred))
            print(prefix + " Precision score : ",
                  precision_score(y_true,
                                  y_pred,
                                  average='macro',
                                  zero_division=0))
            print(prefix + " Recall score : ",
                  recall_score(y_true,
                               y_pred,
                               average='macro',
                               zero_division=0))

        # INI the ALExperiment -----------------------------------------------------------------------------------------
        al_ml_technique = LogisticRegression(solver='sag')
        stopping_criteria = MaxIteration(10)
        query_strategy = QueryMarginSampling()
        performance_metrics = [
            Accuracy(),
            F1(average='macro'),
            HammingLoss(),
            Precision(average='macro'),
            Recall(average='macro')
        ]

        # Single source of truth for the target column used by both learners.
        labels = self.__y['BAD']

        experiment = HoldOutExperiment(
            client=self.__client,
            X=self.__X.to_numpy(),
            Y=labels.to_numpy(),
            scenario_type=PoolBasedSamplingScenario,
            train_idx=self.__train_idx,
            test_idx=self.__test_idx,
            label_idx=self.__label_idx,
            unlabel_idx=self.__unlabel_idx,
            ml_technique=al_ml_technique,
            performance_metrics=performance_metrics,
            query_strategy=query_strategy,
            oracle=SimulatedOracle(labels=labels.to_numpy()),
            stopping_criteria=stopping_criteria,
            self_partition=False,
            rebalance=True,
            batch_size=50)

        print("")
        start_time = time.time()
        result = experiment.evaluate(verbose=True)
        print("---Active Learning experiment %s seconds ---" %
              (time.time() - start_time))

        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[
                metric.metric_name for metric in performance_metrics
            ],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")
        # get a brief description of the experiment
        query_analyser.plot_learning_curves(
            title='Active Learning experiment results')

        fold_index = 0
        train_rows = self.__train_idx[fold_index]
        test_rows = self.__test_idx[fold_index]
        train_x = self.__X.iloc[train_rows, :]
        test_x = self.__X.iloc[test_rows, :]
        # Slice the 'BAD' column only: the original took every column of the
        # label frame, which does not match what the active learner was
        # trained on and hands a 2-D target to fit() and the metrics.
        train_y = labels.iloc[train_rows]
        test_y = labels.iloc[test_rows]

        active_y_pred = result[0].ml_technique.predict(test_x)
        report_scores("Active Learning", test_y, active_y_pred)

        # END the ALExperiment -----------------------------------------------------------------------------------------

        # INI the PLExperiment -----------------------------------------------------------------------------------------
        pl_ml_technique = LogisticRegression(solver='liblinear')

        print("")
        start_time = time.time()
        pl_ml_technique.fit(train_x, train_y)
        print("---Passive Learning experiment %s seconds ---" %
              (time.time() - start_time))

        passive_y_pred = pl_ml_technique.predict(test_x)
        # The "Pasive" spelling is kept so the printed output is unchanged.
        report_scores("Pasive Learning", test_y, passive_y_pred)