def test_accuracy(self):
    performance_metrics = [Accuracy()]
    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X_class,
        Y=self.__y_class,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=self.__ml_technique_class,
        performance_metrics=performance_metrics,
        query_strategy=self.__query_strategy,
        oracle=SimulatedOracle(labels=self.__y_class),
        stopping_criteria=MaxIteration(value=10),
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True)

    start_time = time.time()
    result = experiment.evaluate(client=self.__client, verbose=True)
    print()
    print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=self.__query_strategy.query_function_name,
        method_results=result,
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')
def test_fifteen_iteration(self):
    experiment = HoldOutExperiment(
        client=None,
        X=self.__X,
        Y=self.__y,
        scenario_type=PoolBasedSamplingScenario,
        train_idx=self.__train_idx,
        test_idx=self.__test_idx,
        label_idx=self.__label_idx,
        unlabel_idx=self.__unlabel_idx,
        ml_technique=self.__ml_technique,
        performance_metrics=[Mse(squared=True)],
        query_strategy=QueryRegressionStd(),
        oracle=SimulatedOracle(labels=self.__y),
        stopping_criteria=MaxIteration(15),
        self_partition=False)

    result = experiment.evaluate(verbose=False)
    regressor = result[0].ml_technique

    # plot the model's estimation after the 15 queries, with its
    # predictive uncertainty band
    with plt.style.context('seaborn-white'):
        plt.figure(figsize=(14, 7))
        x = np.linspace(0, 20, 1000)
        pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
        plt.plot(x, pred)
        plt.fill_between(x, pred.reshape(-1) - std, pred.reshape(-1) + std, alpha=0.2)
        plt.scatter(self.__X, self.__y, c='k')
        plt.title('Estimation after 15 queries')
        plt.show()
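# Note (an assumption about the regressor wired into the test above): the
# predict(..., return_std=True) call only works with estimators that expose a
# predictive standard deviation, e.g. sklearn's GaussianProcessRegressor or
# BayesianRidge; a plain estimator raises a TypeError on the return_std
# keyword.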
def test_kullback_leibler_divergence(self):
    query_strategy = QueryKullbackLeiblerDivergence(n_jobs=5)

    # init the ALExperiment
    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X,
        Y=self.__y,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=self.__ml_technique,
        performance_metrics=self.__performance_metrics,
        query_strategy=query_strategy,
        oracle=SimulatedOracle(labels=self.__y),
        stopping_criteria=MaxIteration(5),
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True)

    start_time = time.time()
    result = experiment.evaluate(client=self.__client, verbose=True)
    print()
    print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in self.__performance_metrics],
        method_name=query_strategy.query_function_name,
        method_results=result,
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')
def test_cross_validation_randomQuery_MaxIteration(self):
    ml_technique = LogisticRegression()
    # ml_technique = BernoulliNB()
    # ml_technique = svm.SVC(kernel='rbf', probability=True)
    # ml_technique = svm.NuSVC(gamma='auto', probability=True)

    stopping_criteria = MaxIteration(25)
    # stopping_criteria = PercentOfUnlabel(70)
    # stopping_criteria = TimeLimit(2)

    query_strategy = QueryInstanceRandom()

    performance_metrics = [
        Accuracy(),
        F1(average='weighted'),
        HammingLoss()
    ]

    # init the ALExperiment
    experiment = CrossValidationExperiment(
        client=self.__client,
        X=self.__X,
        Y=self.__y,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=ml_technique,
        performance_metrics=performance_metrics,
        query_strategy=query_strategy,
        oracle=SimulatedOracle(labels=self.__y),
        stopping_criteria=stopping_criteria,
        self_partition=True,
        kfolds=10,
        oracle_name='SimulatedOracle',
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True,
        rebalance=True)

    results = experiment.evaluate(verbose=True,
                                  multithread=True,
                                  max_threads=10,
                                  client=self.__client)

    for result in results:
        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[metric.metric_name for metric in performance_metrics],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")

        # get a brief description of the experiment
        query_analyser.plot_learning_curves(title='Active Learning experiment results')
def test_max_iteration(self):
    max_iteration = MaxIteration(value=10)
    assert not max_iteration.is_stop()

    max_iteration.update_information(self.__example_saver)
    example_saver_local = copy.deepcopy(self.__example_saver)
    assert max_iteration._current_iter == 0

    example_saver_local.add_state(
        StateItem(select_index=[2],
                  performance=[{"name": "accuracy_score", "value": 0.89}],
                  performance_metrics=["accuracy_score"]))
    max_iteration.update_information(example_saver_local)
    assert max_iteration._current_iter == 1
    assert not max_iteration.is_stop()

    max_iteration._current_iter = 10
    assert max_iteration.is_stop()
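# A minimal sketch (not the library's actual implementation) of the contract
# the test above exercises: the criterion tracks how many states the saver
# holds and stops once the counter reaches the configured value. The class
# name MaxIterationSketch and the use of len(saver) are assumptions for
# illustration only.
class MaxIterationSketch:
    def __init__(self, value):
        self.value = value
        self._current_iter = 0

    def update_information(self, saver):
        # mirror the observed behaviour: an empty saver leaves the
        # counter at 0; each recorded StateItem advances it by one
        self._current_iter = len(saver)

    def is_stop(self):
        # stop once the configured number of iterations has been recorded
        return self._current_iter >= self.value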
def test_mse(self):
    performance_metrics = [Mse(squared=False)]
    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X_reg,
        Y=self.__y_reg,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=self.__ml_technique_reg,
        performance_metrics=performance_metrics,
        query_strategy=self.__query_strategy,
        oracle=SimulatedOracle(labels=self.__y_reg),
        stopping_criteria=MaxIteration(value=20),
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True)

    start_time = time.time()
    result = experiment.evaluate(client=self.__client, verbose=True)
    print()
    print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=self.__query_strategy.query_function_name,
        method_results=result,
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')

    # reuse the trained model from the evaluation above instead of
    # re-running the whole experiment
    regressor = result[0].ml_technique

    # plot the model's estimation after the active learning run
    with plt.style.context('seaborn-white'):
        plt.figure(figsize=(14, 7))
        x = np.linspace(0, 20, 1000)
        pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
        plt.plot(x, pred)
        plt.fill_between(x, pred.reshape(-1) - std, pred.reshape(-1) + std, alpha=0.2)
        plt.scatter(self.__X_reg, self.__y_reg, c='k')
        plt.title('Estimation after the active learning run')
        plt.show()
def test_cross_validation_randomQuery_unlabelSetEmpty_singleThread(self):
    ml_technique = LogisticRegression(solver='liblinear')
    stopping_criteria = MaxIteration(50)
    query_strategy = QueryInstanceRandom()
    performance_metrics = [
        Accuracy(),
        F1(average='macro'),
        HammingLoss(),
        Precision(average='macro'),
        Recall(average='macro')
    ]

    # init the ALExperiment
    experiment = CrossValidationExperiment(
        self.__X,
        self.__y,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=ml_technique,
        performance_metrics=performance_metrics,
        query_strategy=query_strategy,
        oracle=SimulatedOracle(labels=self.__y),
        stopping_criteria=stopping_criteria,
        self_partition=True,
        kfolds=10,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True)

    results = experiment.evaluate(verbose=False)

    for result in results:
        query_analyser = ExperimentAnalyserFactory.experiment_analyser(
            performance_metrics=[metric.metric_name for metric in performance_metrics],
            method_name=query_strategy.query_function_name,
            method_results=result,
            type="queries")

        # get a brief description of the experiment
        query_analyser.plot_learning_curves(title='Active Learning experiment results')
def test_hold_out_randomQuery_unlabelSetEmpty_ConsoleHumanOracle(self):
    ml_technique = LogisticRegression(solver='sag')
    stopping_criteria = MaxIteration(5)
    query_strategy = QueryInstanceRandom()
    performance_metrics = [
        Accuracy(),
        F1(average='weighted'),
        HammingLoss()
    ]

    # init the ALExperiment
    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X,
        Y=self.__y,
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=ml_technique,
        performance_metrics=performance_metrics,
        query_strategy=query_strategy,
        oracle=ConsoleHumanOracle(labels=self.__y),
        stopping_criteria=stopping_criteria,
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=True)

    start_time = time.time()
    result = experiment.evaluate(client=self.__client, verbose=True)
    print()
    print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=query_strategy.query_function_name,
        method_results=result,
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')
def execute_experiment(self, num_iters, file_name):
    for i in range(num_iters):
        # the chunks argument suggests dask_ml's make_classification,
        # which expects an integer chunk size, not a float
        X, y = make_classification(
            n_samples=self._instance_num,
            n_features=self._feature_num,
            n_informative=2 * self._label_num,
            n_redundant=self._label_num,
            n_repeated=0,
            n_classes=self._label_num,
            n_clusters_per_class=self._label_num,
            weights=None,
            flip_y=0.01,
            class_sep=1.0,
            hypercube=True,
            shift=0.0,
            scale=1.0,
            shuffle=True,
            random_state=None,
            chunks=int(self._instance_num * 0.10))

        experiment = HoldOutExperiment(
            self.__client,
            X,
            y,
            scenario_type=PoolBasedSamplingScenario,
            ml_technique=self._ml_technique,
            performance_metrics=self._performance_metrics,
            query_strategy=self._query_strategy,
            oracle=SimulatedOracle(labels=y),
            stopping_criteria=MaxIteration(25),
            self_partition=True,
            test_ratio=0.3,
            initial_label_rate=0.05,
            all_class=True,
            batch_size=100,
            rebalance=True)

        start_time = time.time()
        experiment.evaluate(client=self.__client, multithread=False, verbose=True)
        end_time = time.time() - start_time

        self.dump_iteration(file_name, {"iter": i + 1, "time": end_time})
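# Hypothetical usage of the benchmark helper above (the owning class and its
# constructor are assumptions; only execute_experiment's signature comes from
# the method itself). dump_iteration is expected to persist one
# {"iter": ..., "time": ...} record per repetition:
#
#     runner.execute_experiment(num_iters=5, file_name="al_benchmark_times.json")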
def test_hold_out_marginSamplingQuery_unlabelSetEmpty(self):
    ml_technique = LogisticRegression(solver='liblinear')
    stopping_criteria = MaxIteration(50)
    query_strategy = QueryMarginSampling()
    performance_metrics = [
        Accuracy(),
        F1(average='macro'),
        HammingLoss(),
        Precision(average='macro'),
        Recall(average='macro')
    ]

    # init the ALExperiment
    experiment = HoldOutExperiment(
        client=None,
        X=self.__X.to_numpy(),
        Y=self.__y.to_numpy(),
        scenario_type=PoolBasedSamplingScenario,
        ml_technique=ml_technique,
        performance_metrics=performance_metrics,
        query_strategy=query_strategy,
        oracle=SimulatedOracle(labels=self.__y),
        stopping_criteria=stopping_criteria,
        self_partition=True,
        test_ratio=0.3,
        initial_label_rate=0.05,
        all_class=False)

    result = experiment.evaluate(verbose=False)

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=query_strategy.query_function_name,
        method_results=result,
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')
def test_ActiveLearning_HoldOut(self):
    # BEGIN the ALExperiment ----------------------------------------------
    al_ml_technique = LogisticRegression(solver='sag')
    stopping_criteria = MaxIteration(10)
    query_strategy = QueryMarginSampling()
    performance_metrics = [
        Accuracy(),
        F1(average='macro'),
        HammingLoss(),
        Precision(average='macro'),
        Recall(average='macro')
    ]

    experiment = HoldOutExperiment(
        client=self.__client,
        X=self.__X.to_numpy(),
        Y=self.__y['BAD'].to_numpy(),
        scenario_type=PoolBasedSamplingScenario,
        train_idx=self.__train_idx,
        test_idx=self.__test_idx,
        label_idx=self.__label_idx,
        unlabel_idx=self.__unlabel_idx,
        ml_technique=al_ml_technique,
        performance_metrics=performance_metrics,
        query_strategy=query_strategy,
        oracle=SimulatedOracle(labels=self.__y['BAD'].to_numpy()),
        stopping_criteria=stopping_criteria,
        self_partition=False,
        rebalance=True,
        batch_size=50)

    print("")
    start_time = time.time()
    result = experiment.evaluate(verbose=True)
    print("---Active Learning experiment %s seconds ---" % (time.time() - start_time))

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=query_strategy.query_function_name,
        method_results=result,
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')

    foldIndex = 0
    train_x = self.__X.iloc[self.__train_idx[foldIndex], :]
    train_y = self.__y.iloc[self.__train_idx[foldIndex], :]
    test_x = self.__X.iloc[self.__test_idx[foldIndex], :]
    test_y = self.__y.iloc[self.__test_idx[foldIndex], :]

    active_y_pred = result[0].ml_technique.predict(test_x)
    print("Active Learning Accuracy score : ",
          accuracy_score(test_y, active_y_pred))
    print("Active Learning F1 score: ",
          f1_score(test_y, active_y_pred, average='macro', zero_division=0))
    print("Active Learning Hamming Loss: ",
          hamming_loss(test_y, active_y_pred))
    print("Active Learning Precision score : ",
          precision_score(test_y, active_y_pred, average='macro', zero_division=0))
    print("Active Learning Recall score : ",
          recall_score(test_y, active_y_pred, average='macro', zero_division=0))
    # END the ALExperiment ------------------------------------------------

    # BEGIN the PLExperiment ----------------------------------------------
    pl_ml_technique = LogisticRegression(solver='liblinear')

    print("")
    start_time = time.time()
    # ravel the single-column frame so sklearn receives a 1-D target
    pl_ml_technique.fit(train_x, train_y.values.ravel())
    print("---Passive Learning experiment %s seconds ---" % (time.time() - start_time))

    passive_y_pred = pl_ml_technique.predict(test_x)
    print("Passive Learning Accuracy score : ",
          accuracy_score(test_y, passive_y_pred))
    print("Passive Learning F1 score: ",
          f1_score(test_y, passive_y_pred, average='macro', zero_division=0))
    print("Passive Learning Hamming Loss: ",
          hamming_loss(test_y, passive_y_pred))
    print("Passive Learning Precision score : ",
          precision_score(test_y, passive_y_pred, average='macro', zero_division=0))
    print("Passive Learning Recall score : ",
          recall_score(test_y, passive_y_pred, average='macro', zero_division=0))
    # END the PLExperiment ------------------------------------------------
def test_keras_digits_recognition_active_learning(self):
    # load the data - it returns 2 tuples of digits & labels - one for
    # training and one for testing
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    batch_size = 1024
    num_classes = 10
    epochs = 3

    # input image dimensions
    img_rows, img_cols = 28, 28

    # display 14 random images from the training set
    np.random.seed(123)
    rand_14 = np.random.randint(0, x_train.shape[0], 14)
    sample_digits = x_train[rand_14]
    sample_labels = y_train[rand_14]

    num_rows, num_cols = 2, 7
    f, ax = plt.subplots(num_rows, num_cols,
                         figsize=(12, 5),
                         gridspec_kw={'wspace': 0.03, 'hspace': 0.01},
                         squeeze=True)
    for r in range(num_rows):
        for c in range(num_cols):
            image_index = r * 7 + c
            ax[r, c].axis("off")
            ax[r, c].imshow(sample_digits[image_index], cmap='gray')
            ax[r, c].set_title('No. %d' % sample_labels[image_index])
    plt.show()
    plt.close()

    if K.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    ml_technique = Sequential()
    ml_technique.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                            input_shape=input_shape))
    ml_technique.add(Conv2D(64, (3, 3), activation='relu'))
    ml_technique.add(MaxPooling2D(pool_size=(2, 2)))
    ml_technique.add(Dropout(0.25))
    ml_technique.add(Flatten())
    ml_technique.add(Dense(128, activation='relu'))
    ml_technique.add(Dropout(0.5))
    ml_technique.add(Dense(num_classes, activation='softmax'))
    ml_technique.compile(optimizer='Adam',
                         loss='categorical_crossentropy',
                         metrics=['accuracy'])

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    X = np.concatenate((x_train, x_test))
    y = np.concatenate((y_train, y_test))

    train_idx, test_idx, label_idx, unlabel_idx = split(
        X=X,
        y=y,
        test_ratio=0.3,
        initial_label_rate=0.05,
        split_count=1,
        all_class=True)

    # convert to indexed collection
    train_idx = IndexCollection(train_idx[0])
    test_idx = IndexCollection(test_idx[0])
    label_idx = IndexCollection(label_idx[0])
    unlabel_idx = IndexCollection(unlabel_idx[0])

    # Define the active learning components
    stopping_criteria = MaxIteration(10)
    query_strategy = QueryLeastConfidentSampling()
    oracle = SimulatedOracle(labels=y)

    start_time = time.time()

    experimentState = State(
        round=0,
        train_idx=train_idx,
        test_idx=test_idx,
        init_L=label_idx,
        init_U=unlabel_idx,
        performance_metrics=["loss", "accuracy"],
        verbose=True)

    while not stopping_criteria.is_stop() and len(unlabel_idx) > 0:
        label_x = X[label_idx.index, :]
        label_y = y[label_idx.index]
        test_x = X[test_idx, :]
        test_y = y[test_idx]

        # Train and evaluate the model over the labeled instances
        ml_technique.fit(label_x, label_y,
                         batch_size=batch_size,
                         epochs=epochs,
                         verbose=True,
                         validation_data=(test_x, test_y))

        # predict the results over the labeled test instances
        label_pred = ml_technique.predict_classes(test_x)

        # performance calc for all metrics
        label_perf = []
        score = ml_technique.evaluate(x_test, y_test, verbose=1)
        label_perf.append({"name": "loss", "value": score[0]})
        label_perf.append({"name": "accuracy", "value": score[1]})

        # use the query strategy for selecting the indexes
        select_ind = query_strategy.select(X=X,
                                           y=y,
                                           label_index=label_idx,
                                           unlabel_index=unlabel_idx,
                                           batch_size=batch_size,
                                           model=ml_technique,
                                           client=self.__client)

        # show label values
        oracle.query(instances=X[select_ind], indexes=select_ind)

        # update labeled and unlabeled instances
        label_idx.update(select_ind)
        unlabel_idx.difference_update(select_ind)

        # save intermediate results
        experimentState.add_state(
            StateItem(select_index=select_ind,
                      performance_metrics=[metric['name'] for metric in label_perf],
                      performance=label_perf))

        # update stopping_criteria
        stopping_criteria.update_information(experimentState)

    end_time = time.time() - start_time
    print(end_time)

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=["loss", "accuracy"],
        method_name=query_strategy.query_function_name,
        method_results=[experimentState],
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')
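# Note: Sequential.predict_classes, used in the loop above (and in the next
# test), was removed in TensorFlow 2.6. On newer versions the equivalent is
# an argmax over the softmax outputs:
#
#     label_pred = np.argmax(ml_technique.predict(test_x), axis=-1)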
def test_custom_activeLearning_keras(self):
    batch_size = 5
    epochs = 20

    # partition the data
    train_idx, test_idx, label_idx, unlabel_idx = split(
        X=self.__X,
        y=self.__y,
        test_ratio=0.3,
        initial_label_rate=0.05,
        split_count=1,
        all_class=True)

    # convert to indexed collection
    train_idx = IndexCollection(train_idx[0])
    test_idx = IndexCollection(test_idx[0])
    label_idx = IndexCollection(label_idx[0])
    unlabel_idx = IndexCollection(unlabel_idx[0])

    # Create the model
    ml_technique = Sequential()
    ml_technique.add(Dense(input_dim=30, units=30))
    ml_technique.add(Dense(input_dim=30, units=30))
    ml_technique.add(Dense(input_dim=30, units=2))
    ml_technique.add(Activation('softmax'))
    ml_technique.compile(loss='sparse_categorical_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])

    # Define the active learning components
    stopping_criteria = MaxIteration(10)
    query_strategy = QueryLeastConfidentSampling()
    performance_metrics = [
        Accuracy(),
        F1(average='weighted'),
        HammingLoss()
    ]
    oracle = SimulatedOracle(labels=self.__y)

    start_time = time.time()

    experimentState = State(
        round=0,
        train_idx=train_idx,
        test_idx=test_idx,
        init_L=label_idx,
        init_U=unlabel_idx,
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        verbose=True)

    while not stopping_criteria.is_stop() and len(unlabel_idx) > 0:
        label_x = self.__X[label_idx.index, :]
        label_y = self.__y[label_idx.index]
        test_x = self.__X[test_idx, :]
        test_y = self.__y[test_idx]

        # Train and evaluate the model over the labeled instances
        ml_technique.fit(label_x, label_y,
                         batch_size=batch_size,
                         epochs=epochs,
                         verbose=True)

        # predict the results over the labeled test instances
        label_pred = ml_technique.predict_classes(test_x)

        # performance calc for all metrics
        label_perf = []
        for metric in performance_metrics:
            value = metric.compute(y_true=test_y, y_pred=label_pred)
            label_perf.append({"name": metric.metric_name, "value": value})

        # use the query strategy for selecting the indexes
        select_ind = query_strategy.select(X=self.__X,
                                           y=self.__y,
                                           label_index=label_idx,
                                           unlabel_index=unlabel_idx,
                                           batch_size=batch_size,
                                           model=ml_technique,
                                           client=self.__client)

        # show label values
        oracle.query(instances=self.__X[select_ind], indexes=select_ind)

        # update labeled and unlabeled instances
        label_idx.update(select_ind)
        unlabel_idx.difference_update(select_ind)

        # save intermediate results
        experimentState.add_state(
            StateItem(select_index=select_ind,
                      performance_metrics=[metric['name'] for metric in label_perf],
                      performance=label_perf))

        # update stopping_criteria
        stopping_criteria.update_information(experimentState)

    end_time = time.time() - start_time
    print(end_time)

    query_analyser = ExperimentAnalyserFactory.experiment_analyser(
        performance_metrics=[metric.metric_name for metric in performance_metrics],
        method_name=query_strategy.query_function_name,
        method_results=[experimentState],
        type="queries")

    # get a brief description of the experiment
    query_analyser.plot_learning_curves(title='Active Learning experiment results')