def test_opf_accuracy():
    """Checks `opf_accuracy` on a case where every prediction is class 1."""

    # Two samples per class, while the predictions only ever output class 1
    truth = [1, 1, 2, 2]
    guesses = [1] * 4

    assert general.opf_accuracy(truth, guesses) == 0.5
def prune(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
    """Repeatedly drops irrelevant training nodes and re-fits the classifier.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.
        n_iterations (int): Maximum number of iterations.

    """

    logger.info('Pruning classifier ...')

    # One fit/predict round happens before any node is discarded
    # NOTE(review): presumably `predict` is what updates each node's
    # `relevant` flag -- confirm against the classifier implementation
    self.fit(X_train, Y_train)
    self.predict(X_val)

    # Node count of the un-pruned subgraph (denominator of the ratio below)
    initial_nodes = self.subgraph.n_nodes

    for t in range(n_iterations):
        logger.info('Running iteration %d/%d ...', t + 1, n_iterations)

        # Positions of the nodes that were not flagged as irrelevant
        kept = [
            j for j, node in enumerate(self.subgraph.nodes)
            if node.relevant != c.IRRELEVANT
        ]

        # The training set is rebuilt from the surviving samples only
        X_train = np.asarray([X_train[j, :] for j in kept])
        Y_train = np.asarray([Y_train[j] for j in kept])

        # Re-fits on the reduced set and evaluates over the validation set
        self.fit(X_train, Y_train)
        preds = self.predict(X_val)
        acc = g.opf_accuracy(Y_val, preds)

        logger.info('Current accuracy: %s.', acc)

    # Fraction of nodes removed with respect to the initial subgraph
    final_nodes = self.subgraph.n_nodes
    prune_ratio = 1 - final_nodes / initial_nodes

    logger.info('Prune ratio: %s.', prune_ratio)
def supervised_opf_feature_selection(opytimizer):
    """Objective function that scores a feature subset with a SupervisedOPF.

    Reads the module-level `X_train`, `Y_train`, `X_val` and `Y_val` arrays.

    Args:
        opytimizer (np.array): Candidate solution; its first column is cast
            to booleans and used as a feature (column) mask.

    Returns:
        One minus the OPF accuracy over the validation set.

    """

    # First column of the candidate, interpreted as on/off flags per feature
    mask = opytimizer[:, 0].astype(bool)

    # Only the flagged columns are kept on both sets
    train_subset = X_train[:, mask]
    val_subset = X_val[:, mask]

    # Trains a fresh classifier over the reduced training set
    classifier = SupervisedOPF(distance='log_squared_euclidean',
                               pre_computed_distance=None)
    classifier.fit(train_subset, Y_train)

    # The returned cost is the complement of the validation accuracy
    return 1 - g.opf_accuracy(Y_val, classifier.predict(val_subset))
def unsupervised_opf_clustering(opytimizer):
    """Objective function that scores a `max_k` value with an UnsupervisedOPF.

    Reads the module-level `X_train`, `Y_train`, `X_test` and `Y_test` arrays.

    Args:
        opytimizer (np.array): Candidate solution; entry `[0][0]` is
            truncated to an integer and used as `max_k`.

    Returns:
        One minus the OPF accuracy over the testing set.

    """

    # The first decision variable holds `max_k`; declaration order matters
    # because each variable carries its own bounds
    k_upper = int(opytimizer[0][0])

    clusterer = UnsupervisedOPF(max_k=k_upper,
                                distance='log_squared_euclidean',
                                pre_computed_distance=None)

    # Clusters the training data and then propagates the known labels,
    # so predictions are labels rather than only cluster identifiers
    clusterer.fit(X_train, Y_train)
    clusterer.propagate_labels()

    # `predict` returns a pair; only its first element is scored
    predictions, _ = clusterer.predict(X_test)

    return 1 - g.opf_accuracy(Y_test, predictions)
X_train, X_val, Y_train, Y_val = s.split(X, Y, percentage=0.5, random_state=1)

# Keeps re-training until the validation set is classified without errors.
# After each round, every misclassified validation sample is moved (features
# AND label) into the training set.
while True:
    # Creates a SupervisedOPF instance
    opf = SupervisedOPF(distance='log_squared_euclidean', pre_computed_distance=None)

    # Fits training data into the classifier
    opf.fit(X_train, Y_train)

    # Predicts new data
    preds = opf.predict(X_val)

    # Calculating accuracy
    acc = g.opf_accuracy(Y_val, preds)

    print(f'Accuracy: {acc}')

    # Flat indexes of the misclassified validation samples
    errors = np.flatnonzero(Y_val != np.asarray(preds))

    # If there are no misclassified samples, the process is over
    if errors.size == 0:
        break

    # Moves the misclassified samples into the training set; the labels must
    # be appended together with the features, otherwise the next `fit` call
    # receives arrays of different lengths
    X_train = np.vstack((X_train, X_val[errors, :]))
    Y_train = np.hstack((Y_train, Y_val[errors]))

    # Removes the moved samples from the validation set so that they are not
    # evaluated (and appended) again on the next round
    X_val = np.delete(X_val, errors, axis=0)
    Y_val = np.delete(Y_val, errors)

    # Nothing is left to validate against
    if Y_val.size == 0:
        break
def _learn(self, X_train, Y_train, I_train, X_val, Y_val, I_val):
    """Learns the best `k` value over the validation set.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        I_train (np.array): Array of training indexes.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.
        I_val (np.array): Array of validation indexes.

    Raises:
        e.BuildError: If pre-computed distances are enabled and the matrix
            is not `n_nodes x n_nodes`.

    """

    logger.info('Learning best `k` value ...')

    # Creating a subgraph
    self.subgraph = KNNSubgraph(X_train, Y_train, I_train)

    # Checks if it is supposed to use pre-computed distances
    if self.pre_computed_distance:
        n_nodes = self.subgraph.n_nodes

        # The matrix must be square and match the subgraph's amount of nodes
        if self.pre_distances.shape[0] != n_nodes or self.pre_distances.shape[1] != n_nodes:
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Defining initial maximum accuracy as 0
    max_acc = 0.0

    # Falls back to `k = 1` when no value achieves an accuracy above zero,
    # so the final assignment never reads an unbound variable
    best_k = 1

    # For every possible `k` value
    for k in range(1, self.max_k + 1):
        # Gathers current `k` as subgraph's best `k`
        self.subgraph.best_k = k

        # Calculate the arcs using the current `k` value
        self.subgraph.create_arcs(k, self.distance_fn,
                                  self.pre_computed_distance, self.pre_distances)

        # Calculate the p.d.f. using the current `k` value
        self.subgraph.calculate_pdf(k, self.distance_fn,
                                    self.pre_computed_distance, self.pre_distances)

        # Clusters the subgraph
        self._clustering()

        # Calculate the predictions over the validation set
        preds = self.predict(X_val, I_val)

        # Calculating the accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # Keeps the first `k` that strictly improves the accuracy
        if acc > max_acc:
            max_acc = acc
            best_k = k

        logger.info('Accuracy over k = %d: %s', k, acc)

        # Destroy the arcs so the next `k` starts from a clean subgraph
        self.subgraph.destroy_arcs()

    # Applying the best k to the subgraph's property
    self.subgraph.best_k = best_k
# NOTE(review): fragment of a classifier-comparison loop. It starts inside an
# `if` branch whose header is not visible here, so the code is kept verbatim.
# Visible behaviour: the OPF branch shifts its validation/test labels by +1
# and trains through `clf.learn(...)`, while any other classifier goes through
# `clf.fit(...)`. OPF is scored with `g.opf_accuracy` (and also printed next to
# `accuracy_score`); the others use `clf.score(...)`. For the decision
# boundary, the OPF mesh predictions are shifted back by -1.
y_val_opf = y_val_opf + 1 y_test_opf = y_test + 1 clf.learn(X_train_opf, y_train_opf, X_val_opf, y_val_opf, n_iterations=20) else: clf.fit(X_train, y_train) if name == "OPF": preds = clf.predict(X_test) acc = g.opf_accuracy(y_test_opf, preds) score = acc print(score) print(accuracy_score(y_test_opf, preds)) else: score = clf.score(X_test, y_test) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. if hasattr(clf, "decision_function"): Z = clf.decision_function(Xfull) elif name == "OPF": predsopf = clf.predict(Xfull) predsopf = np.asarray(predsopf) predsopf = predsopf - 1
# Extracts samples and labels out of the pre-loaded numpy array
X, Y = p.parse_loader(txt)

# First split: training versus testing sets
X_train, X_test, Y_train, Y_test = s.split(
    X, Y, percentage=0.8, random_state=1)

# Second split: the training portion is divided again; the second half is
# handed to the classifier without its labels (`Y_unlabeled` is not used below)
X_train, X_unlabeled, Y_train, Y_unlabeled = s.split(
    X_train, Y_train, percentage=0.25, random_state=1)

# Builds the semi-supervised classifier
opf = SemiSupervisedOPF(
    distance='log_squared_euclidean',
    pre_computed_distance=None)

# Trains with labeled samples plus the unlabeled features
opf.fit(X_train, Y_train, X_unlabeled)

# Classifies the testing set
preds = opf.predict(X_test)

# Reports the OPF accuracy over the testing set
acc = g.opf_accuracy(Y_test, preds)

print(f'Accuracy: {acc}')
def learn(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
    """Learns the best classifier over a validation set.

    On every iteration the classifier is fitted and evaluated, and then
    misclassified validation samples are swapped with randomly drawn
    non-prototype training samples. The arrays passed in are modified
    in place by these swaps.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.
        n_iterations (int): Number of iterations.

    """

    logger.info('Learning the best classifier ...')

    # Defines the maximum accuracy
    max_acc = 0

    # Defines the previous accuracy
    previous_acc = 0

    # Defines the iterations counter
    t = 0

    # Best snapshot found so far; initialised up-front so that a run whose
    # accuracy never rises above zero still has a classifier to restore
    best_opf = None
    best_t = 0

    while True:
        logger.info('Running iteration %d/%d ...', t + 1, n_iterations)

        # Fits training data into the classifier
        self.fit(X_train, Y_train)

        # Predicts new data
        preds = self.predict(X_val)

        # Calculating accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # The first iteration always produces a snapshot; afterwards only
        # strict improvements replace it
        if best_opf is None or acc > max_acc:
            max_acc = acc
            best_opf = copy.deepcopy(self)

            # Saves the iteration number
            best_t = t

        # Flat (scalar) indexes of the misclassified validation samples
        errors = np.flatnonzero(Y_val != np.asarray(preds))

        # Counts how many training nodes are not prototypes
        non_prototypes = sum(
            1 for n in self.subgraph.nodes if n.status != c.PROTOTYPE)

        for err in errors:
            # At most `non_prototypes` random draws are attempted per error
            ctr = non_prototypes

            while ctr > 0:
                # Generates a random index
                # NOTE(review): assumes the helper samples from
                # [0, len(X_train)) -- confirm the upper bound is exclusive
                j = int(r.generate_uniform_random_number(0, len(X_train)))

                # If the node on that particular index is not a prototype
                if self.subgraph.nodes[j].status != c.PROTOTYPE:
                    # Swaps the feature rows through an explicit copy: a
                    # tuple swap would read `X_train[j, :]` as a view that
                    # is already overwritten when it is written back, so
                    # the training row would be lost
                    swapped_out = X_train[j, :].copy()
                    X_train[j, :] = X_val[err, :]
                    X_val[err, :] = swapped_out

                    # Scalar indexing yields copies, so a tuple swap is
                    # safe for the labels
                    Y_train[j], Y_val[err] = Y_val[err], Y_train[j]

                    # Decrements the number of non-prototypes
                    non_prototypes -= 1

                    # Resets the counter, ending the draws for this error
                    ctr = 0

                # If the node on that particular index is a prototype
                else:
                    # Decrements the counter
                    ctr -= 1

        # Calculating difference between current accuracy and previous one
        delta = np.fabs(acc - previous_acc)

        # Replacing the previous accuracy as current accuracy
        previous_acc = acc

        # Incrementing the counter
        t += 1

        logger.info('Accuracy: %s | Delta: %s | Maximum Accuracy: %s',
                    acc, delta, max_acc)

        # If the difference is smaller than 10e-4 or iterations are finished
        if delta < 0.0001 or t >= n_iterations:
            # Restores the state of the best classifier into this instance;
            # rebinding the local name `self` would have no effect outside
            # of this method
            self.__dict__.update(best_opf.__dict__)

            logger.info('Best classifier has been learned over iteration %d.',
                        best_t + 1)

            break
def _learn(self, X_train, Y_train, X_val, Y_val):
    """Learns the best `k` value over the validation set.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.

    Raises:
        e.BuildError: If pre-computed distances are enabled and the matrix
            is not `n_nodes x n_nodes`.

    """

    logger.info('Learning best `k` value ...')

    # Creating a subgraph
    self.subgraph = KNNSubgraph(X_train, Y_train)

    # Checks if it is supposed to use pre-computed distances
    if self.pre_computed_distance:
        n_nodes = self.subgraph.n_nodes

        # Checks if its size is the same as the subgraph's amount of nodes
        if self.pre_distances.shape[0] != n_nodes or self.pre_distances.shape[1] != n_nodes:
            # If not, raises an error
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Defining initial maximum accuracy as 0
    max_acc = 0.0

    # Falls back to `k = 1` when no value achieves an accuracy above zero,
    # so the final assignment never reads an unbound variable
    best_k = 1

    # For every possible `k` value
    for k in range(1, self.max_k + 1):
        # Gathers current `k` as subgraph's best `k`
        self.subgraph.best_k = k

        # Calculate the arcs using the current `k` value
        self.subgraph.create_arcs(k, self.distance_fn,
                                  self.pre_computed_distance, self.pre_distances)

        # Calculate the p.d.f. using the current `k` value
        self.subgraph.calculate_pdf(k, self.distance_fn,
                                    self.pre_computed_distance, self.pre_distances)

        # Clusters the subgraph
        self._clustering()

        # Calculate the predictions over the validation set
        preds = self.predict(X_val)

        # Calculating the accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # If accuracy is better than maximum accuracy
        if acc > max_acc:
            # Replaces the maximum accuracy value
            max_acc = acc

            # Defines current `k` as the best `k` value
            best_k = k

        # Lazy %-style arguments, so the message is only built when emitted
        logger.info('Accuracy over k = %d: %s', k, acc)

        # Destroy the arcs
        self.subgraph.destroy_arcs()

    # Applying the best k to the subgraph's property
    self.subgraph.best_k = best_k
import numpy as np

import opfython.math.general as g

# Sample inputs: a numeric array plus ground-truth labels and predictions
array = np.asarray([1.5, 2, 0.5, 1.25, 1.75, 3])
labels = [0, 0, 0, 1, 1, 1, 2]
preds = [0, 0, 1, 1, 0, 1, 2]

# Array normalization
norm_array = g.normalize(array)
print(norm_array)

# Confusion matrix between labels and predictions
c_matrix = g.confusion_matrix(labels, preds)
print(c_matrix)

# OPF-like accuracy over all samples
opf_acc = g.opf_accuracy(labels, preds)
print(opf_acc)

# OPF-like accuracy broken down per label
opf_acc_per_label = g.opf_accuracy_per_label(labels, preds)
print(opf_acc_per_label)

# Purity measure
purity = g.purity(labels, preds)
print(purity)
def _learn(self, X_train, Y_train, I_train, X_val, Y_val, I_val):
    """Searches for the `k` value with the highest validation accuracy.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        I_train (np.array): Array of training indexes.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.
        I_val (np.array): Array of validation indexes.

    """

    logger.info('Learning best `k` value ...')

    # The subgraph is rebuilt from scratch out of the training data
    self.subgraph = ANNSubgraph(X_train, Y_train, I_train)

    # Best candidate so far; `k = 1` is the fallback when nothing scores above zero
    chosen_k, top_acc = 1, 0.

    for k in range(1, self.max_k + 1):
        # The candidate is temporarily stored as the subgraph's best `k`
        self.subgraph.best_k = k

        # For the `hnsw` method, the `ef` parameter follows the current `k`
        if self.ann_params.get('name') == 'hnsw':
            self.ann_params['ef'] = k

        # A new approximate-nearest-neighbors searcher is created and indexed
        self.ann_search = self.ann_class(self.ann_params)
        self.ann_search.fit(X_train)

        # Arcs and p.d.f. are computed for the current `k`
        self.subgraph.build_arcs(k, self.ann_search)
        self.subgraph.calc_pdf(k, self.distance_fn)

        # Clusters the subgraph and scores it over the validation set
        self._clustering()
        acc = g.opf_accuracy(Y_val, self.predict(X_val, I_val))

        # Only a strict improvement replaces the best candidate
        if acc > top_acc:
            top_acc, chosen_k = acc, k

        logger.info('Accuracy over k = %d: %s', k, acc)

        # Arcs are discarded before trying the next candidate
        self.subgraph.destroy_arcs()

    # The winning value is stored back into the subgraph
    self.subgraph.best_k = chosen_k
def run(self, technique, easy_X, easy_Y, hard_X, hard_Y, test_X, test_Y,
        k_hardSamples, k_easySamples, iteration):
    """Runs the active-learning pipeline over two selection scenarios.

    Two scenarios are evaluated: the hard pool is always sampled with
    `technique`, while the easy pool is sampled with "Random" in the first
    scenario and with `technique` in the second. On every iteration, samples
    selected from the hard pool join the labeled set and samples selected
    from the easy pool join the unlabeled set. A SemiSupervisedOPF
    (labeled + unlabeled features) and a SupervisedOPF (both sets with their
    labels) are then trained and scored over the test set.

    Args:
        technique (str): Name of the selection method, forwarded to
            `selectSamples` and `createLearner`.
        easy_X (np.array): Features of the easy pool.
        easy_Y (np.array): Labels of the easy pool.
        hard_X (np.array): Features of the hard pool.
        hard_Y (np.array): Labels of the hard pool.
        test_X (np.array): Features of the test set.
        test_Y (np.array): Labels of the test set.
        k_hardSamples (int): Samples taken from the hard pool per iteration
            (doubled on the first iteration).
        k_easySamples (int): Samples taken from the easy pool per iteration
            (doubled on the first iteration).
        iteration (int): Number of iterations per scenario.

    Returns:
        A tuple of nine lists, each holding one sub-list per scenario with
        one entry per iteration: semi-supervised accuracy, semi-supervised
        corrected labels, full-model accuracy, full-model corrected labels,
        hard selection time, easy selection time, classes known by the
        semi-supervised model, classes known by the full model, and wrong
        label percentage.

    """

    # ---------- PIPELINE ----------
    print("")
    print("Iniciando Pipeline..")
    print("")

    # Final variables (one sub-list per scenario)
    ssmodel_accuracy = []
    ssmodel_corrected = []
    fullmodel_accuracy = []
    fullmodel_corrected = []
    hard_time_to_select = []
    easy_time_to_select = []
    ssmodel_knowClass = []
    fullmodel_knowClass = []
    wrong_percentage = []

    # References to the complete pools, restored at the start of each scenario
    easy_X_bkp = easy_X
    easy_Y_bkp = easy_Y
    hard_X_bkp = hard_X
    hard_Y_bkp = hard_Y

    # Test labels are loop-invariant, so they are converted only once
    test_labels = test_Y.flatten().astype("int")

    # Iterates over the scenarios
    for method_hard, method_easy in zip([technique] * 2, ["Random", technique]):
        # Per-iteration results of the current scenario
        ss_model_score = []
        ss_model_corrected = []
        full_model_score = []
        full_model_corrected = []
        hardTimeToSelect = []
        easyTimeToSelect = []
        ss_know_class = []
        full_know_class = []
        wrongPercentages = []

        # Recover complete dataset
        easy_X = easy_X_bkp
        easy_Y = easy_Y_bkp
        hard_X = hard_X_bkp
        hard_Y = hard_Y_bkp

        print("Métodos de Aprendizado Ativo: Dataset Hard {} / Dataset Easy {}"
              .format(method_hard, method_easy))
        print("")

        # Controls the number of iterations
        for i in range(0, iteration):
            first_round = i == 0

            print("===== Iteração {} =====".format(i + 1))
            print("")

            # Model that guides the hard selection: nothing on the first
            # round, otherwise the learner (or the previous semi-supervised
            # model when the method is "Random")
            if first_round:
                hard_model = "none"
            elif method_hard != "Random":
                hard_model = learner_hard
            else:
                hard_model = ssmodel

            # Selecting samples with Active Learning from Hard Dataset
            (timeToSelect_hard, selected_hard_X, selected_hard_Y, hard_X,
             hard_Y, hard_correctedLabels) = self.selectSamples(
                 method_hard,
                 hard_X,
                 hard_Y,
                 k_hardSamples * 2 if first_round else k_hardSamples,
                 first_round,
                 hard_model,
             )

            # Append True Labeled Data to the Labeled Data
            if first_round:
                labeled_X = selected_hard_X
                labeled_Y = selected_hard_Y
            else:
                labeled_X = np.vstack((labeled_X, selected_hard_X))
                labeled_Y = np.vstack((labeled_Y, selected_hard_Y))

            print("Samples Labeled: {} - Time to Select: {} - Corrected: {}".
                  format(len(labeled_Y), timeToSelect_hard,
                         hard_correctedLabels))

            # Learner objects; previous learners are passed back in after
            # the first round
            learner_hard = self.createLearner(
                method_hard, labeled_X, labeled_Y,
                "none" if first_round else learner_hard, first_round)
            learner_easy = self.createLearner(
                method_easy, labeled_X, labeled_Y,
                "none" if first_round else learner_easy, first_round)

            # Model that guides the easy selection: the learner, unless the
            # method is "Random" (then nothing on the first round and the
            # previous semi-supervised model afterwards)
            if method_easy != "Random":
                easy_model = learner_easy
            elif first_round:
                easy_model = "none"
            else:
                easy_model = ssmodel

            # Selecting samples with Active Learning from Easy Dataset
            (timeToSelect_easy, selected_easy_X, selected_easy_Y, easy_X,
             easy_Y, easy_correctedLabels) = self.selectSamples(
                 method_easy,
                 easy_X,
                 easy_Y,
                 k_easySamples * 2 if first_round else k_easySamples,
                 first_round and method_easy == "Random",
                 easy_model,
             )

            # Append True Labeled Data to the Unlabeled Data
            if first_round:
                unlabeled_X = selected_easy_X
                unlabeled_Y = selected_easy_Y
            else:
                unlabeled_X = np.vstack((unlabeled_X, selected_easy_X))
                unlabeled_Y = np.vstack((unlabeled_Y, selected_easy_Y))

            print("Samples Unlabeled: {} - Time to Select: {} - Corrected: {}"
                  .format(len(unlabeled_Y), timeToSelect_easy,
                          easy_correctedLabels))

            # Semi supervised classification with OPF Semi Supervised
            t = time.time()
            ssmodel = SemiSupervisedOPF(distance='log_squared_euclidean',
                                        pre_computed_distance=None)
            ssmodel.fit(labeled_X,
                        labeled_Y.flatten().astype("int"), unlabeled_X)

            # The score is computed once and reused for printing and storing
            ss_score = round(
                g.opf_accuracy(test_labels, ssmodel.predict(test_X)) * 100, 2)
            print("Semi Supervised Score: {}% - Time: {}".format(
                ss_score, round((time.time() - t), 3)))
            ss_model_score.append(ss_score)

            # Join labeled data with unlabeled
            Z_dataset_X = np.vstack((labeled_X, unlabeled_X))
            Z_dataset_Y = np.hstack(
                (labeled_Y.flatten(), unlabeled_Y.flatten()))

            # Full supervised classification; the timer is restarted so the
            # reported time does not include the semi-supervised stage
            t = time.time()
            fullmodel = SupervisedOPF(distance='log_squared_euclidean',
                                      pre_computed_distance=None)
            fullmodel.fit(Z_dataset_X, Z_dataset_Y.flatten().astype("int"))

            full_score = round(
                g.opf_accuracy(test_labels, fullmodel.predict(test_X)) * 100,
                2)
            print("Full Supervised Score: {}% - Time: {}".format(
                full_score, round((time.time() - t), 3)))
            full_model_score.append(full_score)

            # Predict Semi-Supervised Labels to see how many errors are
            # propagating
            ss_predict = ssmodel.predict(unlabeled_X)
            wrongPercentage = self.calcWrongPercentage(
                ss_predict, unlabeled_Y)

            # List of corrected Labels by methods
            ss_model_corrected.append(hard_correctedLabels)
            full_model_corrected.append(hard_correctedLabels +
                                        easy_correctedLabels)

            # List of times to select
            hardTimeToSelect.append(timeToSelect_hard)
            easyTimeToSelect.append(timeToSelect_easy)

            # List of known classes
            ss_know_class.append(len(np.unique(labeled_Y)))
            full_know_class.append(len(np.unique(Z_dataset_Y)))

            # List of wrong percentages
            wrongPercentages.append(wrongPercentage)

            print("")

        # Append Results
        ssmodel_accuracy.append(ss_model_score)
        ssmodel_corrected.append(ss_model_corrected)
        fullmodel_accuracy.append(full_model_score)
        fullmodel_corrected.append(full_model_corrected)
        hard_time_to_select.append(hardTimeToSelect)
        easy_time_to_select.append(easyTimeToSelect)
        ssmodel_knowClass.append(ss_know_class)
        fullmodel_knowClass.append(full_know_class)
        wrong_percentage.append(wrongPercentages)

        print("===" * 25)
        print("")

    return ssmodel_accuracy, ssmodel_corrected, fullmodel_accuracy, fullmodel_corrected, \
        hard_time_to_select, easy_time_to_select, ssmodel_knowClass, fullmodel_knowClass, wrong_percentage