def classificationValidation(self, test_list, kmeans_path, kernel, C, gamma):
    '''
    Validate a trained SVM on held-out test images and print a confusion matrix.

    Trains an SVC on (self.X, self.y), then for each test image builds a
    bag-of-visual-words histogram via the pickled KMeans model and predicts
    its class. The true class of every image in test_list[i] is taken to be i.

    :param test_list: list of directory paths holding test images; each
                      path is one class (class index = position in the list).
    :param kmeans_path: path to the pickled KMeans model used to quantize
                        raw feature vectors into cluster histograms.
    :param kernel: SVM kernel name forwarded to sklearn's SVC.
    :param C: SVM regularization parameter.
    :param gamma: kernel coefficient; None means "use SVC's default".
    '''
    # gamma=None means "use sklearn's default", so it must be omitted from
    # the constructor rather than passed through.
    if gamma is None:
        clf = SVC(C=C, kernel=kernel)
    else:
        clf = SVC(C=C, gamma=gamma, kernel=kernel)
    print("kernel: " + kernel)
    print("gamma: " + str(gamma))
    print("C: " + str(C))
    clf.fit(self.X, self.y)
    results_vector = []
    y_true = []
    cl = 0
    # BUG FIX: the original called joblib.load(kmeans_name), but no such
    # name exists -- the parameter is kmeans_path (guaranteed NameError).
    k_means = joblib.load(kmeans_path)
    # Histogram length = number of KMeans clusters = width of self.X.
    [m, num_of_clusters] = np.shape(self.X)
    for path in test_list:
        for item in os.listdir(path):
            p = path + "/" + item
            im = cv.imread(p)
            fe = FeatureExtractor(im)
            # Bag-of-visual-words: count how many raw descriptors of this
            # image fall into each KMeans cluster.
            feature_vector = np.zeros(num_of_clusters)
            raw_vector = fe.computeFeatureVector()
            Km_vector = k_means.predict(raw_vector)
            for k in range(len(Km_vector)):
                feature_vector[Km_vector[k]] = feature_vector[Km_vector[k]] + 1
            res = clf.predict(feature_vector)
            # Debugging
            if res[0] == 1:
                print(p + " is not a foram!")
            if res[0] == 0:
                print(p + " is a foram!")
            y_true.append(cl)
            results_vector.append(res[0])
        # Each directory in test_list is its own ground-truth class.
        cl = cl + 1
    print("confusion_matrix")
    print(confusion_matrix(y_true, results_vector))
def createClassificationTrainingFromDataset(self, dataset_name, labels_list, path_list):
    '''
    Build a raw-feature training set from labeled image directories and
    save it as an .npz file under binData/.

    path_list and labels_list are parallel lists: the i-th label describes
    every image found under the i-th path. Each image contributes one row
    (its raw feature vector) to the training matrix; its class id is the
    index of the path it came from. See tests in __main__ for examples.

    :param dataset_name: name of the .npz dataset written under binData/.
    :param labels_list: a list of labels, one per path in path_list.
    :param path_list: a list of paths from which the images are collected.
    '''
    base_path = "binData/"
    labels = []
    trainingData = []
    classes = []
    cl = 0
    ### Building the feature matrix.
    for i, path in enumerate(path_list):
        labels.append(labels_list[i])
        print(labels_list[i])
        for item in os.listdir(path):
            p = path + "/" + item
            print(p)  # DEBUG
            im = cv.imread(p)
            fe = FeatureExtractor(im)
            feature_vector = fe.computeFeatureVector()
            if len(trainingData) == 0:
                trainingData = feature_vector
            else:
                # BUG FIX: np.vstack returns a new array; the original
                # discarded the result, so only the first image's features
                # ever made it into trainingData.
                trainingData = np.vstack((trainingData, feature_vector))
            classes.append(cl)
            print("vstack Kmeans Classifier: ")
            print(np.shape(trainingData))
        cl = cl + 1
    # BUG FIX: the original converted classes to an ndarray INSIDE the path
    # loop; ndarray has no append(), so any second path would crash.
    classes = np.array(classes)
    ### DEBUG
    print(np.shape(trainingData))
    print(np.shape(classes))
    ### SAVING THE DATASETS TO NPZ FORMAT
    np.savez(os.path.join(base_path, dataset_name), trainingData, labels, classes)
def createKmeansTrainingDataset(self, kmeans_data, dataset_name, kmeans_name, path_list, labels_list, num_of_clusters):
    '''
    Build a bag-of-visual-words training set: fit KMeans on a holdout
    feature matrix, then histogram each training image's descriptors over
    the learned clusters. Saves both the KMeans model and the new dataset
    under binData/.

    :param kmeans_data: path to an .npz file (produced by
        createClassificationTrainingFromDataset on the HOLDOUT set) whose
        'arr_0' entry is the raw feature matrix used to fit KMeans.
    :param dataset_name: name of the NEW .npz dataset written under binData/.
    :param kmeans_name: filename under binData/ for the pickled KMeans model.
    :param path_list: list of directories holding the training images.
    :param labels_list: label per path (same order as path_list).
    :param num_of_clusters: number of KMeans clusters (= histogram length).
    '''
    # Only 'arr_0' (the raw feature matrix) is needed; the original also
    # loaded 'arr_1'/'arr_2' into locals that were never used.
    npzfile = np.load(kmeans_data)
    KmeansData = npzfile['arr_0']
    k_means = cluster.KMeans(n_clusters=num_of_clusters)
    # BUG FIX: the original called k_means.fit(kmeans_data), i.e. fit on
    # the FILENAME string instead of the loaded feature matrix.
    k_means.fit(KmeansData)
    base_path = "binData/"
    trainingData = []
    classes = []
    cl = 0
    ### Building the feature matrix.
    for i, path in enumerate(path_list):
        print(labels_list[i])
        for item in os.listdir(path):
            p = path + "/" + item
            print(p)  # DEBUG
            im = cv.imread(p)
            fe = FeatureExtractor(im)
            # Bag-of-visual-words histogram: count how many of this image's
            # raw descriptors fall into each KMeans cluster.
            feature_vector = np.zeros(num_of_clusters)
            raw_vector = fe.computeFeatureVector()
            Km_vector = k_means.predict(raw_vector)
            for j in range(len(Km_vector)):
                feature_vector[Km_vector[j]] = feature_vector[Km_vector[j]] + 1
            trainingData.append(feature_vector)
            classes.append(cl)
        # Here we multiply the number of POSITIVE samples in the training
        # set (first path) so that the 'unbalanced' problem of
        # "Foram vs. Not-Foram" 'becomes balanced'.
        if i == 0:
            print("working on positive samples")
            print("Original training size: (should be 68 by 10)")
            print(np.shape(trainingData))
            print(np.shape(classes))
            # NOTE(review): 9 doublings multiply the positives by 2**9 = 512,
            # yet the message below claims "by 8" -- confirm the intended
            # oversampling factor (range(3) would give x8).
            for k in range(9):
                trainingData = np.vstack((trainingData, trainingData))
                classes = np.hstack((classes, classes))
            print("After Multipling Positive Samples by 8")
            print(np.shape(trainingData))
            print(np.shape(classes))
            # Back to lists so the next path's images can keep append()-ing.
            trainingData = trainingData.tolist()
            classes = classes.tolist()
        cl = cl + 1
    ### DEBUG
    print("final shape: (should be 54,000~ by 10):")
    print(np.shape(trainingData))
    ### SAVING THE DATASETS TO NPZ FORMAT
    joblib.dump(k_means, os.path.join(base_path, kmeans_name), compress=9)
    np.savez(os.path.join(base_path, dataset_name), trainingData, labels_list, classes)