def gridsearch_for_linear(X, y):
    """
    Tune the penalty parameter C of the linear classifier in two stages.

    The first stage searches a coarse logarithmic grid (10^1 ... 10^10).
    The second stage searches a small grid around the winner of the first
    stage (half of it, the value itself and twice the value).
    Both searches use GridSearchCV with its default settings and one job
    per available CPU core.

    :param X: Data x
    :param y: Labels y
    :return: Best value of C found by the fine search
    """
    workers = multiprocessing.cpu_count()

    def best_c(candidates):
        # One grid search over the given C candidates on the full data.
        search = GridSearchCV(LinearSVC(), param_grid={"C": candidates}, n_jobs=workers)
        search.fit(X, y)
        return search.best_params_['C']

    IOHelper.write("Linear SVC: Starting coarse gridsearch.")
    coarse_c = best_c(np.logspace(1, 10, 10, base=10.0))
    IOHelper.write("Linear SVC: Finished coarse gridsearch with params: C: " + str(coarse_c))

    IOHelper.write("Linear SVC: Starting fine gridsearch:")
    fine_c = best_c([coarse_c - 0.5 * coarse_c, coarse_c, 2 * coarse_c])
    IOHelper.write("Linear SVC: Finished fine gridsearch with params: C: " + str(fine_c))

    return fine_c
def gridsearch_for_gauss(X, y):
    """
    Tune C and gamma of the gauss (RBF) classifier in two stages.

    The first stage searches a coarse logarithmic grid (C: 10^1 ... 10^10,
    gamma: 10^-3 ... 10^2). The second stage searches a small grid around
    the winner of the first stage (half, same and double of each value).
    Both searches use GridSearchCV with its default settings and one job
    per available CPU core.

    :param X: Data x
    :param y: Labels y
    :return: Tuple (C, gamma) with the best parameters of the fine search
    """
    workers = multiprocessing.cpu_count()
    print("Using multiprocessing. Avaiable cores: " + str(workers))

    def best_pair(c_candidates, gamma_candidates):
        # One grid search over the cartesian product of both candidate sets.
        search = GridSearchCV(
            SVC(kernel="rbf"),
            param_grid={"gamma": gamma_candidates, "C": c_candidates},
            n_jobs=workers,
        )
        search.fit(X, y)
        winner = search.best_params_
        return winner['C'], winner['gamma']

    IOHelper.write("Gauss SVC: Starting gridsearch for gaussian classifier.")
    coarse_c, coarse_gamma = best_pair(
        np.logspace(1, 10, 10, base=10.0),
        np.logspace(-3, 2, 6, base=10.0),
    )

    print("First search complete. Starting second search...")
    IOHelper.write("Gauss SVC: Finished coarse gridsearch with params: C: " + str(coarse_c)
                   + " gamma: " + str(coarse_gamma))
    IOHelper.write("Gauss SVC: Starting fine for gaussian classifier.")

    fine_c, fine_gamma = best_pair(
        [coarse_c - 0.5 * coarse_c, coarse_c, 2 * coarse_c],
        [coarse_gamma - 0.5 * coarse_gamma, coarse_gamma, 2 * coarse_gamma],
    )
    IOHelper.write("Gauss SVC: Finished fine gridsearch with params: C: " + str(fine_c)
                   + " gamma: " + str(fine_gamma))

    return fine_c, fine_gamma
def predict(self, X):
    """
    Predicts the labels for the given data vector X.

    The samples are distributed between the linear and the gauss classifier
    and every classifier predicts its own share. The cases are checked in
    this order:

    (0) ``_use_distance`` is false and ``0 < _k < 1``: a random share of
        ``ceil(_k * n_samples)`` samples is predicted by the gauss SVM,
        the remaining samples by the linear SVM.
    (1) ``_gauss_distance == 0``: all points are predicted by the linear SVM.
    (2) ``_gauss_distance > 0``: points whose distance to the hyperplane of
        the linear SVM is smaller than ``_gauss_distance`` are predicted by
        the gauss SVM, all other points by the linear SVM. Standard case.
    (3) ``_gauss_distance == -1``: all points are predicted by the gauss SVM.

    The elapsed time is stored in ``self._time_predict``.

    :param X: Data vector. Must support ``X.shape`` and array indexing
        whenever the samples are split between both classifiers.
    :return: Vector of predictions. In the split cases (0) and (2) this is
        a float array with one entry per sample.
    :raises Exception: If none of the cases above matches.
    """
    time_start = time.time()
    if self._verbose:
        IOHelper.write("Starting predicting.")

    def finish(predictions):
        # Common epilogue of every successful branch: record time, log, return.
        self._time_predict = time.time() - time_start
        if self._verbose:
            IOHelper.write("Finished predicting.")
        return predictions

    def predict_split(lin_indices, gauss_indices, n_samples):
        # Let each classifier predict its share and merge the results.
        # An empty share is skipped instead of being handed to predict();
        # scikit-learn estimators are expected to reject zero-sample input.
        predictions = np.zeros(n_samples)
        if len(lin_indices) > 0:
            predictions[lin_indices] = self._lin_svc.predict(X[lin_indices])
        if len(gauss_indices) > 0:
            predictions[gauss_indices] = self._gauss_svc.predict(X[gauss_indices])
        return predictions

    if not self._use_distance and 0 < self._k < 1.0:  # (0)
        n_samples = X.shape[0]
        n_gauss = int(np.ceil(self._k * n_samples))
        # Draw from range(): random.sample() requires a real sequence and
        # raises TypeError for a numpy array on Python 3.
        gauss_indices = np.asarray(random.sample(range(n_samples), n_gauss), dtype=int)
        lin_indices = np.setdiff1d(np.arange(n_samples), gauss_indices)
        return finish(predict_split(lin_indices, gauss_indices, n_samples))

    if self._gauss_distance == 0.0:  # (1)
        return finish(self._lin_svc.predict(X))

    if 0.0 < self._gauss_distance:  # (2)
        # Distance of every sample to the hyperplane of the linear SVM.
        fx = abs(self._lin_svc.decision_function(X)) / np.linalg.norm(self._lin_svc.coef_[0])
        near = fx < self._gauss_distance
        gauss_indices = np.where(near)[0]
        lin_indices = np.where(~near)[0]
        return finish(predict_split(lin_indices, gauss_indices, len(fx)))

    if self._gauss_distance == -1:  # (3)
        return finish(self._gauss_svc.predict(X))

    # If no condition matched
    raise Exception("Fatal error: Count param")
def fit(self, X, y):
    """
    Fit the model according to the given training data.

    First the linear SVC is fitted on all data. Then
    get_points_close_to_hyperplane_by_count() selects a subset of the data
    (controlled by self._k) and the gaussian SVC is fitted on that subset,
    unless the subset is empty. Point counts, the returned gauss distance
    and the durations of the individual steps are stored on the object.

    :param X: Training vector
    :param y: Target vector relative to X
    :return: Returns self.
    """
    def log(*messages):
        # Progress messages are only written in verbose mode.
        if self._verbose:
            for message in messages:
                IOHelper.write(message)

    log("Starting fitting process.\n",
        "Starting fitting process for linear SVC.")

    # Step 1: linear classifier on the complete training set.
    lin_started = time.time()
    self._lin_svc.fit(X, y)
    self._time_fit_lin = time.time() - lin_started

    log("Completed fitting process for linear SVC.",
        "Sorting points for classifiers.")

    # Step 2: select the points handed to the gaussian classifier.
    overhead_started = time.time()
    gauss_X, gauss_y, gauss_distance = self.get_points_close_to_hyperplane_by_count(X, y, self._k)
    try:
        # Number of points for the gauss classifier.
        self._n_gauss = gauss_X.shape[0]
    except AttributeError:
        # The selection has no shape attribute (e.g. a plain list).
        self._n_gauss = len(gauss_X)
    self._gauss_distance = gauss_distance
    self._time_overhead = time.time() - overhead_started

    log("Sorting finished.",
        "Starting fitting process for gaussian SVC.")

    # Number of points for the linear classifier.
    self._n_lin = X.shape[0] - self._n_gauss

    # Step 3: gaussian classifier on the selected subset, if there is one.
    gauss_started = time.time()
    if self._n_gauss != 0:
        self._gauss_svc.fit(gauss_X, gauss_y)
    self._time_fit_gauss = time.time() - gauss_started

    log("Completed fitting process for gaussian SVC.",
        "Finished fitting process.\n")

    return self
def _open_params_output(file_string):
    """
    Open the parameter file for writing.

    Tries file_string, then the same path one directory up, and finally
    falls back to "error-params.txt" in the current directory.
    """
    for candidate in (file_string, "../" + file_string):
        try:
            return open(candidate, 'w')
        except (IOError, OSError):
            continue
    return open("error-params.txt", 'w')


def gridsearch_and_save(data):
    """
    Method that searches the best parameters for the given data-set for the
    DualSvm for different k and saves it to a text file.

    C of the linear SVM is tuned once on the whole data-set. C and gamma of
    the gaussian SVM are tuned separately for every k on the points returned
    by get_points_close_to_hyperplane_by_count(). The k values are
    0.05, 0.1, 0.15 (slots 1-3) and 0.2, 0.4, 0.6 (slots 4-6). Slot 0 (k = 0)
    and slots 7-8 (k = 0.8, 1.0) are not tuned and stay 0.

    The output file contains three lines: C of the linear SVM, the nine C
    values of the gaussian SVM and the nine gamma values. Every value in the
    last two lines is followed by a comma.

    :param data: String, name of the data. Search is done automatically in the data directory.
    :return: None. Output will be written to a textfile.
    """
    IOHelper.write("Starting parameter tuning for " + data)
    x, _x_test, y, _y_test = DataLoader.load_data(data)
    file_string = "output/" + data + "-params.csv"

    c_lin = 0
    c_gauss = [0] * 9
    gamma = [0] * 9

    # The file is opened before the (long) searches start so that an
    # unwritable target is detected early; `with` guarantees it is closed.
    with _open_params_output(file_string) as output:
        for j in range(4):  # Smaller steps from 0 to 20: 0, 5, 10, 15
            k = 0.05 * j
            IOHelper.write("Batch run " + str(j) + ", k = " + str(k))

            # Load the classifier
            clf = DualSvm(use_distance=True)
            clf.k = k

            # Parameter Tuning
            if j == 0:
                # In the first run, calculate best parameters for linear svm
                c_lin = gridsearch_for_linear(x, y)
            else:
                # In the following runs, tune the gaussian svm, as the subset
                # of points for that classifier changes with k.
                clf.c_lin = c_lin
                # Fit linear classifier beforehand. This is necessary for the
                # get_points method to work correctly.
                clf.fit_lin_svc(x, y)
                x_gauss, y_gauss, _ = clf.get_points_close_to_hyperplane_by_count(x, y, k)
                c_gauss[j], gamma[j] = gridsearch_for_gauss(x_gauss, y_gauss)

        for i in range(5):  # Bigger steps from 20 to 100: 20, 40, 60, 80, 100
            n = 4 + i
            k = 0.2 * (i + 1)
            IOHelper.write("Batch run " + str(n) + ", k = " + str(k))

            # Load the classifier
            clf = DualSvm(use_distance=True)
            clf.k = k

            # Only k up to 0.6 is tuned. The limit is checked on the integer
            # step: 0.2 * 3 evaluates to 0.6000000000000001, so a float
            # comparison `k <= 0.6` would silently skip k = 0.6.
            # NOTE(review): assumes the whole tuning step belongs under this
            # condition -- confirm against the original indentation.
            if i + 1 <= 3:
                clf.c_lin = c_lin
                clf.fit_lin_svc(x, y)
                x_gauss, y_gauss, _ = clf.get_points_close_to_hyperplane_by_count(x, y, k)
                c_gauss[n], gamma[n] = gridsearch_for_gauss(x_gauss, y_gauss)

        output.write(str(c_lin) + "\n")
        output.write("".join(str(value) + "," for value in c_gauss))
        output.write("\n")
        output.write("".join(str(value) + "," for value in gamma))
        output.write("\n")