Example #1
0
def gridsearch_for_linear(X, y):
    """
    Tune the parameter C of a LinearSVC with two consecutive GridSearchCV runs.

    The first run scans a coarse, logarithmically spaced grid. The second run only looks at three
    candidates placed around the winner of the first run (half of it, the value itself, twice the value).
    Both runs use as many jobs as CPUs are reported by multiprocessing.

    :param X: Data x
    :param y: Labels y
    :return: The value of C chosen by the second (fine) search.
    """
    workers = multiprocessing.cpu_count()

    # Stage 1: coarse grid, C = 10^1, 10^2, ..., 10^10.
    IOHelper.write("Linear SVC: Starting coarse gridsearch.")
    coarse_grid = {"C": np.logspace(1, 10, 10, base=10.0)}
    coarse_search = GridSearchCV(LinearSVC(), param_grid=coarse_grid, n_jobs=workers)
    coarse_search.fit(X, y)
    best_c = coarse_search.best_params_['C']
    IOHelper.write("Linear SVC: Finished coarse gridsearch with params: C: " + str(best_c))

    # Stage 2: fine grid around the coarse optimum.
    IOHelper.write("Linear SVC: Starting fine gridsearch:")
    fine_grid = {"C": [best_c - 0.5 * best_c, best_c, 2 * best_c]}
    fine_search = GridSearchCV(LinearSVC(), param_grid=fine_grid, n_jobs=workers)
    fine_search.fit(X, y)
    best_c = fine_search.best_params_['C']
    IOHelper.write("Linear SVC: Finished fine gridsearch with params: C: " + str(best_c))

    return best_c
Example #2
0
def gridsearch_for_gauss(X, y):
    """
    Tune C and gamma of an RBF-kernel SVC with two consecutive GridSearchCV runs.

    The first run scans a coarse, logarithmically spaced grid for both parameters. The second run
    looks at three candidates per parameter placed around the winner of the first run (half of it,
    the value itself, twice the value). Both runs use as many jobs as CPUs are reported by
    multiprocessing.

    :param X: Data x
    :param y: Labels y
    :return: Tuple (C, gamma) chosen by the second (fine) search.
    """

    def around(value):
        # Fine-grid candidates: half, same and double the coarse optimum.
        return [value - 0.5 * value, value, 2 * value]

    workers = multiprocessing.cpu_count()
    print("Using multiprocessing. Avaiable cores: " + str(workers))

    # Stage 1: coarse grid, gamma = 10^-3 ... 10^2 and C = 10^1 ... 10^10.
    IOHelper.write("Gauss SVC: Starting gridsearch for gaussian classifier.")
    coarse_grid = {
        "gamma": np.logspace(-3, 2, 6, base=10.0),
        "C": np.logspace(1, 10, 10, base=10.0),
    }
    coarse_search = GridSearchCV(SVC(kernel="rbf"), param_grid=coarse_grid, n_jobs=workers)
    coarse_search.fit(X, y)
    best_c = coarse_search.best_params_['C']
    best_gamma = coarse_search.best_params_['gamma']

    print("First search complete. Starting second search...")
    IOHelper.write(
        "Gauss SVC: Finished coarse gridsearch with params: C: " + str(best_c) + " gamma: " + str(best_gamma))

    # Stage 2: fine grid around the coarse optimum.
    IOHelper.write("Gauss SVC: Starting fine for gaussian classifier.")
    fine_grid = {"gamma": around(best_gamma), "C": around(best_c)}
    fine_search = GridSearchCV(SVC(kernel="rbf"), param_grid=fine_grid, n_jobs=workers)
    fine_search.fit(X, y)
    best_c = fine_search.best_params_['C']
    best_gamma = fine_search.best_params_['gamma']

    IOHelper.write(
        "Gauss SVC: Finished fine gridsearch with params: C: " + str(best_c) + " gamma: " + str(best_gamma))

    return best_c, best_gamma
Example #3
0
    def predict(self, X):
        """
        Predict the labels for the rows of X, splitting the rows between the linear and the gaussian SVC.

        Which classifier predicts which row depends on the state of this object:

        (0) ``_use_distance`` is falsy and ``0 < _k < 1``: ``ceil(_k * n)`` randomly drawn rows are
            predicted by the gaussian SVC, all remaining rows by the linear SVC.
        (1) ``_gauss_distance == 0``: every row is predicted by the linear SVC.
        (2) ``_gauss_distance > 0``: rows whose distance to the hyperplane of the linear SVC is smaller
            than ``_gauss_distance`` are predicted by the gaussian SVC, all other rows by the linear SVC.
        (3) ``_gauss_distance == -1``: every row is predicted by the gaussian SVC.

        The cases are checked in this order. The elapsed time is stored in ``self._time_predict``.

        :param X: Data vector.
        :return: Vector of predictions. In the split cases (0) and (2) this is a float array with one entry per row of X.
        :raises Exception: If none of the cases above applies.
        """

        time_start = time.time()
        if self._verbose:
            IOHelper.write("Starting predicting.")

        n_samples = X.shape[0]

        def predict_split(lin_indices, gauss_indices):
            # Let each classifier predict its own rows. An empty subset is skipped,
            # so a classifier is never asked to predict on zero rows.
            result = np.zeros(n_samples)
            if len(lin_indices) > 0:
                result[lin_indices] = self._lin_svc.predict(X[lin_indices])
            if len(gauss_indices) > 0:
                result[gauss_indices] = self._gauss_svc.predict(X[gauss_indices])
            return result

        if not self._use_distance and 0 < self._k < 1.0:  # (0)
            n_gauss = int(np.ceil(self._k * n_samples))
            # Draw from range(): random.sample() rejects numpy arrays on Python 3
            # because they are not registered as sequences.
            gauss_indices = np.array(random.sample(range(n_samples), n_gauss), dtype=int)
            lin_indices = np.setdiff1d(np.arange(n_samples), gauss_indices)
            predictions = predict_split(lin_indices, gauss_indices)
        elif self._gauss_distance == 0.0:  # (1)
            predictions = self._lin_svc.predict(X)
        elif 0.0 < self._gauss_distance:  # (2)
            # Absolute decision value divided by the norm of the weight vector.
            fx = abs(self._lin_svc.decision_function(X)) / np.linalg.norm(self._lin_svc.coef_[0])
            gauss_indices = np.where(fx < self._gauss_distance)[0]
            lin_indices = np.where(fx >= self._gauss_distance)[0]
            predictions = predict_split(lin_indices, gauss_indices)
        elif self._gauss_distance == -1:  # (3)
            predictions = self._gauss_svc.predict(X)
        else:
            # If no condition matched
            raise Exception("Fatal error: Count param")

        self._time_predict = time.time() - time_start
        if self._verbose:
            IOHelper.write("Finished predicting.")
        return predictions
Example #4
0
    def fit(self, X, y):
        """
        Fit both classifiers on the given training data.

        The linear SVC is fitted on all of X first. Then get_points_close_to_hyperplane_by_count()
        is called with self._k to pick the subset of points for the gaussian SVC, which is fitted on
        that subset unless it is empty. Along the way the durations of the three phases, the sizes of
        both subsets and the returned distance are stored on this object.

        :param X: Training vector
        :param y: Target vector relative to X
        :return: Returns self.
        """

        if self._verbose:
            IOHelper.write("Starting fitting process.\n")
            IOHelper.write("Starting fitting process for linear SVC.")

        # Phase 1: linear SVC on the complete data.
        lin_started = time.time()
        self._lin_svc.fit(X, y)
        self._time_fit_lin = time.time() - lin_started

        if self._verbose:
            IOHelper.write("Completed fitting process for linear SVC.")
            IOHelper.write("Sorting points for classifiers.")

        # Phase 2: select the points handed to the gaussian SVC.
        overhead_started = time.time()
        gauss_X, gauss_y, distance = self.get_points_close_to_hyperplane_by_count(X, y, self._k)
        try:
            n_selected = gauss_X.shape[0]
        except AttributeError:
            # The selection has no shape attribute (e.g. a plain list).
            n_selected = len(gauss_X)
        self._n_gauss = n_selected
        self._gauss_distance = distance
        self._time_overhead = time.time() - overhead_started

        if self._verbose:
            IOHelper.write("Sorting finished.")
            IOHelper.write("Starting fitting process for gaussian SVC.")

        # Everything that was not selected counts for the linear classifier.
        self._n_lin = X.shape[0] - self._n_gauss

        # Phase 3: gaussian SVC on the selected points, skipped for an empty selection.
        gauss_started = time.time()
        if self._n_gauss != 0:
            self._gauss_svc.fit(gauss_X, gauss_y)
        self._time_fit_gauss = time.time() - gauss_started

        if self._verbose:
            IOHelper.write("Completed fitting process for gaussian SVC.")
            IOHelper.write("Finished fitting process.\n")

        return self
Example #5
0
def _open_params_file(file_string):
    """Open the parameter file for writing, trying the given path, then its "../" variant, then error-params.txt."""
    for candidate in (file_string, "../" + file_string):
        try:
            return open(candidate, 'w')
        except Exception:
            # Deliberately best effort: any failure just moves on to the next location.
            continue
    return open("error-params.txt", 'w')


def _tune_dual_svm_params(x, y):
    """Run the grid searches for every k of the schedule and return (c_lin, c_gauss, gamma)."""
    # Small steps first (0, 0.05, 0.10, 0.15), then bigger steps (0.2, 0.4, 0.6, 0.8, 1.0).
    k_values = [0.05 * j for j in range(4)] + [0.2 * (i + 1) for i in range(5)]

    # The gaussian SVC is only tuned up to k = 0.6; the slots for larger k keep their initial 0.
    # The tolerance is required because 0.2 * 3 evaluates to 0.6000000000000001 in floating point,
    # which a plain "k <= 0.6" would wrongly exclude.
    max_tuned_k = 0.6
    tolerance = 1e-9

    c_lin = 0
    c_gauss = [0] * len(k_values)
    gamma = [0] * len(k_values)

    for n, k in enumerate(k_values):
        IOHelper.write("Batch run " + str(n) + ", k = " + str(k))

        if n == 0:
            # In the first run (k = 0), calculate the best parameter for the linear svm.
            c_lin = gridsearch_for_linear(x, y)
        elif k <= max_tuned_k + tolerance:
            # In the following runs tune the gaussian svm, as the subset of points handed to it changes with k.
            clf = DualSvm(use_distance=True)
            clf.k = k
            clf.c_lin = c_lin
            # Fit linear classifier beforehand. This is necessary for the get_points method to work correctly.
            clf.fit_lin_svc(x, y)
            x_gauss, y_gauss, _ = clf.get_points_close_to_hyperplane_by_count(x, y, k)
            c_gauss[n], gamma[n] = gridsearch_for_gauss(x_gauss, y_gauss)

    return c_lin, c_gauss, gamma


def gridsearch_and_save(data):
    '''
    Method that searches the best parameters for the given data-set for the DualSvm for different k and saves it to a text file.

    The output file has three lines: the C of the linear SVC, then the comma-terminated C values of the
    gaussian SVC (one per k), then the comma-terminated gamma values (one per k). Entries for values of k
    that are not tuned are written as 0.

    The file is opened before the search starts and is closed when the function leaves, also on errors.

    :param data: String, name of the data. Search is done automatically in the data directory.
    :return: None. Output will be written to a textfile.
    '''
    IOHelper.write("Starting parameter tuning for " + data)
    x, _x_test, y, _y_test = DataLoader.load_data(data)
    file_string = "output/" + data + "-params.csv"

    with _open_params_file(file_string) as output:
        c_lin, c_gauss, gamma = _tune_dual_svm_params(x, y)

        output.write(str(c_lin) + "\n")
        output.write("".join(str(value) + "," for value in c_gauss) + "\n")
        output.write("".join(str(value) + "," for value in gamma) + "\n")