Python SVM.SGD Examples

Programming Language: Python

Namespace/Package Name: svm

Class/Type: SVM

Method/Function: SGD

Examples at hotexamples.com: 2

Python SVM.SGD - 2 examples found. These are the top-rated real-world Python examples of svm.SVM.SGD extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

SVM(30)

SGD(2)

Train(2)

_K(2)

_X(2)

_alphas(2)

_compute_separator(2)

_objective(2)

_y(2)

Predict(1)

_kernel(1)

accuracy(1)

add_sample(1)

dump(1)

labeled_new_data(1)

predict_known(1)

Example #1

Show file

 def hpsvm(params):
     """Train an SVM with ``params`` and return its accuracy on the test split.

     Args:
         params: Keyword arguments forwarded unchanged to ``SVM(...)``.

     Returns:
         Whatever ``hf.calculate_accuracy`` returns for the true test labels
         versus the sign of ``X_test . W``.

     Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
     ``y_test`` (objects exposing ``.to_numpy()``) and the helper ``hf``.
     """
     svm = SVM(**params)
     W = svm.SGD(X_train.to_numpy(), y_train.to_numpy())
     # One matrix product scores every test row at once; this replaces the
     # per-row np.dot + np.append loop (quadratic because np.append copies)
     # and the training-set predictions that were computed but never used.
     # ravel() keeps the result flat, as np.append did.
     y_test_predicted = np.sign(np.dot(X_test.to_numpy(), W)).ravel()
     svmaccuracy = hf.calculate_accuracy(y_test.to_numpy(),
                                         y_test_predicted)
     return svmaccuracy

Example #2

Show file

def _tune(objective, space, prompt):
    """Ask for an evaluation budget, run a TPE search, return the best values.

    Args:
        objective: Callable taking a sampled parameter dict and returning a
            hyperopt result dict (``{'loss': ..., 'status': STATUS_OK}``).
        space: hyperopt search space (dict of ``hp.*`` expressions).
        prompt: Text shown to the user when asking for ``max_evals``.

    Returns:
        dict mapping every label in ``space`` to the best *value* found.

    Raises:
        ValueError: If the user's answer cannot be parsed as an integer.
    """
    # Imported here so this helper is self-contained.
    from hyperopt import space_eval

    trials = Trials()
    max_evals = int(input(prompt))
    best = fmin(objective,
                space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)
    # fmin reports hp.choice parameters as the *index* of the chosen option,
    # not the option itself; space_eval maps the result back to real values.
    return space_eval(space, best)


def _drop_float_entries(forest):
    """Remove every ``np.float64`` entry from ``forest`` in place; return it.

    Filtering through a slice assignment avoids deleting from the list while
    indexing into it, which skips the element after each deletion and can run
    past the end of the shortened list.
    """
    # NOTE(review): presumably float entries are placeholders for trees that
    # could not be built -- verify against randomforest.fit.
    forest[:] = [tree for tree in forest if not isinstance(tree, np.float64)]
    return forest


def _predict_signs(features, weights):
    """Return ``sign(features . weights)`` for every row as a flat array.

    Args:
        features: Object exposing ``.to_numpy()`` (one row per sample).
        weights: Weight vector returned by ``SVM.SGD``.
    """
    return np.sign(np.dot(features.to_numpy(), weights)).ravel()


def main():
    """Interactive pipeline: clean a CSV, tune and compare several models.

    The user is prompted for the input file, the problem type (``c`` for
    classification, anything else is treated as regression), the target
    column, the scaler, an optional PCA step, the test split size and one
    hyperopt evaluation budget per model. A decision tree and a random
    forest are always trained; the SVM and KNN models are trained for
    classification only. Each model's score is printed, followed by the best
    score overall.

    Side effects: rebinds the module-level names ``train``, ``test``,
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """
    global train, test, X_train, X_test, y_train, y_test

    def banner(title,
               rule="---------------------------------",
               blank="                                "):
        """Print a section title framed by rule and blank lines."""
        print(blank)
        print(rule)
        print(title)
        print(rule)
        print(blank)

    ## User input

    data_path = input(
        "Enter the path to your input file. For example [C:\\Users\\User\\Documents\\Datasets\\data.csv]:"
    )

    problemtype = input(
        "What is your ML problem type; Enter [c] for classification, [r] for regression:"
    )
    is_classification = problemtype == 'c'

    df = pd.read_csv(data_path)

    target = input("Enter the target variable column name for your dataset:")

    scaletype = input(
        "Enter the scaling type for the dataset, options include [MinMaxScaler], [QuantileTransformer], [StandardScaler]:"
    )

    # Scores of every trained model (accuracies or RMSEs).
    results = []

    def as_loss(score):
        """hyperopt minimises: negate accuracies, keep RMSE as is."""
        return -score if is_classification else score

    evals_prompt = "Enter your value for the maximum number of evaluations you want:"

    ## Preprocessing data

    banner("        Cleaning the data        ")

    pre = Preprocessing(df, target, scaletype)
    print("Encoding the target variable if it is categorical...")
    targetY = pre.TargetEncoding()
    print("Removing any empty columns from the dataset...")
    df = pre.RemoveEmptyColumns()
    print("Imputing missing values into the dataset...")
    df = pre.MissingValImp()
    print("Encoding categorical variables...")
    df = pre.SimpleCatEncoding()
    columns = df.columns
    print("Scaling the values...")
    df = pre.Scaling()
    df.columns = columns
    print("Selecting the most important features based on correlation")
    df = pre.Correlaton_selection()
    df = pd.concat([df, targetY], axis=1)
    print("Checking the data for any outliers and removing them...")
    df = pre.Outliers()
    # NOTE(review): the next four statements are kept exactly as they were.
    # The first drop's result is immediately replaced by the frame returned
    # from removecolumnspace(), so the sequence presumably relies on `pre`
    # returning the very object that received the 'label' column -- confirm
    # against the Preprocessing class before simplifying.
    df['label'] = df[target]
    df = df.drop([target], axis=1)
    df = pre.removecolumnspace()
    df = df.drop([target], axis=1)
    print("Data cleaning finished")

    ## Option to perform pca for large data

    banner("               PCA               ",
           blank="                                 ")

    yn = input(
        "Would you like to perform principal component analysis on the dataset? Enter [y] for yes, [n] for no:"
    )

    if yn == 'y':
        label = df['label']
        n_components = int(
            input(
                "How many components would you like to reduce the dataset to?:"
            ))
        # Fit on the features only: projecting a frame that still contains
        # the label would leak the target into the components.
        features = df.drop(['label'], axis=1)
        pca = PCA(n_components)
        pca.fit(features)
        df = pd.DataFrame(pca.transform(features))
        # Attach by position; the new frame has a fresh RangeIndex, so an
        # index-aligned assignment could misplace labels or produce NaN.
        df['label'] = label.to_numpy()

    ## Train Test split
    banner("Splitting the data into train and test sets")

    test_size = float(
        input("Input the test size for the split of test and train sets:"))
    hf = HelperFunctions(test_size)
    train, test = hf.train_test_split(df)
    X_train = train.drop(['label'], axis=1)
    X_test = test.drop(['label'], axis=1)
    y_train = train['label']
    y_test = test['label']

    banner("         Training models         ")

    banner("          Decision Tree          ")

    print("Optimizing parameters...")

    def hpdt(params):
        """Score one decision-tree configuration on the test split."""
        candidate = decisiontree(**params)
        fitted = candidate.decision_tree_algorithm(train)
        predicted = candidate.decision_tree_predictions(test, fitted)
        if is_classification:
            return hf.calculate_accuracy(predicted, y_test)
        return hf.rmse(predicted, y_test)

    spacedt = {
        'counter': hp.choice('counter', [0]),
        'min_samples': hp.choice('min_samples', range(1, 5)),
        'max_depth': hp.choice('max_depth', range(1, 20)),
        'random_subspace': hp.choice('random_subspace', [None])
    }

    def fdt(params):
        return {'loss': as_loss(hpdt(params)), 'status': STATUS_OK}

    # The classification prompt has its own wording; both texts are kept
    # verbatim.
    if is_classification:
        dt_prompt = "Enter your value for the maximum number of evaluations you want. ():"
    else:
        dt_prompt = evals_prompt
    best = _tune(fdt, spacedt, dt_prompt)

    counter = best['counter']
    max_depth = best['max_depth']
    min_samples = best['min_samples']
    random_subspace = best['random_subspace']

    print(
        'Optimal hyperparameters for decision tree are: counter = {}, max_depth = {}, min_samples = {}, random_subspace = {}.'
        .format(counter, max_depth, min_samples, random_subspace))

    dt = decisiontree(counter=counter,
                      min_samples=min_samples,
                      max_depth=max_depth,
                      random_subspace=random_subspace)
    tree = dt.decision_tree_algorithm(train)
    print("Predicting values...")
    dtpredictions = dt.decision_tree_predictions(test, tree)
    if is_classification:
        print("Calculating Accuracy...")
        dtaccuracy = hf.calculate_accuracy(dtpredictions, test['label'])
        print("Accuracy of decision tree predictions = {:.2f}".format(
            dtaccuracy * 100) + '%')
        results.append(dtaccuracy)
    else:
        print("Calculating RMSE...")
        dtrootmean = hf.rmse(dtpredictions, test['label'])
        print("RMSE of decision tree predictions = {}".format(dtrootmean))
        results.append(dtrootmean)

    banner("          Random Forest          ")

    print("Optimizing parameters...")

    # Mode flag handed to randomforest.predict; the same value is used for
    # tuning and for the final model so both are scored the same way.
    rf_mode = 'c' if is_classification else 'r'

    def hprf(params):
        """Score one random-forest configuration on the test split."""
        candidate = randomforest(**params)
        trees = _drop_float_entries(candidate.fit(train))
        predicted = candidate.predict(test, trees, rf_mode)
        if is_classification:
            return hf.calculate_accuracy(predicted, test['label'])
        return hf.rmse(predicted, test['label'])

    spacerf = {
        'n_bootstrap': hp.choice('n_bootstrap', range(50, len(train))),
        'n_trees': hp.choice('n_trees', range(1, 20)),
        'n_features': hp.choice('n_features', range(1, 5)),
        'dt_max_depth': hp.choice('dt_max_depth', range(1, 20)),
        'min_samples': hp.choice('min_samples', range(1, 5))
    }

    def frf(params):
        return {'loss': as_loss(hprf(params)), 'status': STATUS_OK}

    best = _tune(frf, spacerf, evals_prompt)

    n_bootstrap = best['n_bootstrap']
    n_trees = best['n_trees']
    n_features = best['n_features']
    dt_max_depth = best['dt_max_depth']
    min_samples = best['min_samples']

    print(
        'Optimal hyperparameters for random forest are: n_bootstrap = {}, n_trees = {}, n_features = {}, dt_max_depth = {}, min_samples = {}.'
        .format(n_bootstrap, n_trees, n_features, dt_max_depth, min_samples))

    print("Building random forest...")
    rf = randomforest(n_bootstrap=n_bootstrap,
                      n_trees=n_trees,
                      n_features=n_features,
                      dt_max_depth=dt_max_depth,
                      min_samples=min_samples)
    forest = _drop_float_entries(rf.fit(train))
    print("Predicting values...")
    rfpredictions = rf.predict(test, forest, rf_mode)
    if is_classification:
        print("Calculating Accuracy...")
        rfaccuracy = hf.calculate_accuracy(rfpredictions, test['label'])
        print("Accuracy of random forest predictions = {:.2f}".format(
            rfaccuracy * 100) + '%')
        results.append(rfaccuracy)
    else:
        print("Calculating RMSE...")
        rfrootmean = hf.rmse(rfpredictions, test['label'])
        print("RMSE of random forest predictions = {}".format(rfrootmean))
        results.append(rfrootmean)

    banner("     Support Vector Machines     ")

    # Hyperparameter optimization

    if is_classification:
        # A constant 'intercept' column lets the weight vector carry the
        # bias term; the data is re-split so the splits include it. The KNN
        # section below reuses these splits.
        svmdf = df
        svmdf.insert(loc=len(df.columns), column='intercept', value=1)
        train, test = hf.train_test_split(svmdf)
        X_train = train.drop(['label'], axis=1)
        X_test = test.drop(['label'], axis=1)
        y_train = train['label']
        y_test = test['label']
        print("Optimizing hyperparameters...")

        def hpsvm(params):
            """Train an SVM with ``params``; return its test accuracy."""
            candidate = SVM(**params)
            weights = candidate.SGD(X_train.to_numpy(), y_train.to_numpy())
            return hf.calculate_accuracy(y_test.to_numpy(),
                                         _predict_signs(X_test, weights))

        spacesvm = {
            'reg_strength': hp.uniform('reg_strength', 100, 10000),
            'learning_rate': hp.uniform('learning_rate', 0.00001, 0.0001)
        }

        def fsvm(params):
            return {'loss': -hpsvm(params), 'status': STATUS_OK}

        best = _tune(fsvm, spacesvm, evals_prompt)

        learning_rate = best['learning_rate']
        reg_strength = best['reg_strength']

        print(
            'Optimal hyperparameters for SVM are: learning rate = {}, regression strength = {}'
            .format(learning_rate, reg_strength))

        # Keywords match the names used while tuning (SVM(**params)), so the
        # two values cannot be swapped by a differing positional order.
        svm = SVM(reg_strength=reg_strength, learning_rate=learning_rate)
        print("training started...")
        W = svm.SGD(X_train.to_numpy(), y_train.to_numpy())
        print("training finished")
        print("testing the model...")
        y_test_predicted = _predict_signs(X_test, W)
        svmaccuracy = hf.calculate_accuracy(y_test.to_numpy(),
                                            y_test_predicted)
        print("accuracy on test dataset: {:.2f}".format(svmaccuracy * 100) +
              '%')
        results.append(svmaccuracy)
    else:
        print("SVM cannot be used for regression.")

    banner("       K nearest neighbour       ")

    if is_classification:

        X_trainknn = X_train.to_numpy()
        X_testknn = X_test.to_numpy()
        y_trainknn = y_train.to_numpy()
        y_testknn = y_test.to_numpy()

        def hpknn(params):
            """Fit a KNN with ``params``; return its test accuracy."""
            candidate = KNN(**params)
            candidate.fit(X_trainknn, y_trainknn)
            return hf.calculate_accuracy(y_testknn,
                                         candidate.predict(X_testknn))

        spaceknn = {'k': hp.choice('k', range(1, 100))}

        def fknn(params):
            return {'loss': -hpknn(params), 'status': STATUS_OK}

        # Finding best hyperparameters

        print("Optimizing hyperparameters...")

        best = _tune(fknn, spaceknn, evals_prompt)

        k = best['k']

        print('Optimal hyperparameters for KNN are: k = {}'.format(k))

        # The module-level splits are left as NumPy arrays from here on.
        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()
        print("Training started...")
        clf = KNN(k=int(k))
        clf.fit(X_train, y_train)
        print("Training finished")
        print("Predicting values...")
        predictions = clf.predict(X_test)
        knnaccuracy = hf.calculate_accuracy(y_test, predictions)
        print("Accuracy of knn predictions are {:.2f}".format(knnaccuracy *
                                                              100) + '%')
        results.append(knnaccuracy)

    ## Model Decision
    print("                                ")
    if is_classification:
        banner("     Accuracy of models     ",
               rule="----------------------------")
        print('Decision Tree      ' + "{:.2f}".format(dtaccuracy * 100) + '%')
        print('Random Forest      ' + "{:.2f}".format(rfaccuracy * 100) + '%')
        print('SVM                ' + "{:.2f}".format(svmaccuracy * 100) + '%')
        print('KNN                ' + "{:.2f}".format(knnaccuracy * 100) + '%')
    else:
        banner("     RMSE of models     ", rule="----------------------------")
        print('Decision Tree      ' + "{:.2f}".format(dtrootmean))
        print('Random Forest      ' + "{:.2f}".format(rfrootmean))

    print("                                ")

    if is_classification:
        print("Maximum accuracy score is {}".format(max(results) * 100) + '%')
    else:
        print("Minimum rmse is {}".format(min(results)))