Python SKTSVM.predict Examples

Programming Language: Python

Namespace/Package Name: scikitTSVM

Class/Type: SKTSVM

Method/Function: predict

Examples at hotexamples.com: 2

Python SKTSVM.predict - 2 examples found. These are the top-rated real-world Python examples of scikitTSVM.SKTSVM.predict, extracted from open-source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

SKTSVM(4)

fit(3)

predict(2)

score(2)

predict_proba(1)

Example #1

Show file

class DiabetesPrediction:
    def __init__(self, data="diabetes"):
        self.data = data

    def data_processing(self, fileName='pima-indians-diabetes.csv'):
        """Load the CSV, impute missing measurements and standardise features.

        Zeros in columns 1-5 are treated as missing values: they are replaced
        by NaN and then filled in by the MICE imputer.  Columns 0-7 of the
        imputed table are the candidate features and column 8 is the target;
        only features 0, 1, 2, 5, 6 and 7 are kept before scaling.

        Args:
            fileName: Path of a header-less CSV file.

        Returns:
            Tuple ``(X, ytrue, sc_X)``: the standardised feature matrix, the
            target column, and the fitted ``StandardScaler``.
        """
        dataset = read_csv(fileName, header=None)

        # A zero in these columns is treated as "not recorded": mark it as
        # NaN so the imputer below fills it in.
        zero_as_missing = [1, 2, 3, 4, 5]
        dataset[zero_as_missing] = dataset[zero_as_missing].replace(
            0, numpy.nan)

        imputer = MICE(n_imputations=100,
                       impute_type='pmm',
                       n_nearest_columns=5,
                       verbose=False)
        transformed_values = imputer.complete(dataset.values)

        X = transformed_values[:, 0:8]
        ytrue = transformed_values[:, 8]

        # feature selection: keep columns 0, 1, 2, 5, 6 and 7
        X = X[:, [0, 1, 2, 5, 6, 7]]

        sc_X = StandardScaler()
        X = sc_X.fit_transform(X)
        return X, ytrue, sc_X

    def unlabel_data(self, ytrue, seed=42, label_perc=.2):
        """Hide a random subset of labels to simulate unlabeled samples.

        One uniform number per sample is drawn from ``RandomState(seed)``;
        a sample keeps its label when its draw is below ``label_perc`` and
        is marked with -1 otherwise.

        Args:
            ytrue: Array of true labels (indexed with a boolean mask).
            seed: Seed for the random generator.
            label_perc: Fraction threshold (0..1) for keeping a label.

        Returns:
            Array with the same length as ``ytrue`` holding the kept labels
            and -1 everywhere else.
        """
        n_samples = len(ytrue)

        # -1 denotes an unlabeled point
        masked = np.array([-1] * n_samples)

        generator = np.random.RandomState(seed)
        keep = generator.rand(n_samples) < label_perc
        masked[keep] = ytrue[keep]
        return masked

    def validation(self, y_test, y_pred_test, y_pred_prob):
        """Print and plot evaluation metrics for a binary classifier.

        Prints accuracy, F1, the classification report, the confusion-matrix
        derived rates and the ROC AUC, then shows three figures: a confusion
        matrix heatmap, a histogram of the predicted probabilities and the
        ROC curve.

        Args:
            y_test: True labels (class 1 is treated as the positive class).
            y_pred_test: Predicted labels.
            y_pred_prob: Predicted score/probability of the positive class.

        Returns:
            Tuple ``(acc, sensitivity, specificity, roc_auc)``.
        """
        acc = sklearn.metrics.accuracy_score(y_test,
                                             y_pred_test,
                                             sample_weight=None)
        print("Accuracy:", acc)
        print("F1 SCORE: ", f1_score(y_test, y_pred_test))
        print("classification report: ")
        print(classification_report(y_test, y_pred_test))

        # Rows are the true classes, columns the predicted classes.
        cm = confusion_matrix(y_test, y_pred_test)
        TP = cm[1, 1]
        TN = cm[0, 0]
        FP = cm[0, 1]
        FN = cm[1, 0]

        classification_error = (FP + FN) / float(TP + TN + FP + FN)
        print("classification_error: ", classification_error)

        # Recall: when the actual value is positive, how often is the
        # prediction correct?
        sensitivity = TP / float(FN + TP)
        print("sensitivity: ", sensitivity)

        # When the actual value is negative, how often is the prediction
        # correct?  float() keeps this a true division like the other
        # ratios, independent of the interpreter's integer-division rules.
        specificity = TN / float(TN + FP)
        print("specificity: ", specificity)

        # How "precise" is the classifier when predicting positives?
        precision = TP / float(TP + FP)
        print("precision: ", precision)

        roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_prob)
        print("ROC Curve AUC Area: ", roc_auc)

        print("Confusion matrix:")
        print(cm)
        label = ["0", "1"]
        sns.heatmap(cm, annot=True, xticklabels=label, yticklabels=label)
        plt.show()

        # histogram of the predicted probability of diabetes (8 bins,
        # x-axis limited to [0, 1])
        plt.rcParams['font.size'] = 12
        plt.hist(y_pred_prob, bins=8)
        plt.xlim(0, 1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of diabetes')
        plt.ylabel('Frequency')
        plt.show()

        # ROC curve
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, y_pred_prob)
        print("fpr below")
        print(fpr)
        print("tpr below")
        print(tpr)
        plt.plot(fpr, tpr)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        plt.title('ROC curve for diabetes classifier')
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)
        plt.show()

        return acc, sensitivity, specificity, roc_auc

    def cross_valid(self, model, X, Y):
        """Cross-validate ``model`` on 10 unshuffled folds and print a summary.

        Uses the ``model_selection`` API (already used elsewhere in this
        class) instead of the removed ``sklearn.cross_validation`` module.
        The folds are contiguous and unshuffled, exactly as before; the old
        ``random_state`` argument had no effect without shuffling and is
        rejected by newer scikit-learn releases, so it is not passed.

        Args:
            model: Estimator to evaluate.
            X: Feature matrix.
            Y: Target vector.
        """
        num_folds = 10
        seed = 42
        # Seed the global NumPy RNG before the estimator is fitted.
        np.random.seed(seed)

        kfold = model_selection.KFold(n_splits=num_folds)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold)

        # express the per-fold scores as percentages
        results *= 100.0
        info = "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" % (
            results.mean(), results.std())
        print(info)

    def cross_valid2(self,
                     model,
                     X,
                     y,
                     label_perc=.8,
                     test_train_split=.2,
                     show_plot=False):
        """Evaluate a semi-supervised model over 10 random train/test splits.

        For trial ``i`` (0..9) the data is split with ``random_state=5 + i``,
        part of the training labels is hidden via ``unlabel_data`` using the
        same seed, the model is fitted on the partially labeled training set
        and its accuracy on the test split is recorded as a percentage.

        Args:
            model: Estimator exposing ``fit`` and ``predict``.
            X: Feature matrix.
            y: True labels.
            label_perc: Fraction threshold passed to ``unlabel_data``.
            test_train_split: ``test_size`` passed to ``train_test_split``.
            show_plot: When true, plot the per-trial accuracies.

        Returns:
            One-element list holding the mean accuracy (in percent).
        """
        results = []
        for i in range(0, 10):
            trial_seed = 5 + i

            # split train, test data
            X_train, X_test, ytrue, y_test = model_selection.train_test_split(
                X, y, test_size=test_train_split, random_state=trial_seed)

            # split label and unlabel sample
            ys = self.unlabel_data(ytrue, trial_seed, label_perc)

            model.fit(X_train, ys)
            y_pred_test = model.predict(X_test)
            # Only hard predictions are scored here, so the class
            # probabilities are not computed.
            accuracy = sklearn.metrics.accuracy_score(y_test,
                                                      y_pred_test,
                                                      sample_weight=None)
            results.append(accuracy * 100.0)

        print(results)
        print(
            "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" %
            (np.mean(results), np.std(results)), "label %", label_perc)

        if show_plot:
            _, ax = plt.subplots()
            plt.axis([1, 10, 0, 100])
            plt.title("10 fold CV Accuracy variance")
            sns.pointplot(x=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          y=results,
                          ax=ax,
                          x_min=0,
                          x_max=10,
                          y_min=0,
                          y_max=100)
            ax.set_xlabel('Index Number for trial')
            ax.set_ylabel('Accuracy')
            plt.show()

        return [np.mean(results)]

    def validate_algo(self, X, ytrue, model):
        """Run the repeated-split evaluation under several configurations.

        First runs ``cross_valid2`` once with its defaults and the per-trial
        plot enabled, then sweeps the labeled fraction (0.1..0.9) and the
        test-split fraction (0.1..0.9, with a labeled fraction of 0.5),
        printing and plotting the mean accuracy for each setting.

        Args:
            X: Feature matrix.
            ytrue: True labels.
            model: Estimator passed through to ``cross_valid2``.
        """
        self.cross_valid2(model, X, ytrue, show_plot=True)

        label_percs = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        result = []
        for label_perc in label_percs:
            result = numpy.append(result,
                                  self.cross_valid2(model, X, ytrue,
                                                    label_perc),
                                  axis=0)
        print(result)
        print(
            "Model 10 fold Accuracy with varrying label mean: %.2f%% (+/- %.3f%%)"
            % (np.mean(result), np.std(result)))
        self._plot_accuracy_sweep(label_percs, result,
                                  "10 fold CV Accuracy with label sample %",
                                  'Labeled Sample Percentage')

        test_train_splits = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        result = []
        for test_fraction in test_train_splits:
            result = numpy.append(result,
                                  self.cross_valid2(model, X, ytrue, .5,
                                                    test_fraction),
                                  axis=0)
        print(result)
        print(
            "Model 10 fold Accuracy with varrying test data mean: %.2f%% (+/- %.3f%%)"
            % (np.mean(result), np.std(result)))
        self._plot_accuracy_sweep(test_train_splits, result,
                                  "10 fold CV Accuracy with test sample %",
                                  'Test Sample Percentage')

    def _plot_accuracy_sweep(self, fractions, accuracies, title, xlabel):
        """Show a point plot of mean accuracy (percent) against a fraction."""
        _, ax = plt.subplots()
        plt.axis([0, 1, 0, 100])
        plt.title(title)
        sns.pointplot(x=fractions,
                      y=accuracies,
                      ax=ax,
                      x_min=0,
                      x_max=1,
                      y_min=0,
                      y_max=100)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Accuracy')
        plt.show()

    def process(self):
        """Build, fit and evaluate the supervised and semi-supervised models.

        Creates the base RBF SVM, a sigmoid SVM (``model2``), the TSVM, the
        CPLE model and the self-learning model, cross-validates ``model2``,
        then fits every model on an 80/20 train/test split and reports
        train and test metrics through ``validation``.  The plain SVM is
        fitted on the full training labels; the semi-supervised models are
        fitted on labels partially hidden by ``unlabel_data``.

        Returns:
            The value returned by ``validation`` for the self-learning model
            on the test split.
        """
        X, ytrue, _ = self.data_processing()
        self.basemodel = svm.SVC(kernel='rbf',
                                 decision_function_shape='ovr',
                                 probability=True)

        print("SVM model cross Validation")
        # create SVM model
        self.model2 = svm.SVC(kernel='sigmoid',
                              decision_function_shape='ovr',
                              probability=True,
                              gamma=.1,
                              coef0=.5)
        self.cross_valid(self.model2, X, ytrue)

        # NOTE: the validate_algo sweeps for the three semi-supervised
        # models below are currently disabled; only the headers are printed.

        # TSVM
        print("T SVM Semi Supervised Classifier cross Validation")
        self.TSVMmodel = SKTSVM(kernel='rbf')

        # S3VM (CPLE) on top of the RBF SVM
        print("CPLE SVM Semi Supervised Classifier cross Validation")
        self.S3VMmodel = CPLELearningModel(
            self.basemodel, predict_from_probabilities=True)

        # self-learning model with the RBF SVM as base model
        self.ssmodel = SelfLearningModel(self.basemodel)
        print("Fast Semi Supervised Classifier cross Validation")

        # split train, test data
        X, X_test, ytrue, y_test = model_selection.train_test_split(
            X, ytrue, test_size=.2, random_state=7)

        # split label and unlabel sample
        ys = self.unlabel_data(ytrue, 42, .8)

        # model with simple SVM (fully supervised: fitted on the true labels)
        self._fit_and_validate(self.model2, X, ytrue, ytrue, X_test, y_test,
                               "Simple SVM Model",
                               "SVM Algo Train Data Validation",
                               "SVM Algo Test Data Validation")

        # TSVM semi supervised model
        self._fit_and_validate(
            self.TSVMmodel, X, ys, ytrue, X_test, y_test,
            "TSVM Semi Supervised Fast Algo ready",
            "TSVM Semi Supervised Fast Algo Train Data Validation",
            "TSVMmodel Semi Supervised Fast Algo Test Data Validation")

        # CPLE semi supervised model
        self._fit_and_validate(
            self.S3VMmodel, X, ys, ytrue, X_test, y_test,
            "CPLE Semi Supervised Fast Algo ready",
            "CPLE Semi Supervised Fast Algo Train Data Validation",
            "CPLE Semi Supervised Fast Algo Test Data Validation")

        # self-learning ("fast") semi supervised model
        return self._fit_and_validate(
            self.ssmodel, X, ys, ytrue, X_test, y_test,
            "Semi Supervised Fast Algo ready",
            "Semi Supervised Fast Algo Train Data Validation",
            "Semi Supervised Fast Algo Test Data Validation")

    def _fit_and_validate(self, model, X_train, y_fit, y_train_true, X_test,
                          y_test, ready_msg, train_msg, test_msg):
        """Fit ``model`` and report train and test metrics; return the latter.

        ``y_fit`` are the labels used for fitting (possibly containing -1
        for unlabeled samples) while ``y_train_true`` are the true training
        labels used for scoring.
        """
        model.fit(X_train, y_fit)
        print(ready_msg)

        train_pred = model.predict(X_train)
        train_prob = model.predict_proba(X_train)[:, 1]
        print(train_msg)
        self.validation(y_train_true, train_pred, train_prob)

        test_pred = model.predict(X_test)
        test_prob = model.predict_proba(X_test)[:, 1]
        print(test_msg)
        return self.validation(y_test, test_pred, test_prob)

    def predict(self, x):
        return self.ssmodel.predict(x)

    def plot_boundary(self, pl, model, title=''):
        """Plot the data in 2-D PCA space with ``model``'s decision contour.

        The processed features are projected onto two principal components,
        ``model`` is re-fitted on that projection with partially hidden
        labels, and its predictions over a dense grid are drawn as a contour.

        Args:
            pl: Plotting module/object providing ``scatter``, ``legend``,
                ``contour``, ``axis``, ``title`` and ``show``.
            model: Estimator exposing ``fit`` and ``predict``; it is
                re-fitted here on the 2-D projection.
            title: Figure title (optional, defaults to an empty title).

        Returns:
            The ``pl`` object that was passed in.
        """
        X1, ytrue, _ = self.data_processing()

        # create PCA transform
        pca = PCA(n_components=2).fit(X1)
        pca_2d = pca.transform(X1)

        # One scatter call per class instead of one per point.  Class 1 is
        # the positive ("diabetes") class everywhere else in this class, so
        # class 0 is labelled 'No Diabetes'.
        negative = ytrue == 0
        c1 = pl.scatter(pca_2d[negative, 0], pca_2d[negative, 1],
                        c='r', marker='+')
        c2 = pl.scatter(pca_2d[~negative, 0], pca_2d[~negative, 1],
                        c='g', marker='o')
        pl.legend([c1, c2], ['No Diabetes', 'Diabetes'])

        x_min, x_max = pca_2d[:, 0].min() - 1, pca_2d[:, 0].max() + 1
        y_min, y_max = pca_2d[:, 1].min() - 1, pca_2d[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, .01),
                             np.arange(y_min, y_max, .01))

        # split label and unlabeled data for the model fitted in PCA space
        ys = self.unlabel_data(ytrue, 42, .8)

        model.fit(pca_2d, ys)
        print("PCA model built")
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        SMALL_SIZE = 14
        MEDIUM_SIZE = 16
        plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
        plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
        plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize

        pl.contour(xx, yy, Z)
        pl.axis('off')
        pl.title(title)
        pl.show()
        return pl

    def Run_Algo(self):
        """Run the full pipeline on a fresh predictor and show diagnostics.

        Creates a new ``DiabetesPrediction``, runs ``process`` on it, prints
        the self-learning model's predictions and class probabilities for
        the first 10 processed samples, and plots the decision boundaries of
        the self-learning and TSVM models.
        """
        # main code
        D = DiabetesPrediction()
        D.process()

        # testing
        X1, ytrue, _ = D.data_processing()
        print("testing first 10 samples:")
        print("Actual Y values:", ytrue[:10])
        print("Semi Supervised predicted Y values", D.predict(X1[:10, :]))
        print("Semi supervised predicted Y prob")
        print(D.ssmodel.predict_proba(X1[:10, :]))

        # plot model decision boundary; the fitted models live on D (the
        # instance that ran process()), not on self.
        D.plot_boundary(plt, D.ssmodel, "Self-learning SVM decision boundary")
        D.plot_boundary(plt, D.TSVMmodel, "TSVM decision boundary")

Example #2

Show file

File: main_interface.py Project: reirembeci/senior-capstone

# Vectorise the held-out documents with the already-fitted TF-IDF model.
test_set = TfidfVect.transform(test_data).toarray()

# Label Propagation (disabled)
# label_prop_model = helpers.get_function('LP')
# label_prop_model.fit(train_set, train_labels)
# test_predict = label_prop_model.predict(test_set)
# print(label_prop_model.score(test_set, test_labels))

# Dataset summary: -1 marks an unlabeled training sample.
print("Total size of training set: ", len(train_labels))
i = sum(1 for label in train_labels if label == -1)
print("Size of unlabeled data: ", i)
print("Size of the testing set ", len(test_data))

# TSVM: fit on the partially labeled training set, then score on the test set.
tsvm.fit(train_set, train_labels)
test_predict = tsvm.predict(test_set)
print("Accuracy: ", tsvm.score(test_set, test_labels))

# Confusion matrix with class 1 listed first, followed by derived metrics.
print("Confusion matrix:")
matrix = confusion_matrix(test_labels, test_predict, labels=[1, 0])
print(matrix)
precision, recall, f_measure = functions.fmeasure(matrix)
print("Precision: ", precision)
print("Recall: ", recall)
print("f_measure: ", f_measure)