Beispiel #1
0
def test_predict_consistent():
    """Binary predict() must agree with thresholding the class-1 probability.

    For every kernel in ``kernels`` a classifier is fitted on ``X``/``y`` and
    its hard predictions are compared with ``predict_proba(X)[:, 1] >= 0.5``.
    """
    for candidate in kernels:
        model = GaussianProcessClassifier(kernel=candidate)
        model.fit(X, y)
        hard_labels = model.predict(X)
        above_half = model.predict_proba(X)[:, 1] >= 0.5
        assert_array_equal(hard_labels, above_half)
Beispiel #2
0
def test_multi_class(kernel):
    """Check GPC on a multi-class problem.

    The predicted probabilities of every sample must sum to one, and
    predict() must match the argmax of those probabilities.
    """
    model = GaussianProcessClassifier(kernel=kernel)
    model.fit(X, y_mc)

    # Each row of class probabilities has to sum to 1.
    proba = model.predict_proba(X2)
    row_totals = proba.sum(1)
    assert_almost_equal(row_totals, 1)

    # The hard prediction is compared with the index of the largest probability.
    labels = model.predict(X2)
    most_likely = np.argmax(proba, 1)
    assert_array_equal(most_likely, labels)
Beispiel #3
0
                b_list.append(b_i)

    mol_frac_list = np.column_stack((a_list, b_list))

    return mol_frac_list, a_list, b_list


# Build the prediction grid: mol_frac_gen returns the column-stacked (a, b)
# pairs together with the separate a and b lists.
plotlist, a_frac, b_frac = mol_frac_gen(a_start=0.1,
                                        a_stop=0.8,
                                        b_start=0.1,
                                        b_stop=0.45,
                                        a_number=400,
                                        b_number=400)

expected = y_test  # reference labels (the correct answers)
predicted = classifier.predict(X_test)
# Predictions for every generated (a, b) pair.
plotted = classifier.predict(plotlist)

# One row per generated pair: predicted label plus the raw a/b fractions.
prediction_df_dict = {"predict": plotted, "A_RAW": a_frac, "B_RAW": b_frac}

prediction_df = pd.DataFrame(prediction_df_dict)

# Colour names; presumably light shades for the predicted regions -- TODO
# confirm against the plotting code that consumes this list.
color_list = [
    "lightblue", "lightcoral", "navajowhite", "plum", "lightpink", "violet",
    "lightgreen", "mediumseagreen", "lightyellow", "mediumpurple"
]

# Presumably the stronger colours used for the test points -- TODO confirm.
test_color_list = ["blue", "red", "tan", "indigo", "deeppink"]

# Global matplotlib defaults: serif font, medium-sized x tick labels.
plt.rc('font', family='serif')
plt.rc('xtick', labelsize='medium')
Beispiel #4
0
# Specify Gaussian Processes with fixed and optimized hyperparameters
# gp_fix is built with optimizer=None, gp_opt with the estimator's default
# optimizer; both start from the same 1.0 * RBF(length_scale=1.0) kernel and
# are fitted on the first train_size samples.
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

# Log marginal likelihood of each model, evaluated at its fitted kernel's theta.
print("Log Marginal Likelihood (initial): %.3f" %
      gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f" %
      gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

# Accuracy and log-loss are both computed on the training slice itself.
print("Accuracy: %.3f (initial) %.3f (optimized)" %
      (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
       accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
# Column 1 of predict_proba is passed to log_loss as the predicted probability.
print("Log-loss: %.3f (initial) %.3f (optimized)" %
      (log_loss(y[:train_size],
                gp_fix.predict_proba(X[:train_size])[:, 1]),
       log_loss(y[:train_size],
                gp_opt.predict_proba(X[:train_size])[:, 1])))

# Plot posteriors
plt.figure()
# Training samples: first feature column against the label, drawn in black.
plt.scatter(X[:train_size, 0],
            y[:train_size],
            c='k',
            label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0],
Beispiel #5
0
# Fit a Gaussian process classifier with a scaled RBF kernel on the training
# data, pickle it, and report accuracy / precision / recall / F1 on that same
# training data.
kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel, random_state=0)
gpc.fit(X_train, Y_train)

# Persist the fitted model next to the script.
pkl_fileName = "Guassian_model.pkl"
with open(pkl_fileName, 'wb') as file:
    pickle.dump(gpc, file)

acc = gpc.score(X_train, Y_train)

#acc=gpc.score(X_test,Y_test)

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# NOTE: despite the variable name these are the Gaussian *process*
# classifier's predictions on the training set.
gaussianNB_pred = gpc.predict(X_train)

precision = precision_score(Y_train, gaussianNB_pred, average='binary')
recall = recall_score(Y_train, gaussianNB_pred, average='binary')
# precision + recall is 0 when the model yields no true positives; report an
# F1 of 0.0 in that case instead of dividing by zero.
F1 = ((2 * precision * recall) / (precision + recall)
      if (precision + recall) > 0 else 0.0)
print("---------------ON TRAIN  DATA-----------------")
print(" Using Guassian Classifier\n precision=" + str(precision * 100) +
      "\nRecall=" + str(recall * 100))
print("Accuracy  ==" + str(acc * 100))
print("F1 is=" + str(F1))
print("---------------ON TRAIN DATA--------------")

###########################################
from sklearn.model_selection import KFold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
Beispiel #6
0
def GPC2(X, Y, Z):
    """Fit a Gaussian process classifier on (X, Y) and predict labels for Z.

    Y is squeezed with ``np.squeeze`` before fitting. The kernel is
    ``1.0 * RBF(length_scale=1.0)`` and the estimator is created with
    ``warm_start=True``.
    """
    labels = np.squeeze(Y)
    model = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                      warm_start=True)
    fitted = model.fit(X, labels)
    return fitted.predict(Z)
Beispiel #7
0
def test_predict_consistent(kernel):
    """Binary predict() must agree with thresholding the class-1 probability at 0.5."""
    model = GaussianProcessClassifier(kernel=kernel)
    model.fit(X, y)
    hard_labels = model.predict(X)
    above_half = model.predict_proba(X)[:, 1] >= 0.5
    assert_array_equal(hard_labels, above_half)
Beispiel #8
0
# Fit three more classifiers on (X, y) and print each one's accuracy on the
# test split, rounded to 3 decimals.
# NOTE(review): every assignment below rebinds the class name to an instance
# (e.g. XGBClassifier now refers to the fitted model), so the classes can no
# longer be instantiated afterwards. The VotingClassifier estimator list
# further down refers to these instance names, so renaming needs care.
# NOTE(review): accuracy_score is called as (y_pred, y_test); the documented
# order is (y_true, y_pred).
XGBClassifier = XGBClassifier()
XGBClassifier.fit(X, y)
y_pred = XGBClassifier.predict(X_test)
XGBClassifier_accy = round(accuracy_score(y_pred, y_test), 3)
print(XGBClassifier_accy)
from sklearn.ensemble import ExtraTreesClassifier
ExtraTreesClassifier = ExtraTreesClassifier()
ExtraTreesClassifier.fit(X, y)
y_pred = ExtraTreesClassifier.predict(X_test)
extraTree_accy = round(accuracy_score(y_pred, y_test), 3)
print(extraTree_accy)

from sklearn.gaussian_process import GaussianProcessClassifier
GaussianProcessClassifier = GaussianProcessClassifier()
GaussianProcessClassifier.fit(X, y)
y_pred = GaussianProcessClassifier.predict(X_test)
gau_pro_accy = round(accuracy_score(y_pred, y_test), 3)
print(gau_pro_accy)
from sklearn.ensemble import VotingClassifier

voting_classifier = VotingClassifier(estimators=[
    ('lr_grid', logreg_grid),
    ('svc', svm_grid),
    ('random_forest', rf_grid),
    ('gradient_boosting', gradient_boost),
    ('decision_tree_grid',dectree_grid),
    ('knn_classifier', knn_grid),
    ('XGB_Classifier', XGBClassifier),
    ('bagging_classifier', bagging_grid),
    ('adaBoost_classifier',adaBoost_grid),
    ('ExtraTrees_Classifier', ExtraTreesClassifier),
Beispiel #9
0
     [190, 90, 47], [175, 64, 39],
     [177, 70, 40], [159, 55, 37], [171, 75, 42], [181, 85, 43]]

# Labels, one per row of X.
Y = ['male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
     'female', 'male', 'male']


# train them on our data
# Going by the labels printed at the end, clf/clf1/clf2/clf3 are presumably
# the decision tree, k-nearest-neighbours, random forest and Gaussian process
# classifiers -- TODO confirm against where they are created.
clf = clf.fit(X, Y)
clf1 = clf1.fit(X, Y)
clf2 = clf2.fit(X, Y)
clf3 =clf3.fit(X, Y)

# Each model predicts a different single sample.
prediction = clf.predict([[190, 70, 43]])
prediction1 = clf1.predict([[120, 78, 49]])
prediction2 = clf2.predict([[130,89,33]])
prediction3 = clf3.predict([[150,99,55]])

# NOTE(review): this compares the predictions of two different models made on
# two different samples, so the printed value only says whether those two
# labels happen to match; it is not an accuracy against known labels.
print(accuracy_score(prediction,prediction3));



# Print each model's prediction under its label.
print('DecisionTreeClassifier')
print(prediction)
print('KNeighborsClassifier')
print(prediction1)
print('RandomForestClassifier')
print(prediction2)
print('GaussianProcessClassifier')
print(prediction3)
# Target: the 'category' column of df.
y=df['category']


# In[11]:


# gaussian process
# Fitted on all of (x, y) and scored on the same data; the score is not
# assigned to anything.
kernel=1.0*RBF(1.0)
gpc=GaussianProcessClassifier(kernel=kernel,random_state=0).fit(x, y)
gpc.score(x,y)


# In[12]:


# Predicted labels for x (despite the variable name).
x_predict=gpc.predict(x)


# In[13]:


# Per-class predicted probabilities for x.
x_predict_prob=gpc.predict_proba(x)


# In[14]:


# multi-class logistic classification
# NOTE(review): recent scikit-learn releases deprecate the multi_class
# argument -- confirm against the installed version.
mul_lr=linear_model.LogisticRegression(multi_class='multinomial',solver='newton-cg').fit(x,y)
x_predict_log=mul_lr.predict(x)
Beispiel #11
0
# For each already-fitted classifier: predict on X and compute the accuracy
# against Y as a percentage.
# NOTE(review): whether X/Y are training or held-out data is not visible here.

# Nearest Neighbors
pred_KNN = clf_KNN.predict(X)
acc_KNN = accuracy_score(Y, pred_KNN) * 100
print('Accuracy for KNN: {}'.format(acc_KNN))

# In[9]:

# RBF SVM
pred_svm_RBF = clf_svm_RBF.predict(X)
acc_svm_RBF = accuracy_score(Y, pred_svm_RBF) * 100
print('Accuracy for RBF SVM: {}'.format(acc_svm_RBF))

# In[10]:

# Gaussian Process
pred_GPC = clf_GPC.predict(X)
acc_GPC = accuracy_score(Y, pred_GPC) * 100
print('Accuracy for GPC: {}'.format(acc_GPC))

# In[11]:

# Random Forest
pred_RFC = clf_RFC.predict(X)
acc_RFC = accuracy_score(Y, pred_RFC) * 100
print('Accuracy for RFC: {}'.format(acc_RFC))

# In[12]:

# Neural Net
pred_NN = clf_NN.predict(X)
acc_NN = accuracy_score(Y, pred_NN) * 100
Beispiel #12
0
# get the evaluation set
# Element 0 of the returned value is used as the features, element 1 as the labels.
a = process_eval_depression()
x_eval = a[0]
y_eval = a[1]

# get the test set
a = process_test_depression()
x_test = a[0]
y_test = a[1]

print(x_test)

import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier

# Fit a default Gaussian process classifier on the training data and predict
# both the evaluation and the test set.
gp = GaussianProcessClassifier()
gp.fit(x_train, y_train)
y_pred = gp.predict(x_eval)
y_pred2 = gp.predict(x_test)

# Prediction for one hand-written sample with four feature values.
print(gp.predict([[0, 3, 1, 1]]))

from sklearn import metrics
# precision_score / recall_score are called with their default averaging.
# NOTE(review): that default is for binary targets -- confirm the labels are binary.
print("Precision for evaluation set:",metrics.precision_score(y_eval, y_pred))
print("Precision for test set:",metrics.precision_score(y_test, y_pred2))

print("recall for evaluation set:",metrics.recall_score(y_eval, y_pred))
print("recall for test set:",metrics.recall_score(y_test, y_pred2))

Beispiel #13
0
class WindowClassifier():
    def __init__(self, X, Y):
        """Store the full data set and split it into train and test parts.

        X holds the features and Y the labels. The split is made with
        ``test_size=0.4`` (40% test, 60% train).
        """
        self._X, self._Y = X, Y

        split = train_test_split(self._X, self._Y, test_size=0.4)
        self.X_train, self.X_test, self.y_train, self.y_test = split

    def handleEvaluate(self,
                       modelName,
                       extraParameter=None):  # X -> features to predict
        if modelName == "nn":
            self.knn_metrics = {}
            self._knn = neighbors.KNeighborsClassifier(extraParameter)
            self._knn.fit(self.X_train, self.y_train)
            self.knn(5)
            return self.knn_metrics['accuracy']
        elif modelName == "svm":
            self.svm_metrics = {}
            self._svm = self._svm = svm.SVC(C=extraParameter)
            self._svm.fit(self.X_train, self.y_train)
            self.svm(5)
            return self.svm_metrics['accuracy']
        elif modelName == "rf":
            self.rff_metrics = {}
            self._rff = RandomForestClassifier(extraParameter)
            self._rff.fit(self.X_train, self.y_train)
            self.random_forest(5)
            return self.rff_metrics['accuracy']
        elif modelName == "lr":
            self.lr_metrics = {}
            self._lr = LogisticRegression()
            self._lr.fit(self.X_train, self.y_train)
            self.lr(5)
            return self.lr_metrics['accuracy']
        elif modelName == "mlp":
            self.mlp_metrics = {}
            self._mlp = MLPClassifier()
            self._mlp.fit(self.X_train, self.y_train)
            self.mlp(5)
            return self.mlp_metrics['accuracy']
        elif modelName == "gpc":
            self.gaupc_metrics = {}
            self._gaupc = GaussianProcessClassifier()
            self._gaupc.fit(self.X_train, self.y_train)
            self.gaupc(5)
            return self.gaupc_metrics['accuracy']
        elif modelName == "dtc":
            self.detc_metrics = {}
            self._detc = DecisionTreeClassifier()
            self._detc.fit(self.X_train, self.y_train)
            self.detc(5)
            return self.detc_metrics['accuracy']
        elif modelName == "ada":
            self.adab_metrics = {}
            self._adab = AdaBoostClassifier()
            self._adab.fit(self.X_train, self.y_train)
            self.adab(5)
            return self.adab_metrics['accuracy']
        elif modelName == "gnb":
            self.ganb_metrics = {}
            self._ganb = GaussianNB()
            self._ganb.fit(self.X_train, self.y_train)
            self.ganb(5)
            return self.ganb_metrics['accuracy']
        elif modelName == "qd":
            self.qud_metrics = {}
            self._qud = QuadraticDiscriminantAnalysis()
            self._qud.fit(self.X_train, self.y_train)
            self.qud(5)
            return self.qud_metrics['accuracy']
        # call other methods here
        return None

    def handlePredict(self, X, modelName):  # X -> features to predict
        print("WindowClassifier => handlePredict()")
        if modelName == "nn":
            return self.predict_knn(X)
        elif modelName == "svm":
            return self.predict_svm(X)
        elif modelName == "rf":
            return self.predict_forest(X)
        elif modelName == "mlp":
            return self.predict_mlp(X)
        elif modelName == "gpc":
            return self.predict_gaupc(X)
        elif modelName == "dtc":
            return self.predict_detc(X)
        elif modelName == "ada":
            return self.predict_adab(X)
        elif modelName == "gnb":
            return self.predict_ganb(X)
        elif modelName == "qd":
            return self.predict_qud(X)
        return None

    # Add new classifers here
    def logistic(self, CV, n_estimate):
        """Cross-validate a k-nearest-neighbours model on the full data set.

        NOTE(review): despite the method name, the estimator built here is a
        KNeighborsClassifier and ``n_estimate`` is used as ``n_neighbors``;
        confirm whether a logistic model was intended.

        Returns a dict with the mean 'accuracy', 'precision' and 'recall'
        cross-validation scores (``cv=CV``). Nothing is stored on the instance.
        """
        model = neighbors.KNeighborsClassifier(n_neighbors=n_estimate)

        return {
            metric: np.mean(
                cross_val_score(model, self._X, self._Y, cv=CV, scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }

    def X(self):
        return self._X if self._X is not None else None

    def Y(self):
        return self._Y if self._Y is not None else None

    def Y_test(self):
        return self.y_test if self.y_test is not None else None

    '''Next few methods get the accuracy, until the webpage is developed we will redefine this again'''

    def knn_accuracy(self):
        if len(self.knn_metrics) != 0:
            return self.knn_metrics['accuracy']
        raise ValueError('Knn has not been evaluated')

    def rff_accuracy(self):
        if len(self.rff_metrics) != 0:
            return self.knn_metrics['accuracy']
        raise ValueError('Random Forest has not been evaluated')

    def svm_accuracy(self):
        if len(self.svm_metrics) != 0:
            return self.knn_metrics['accuracy']
        raise ValueError('SVM has not been evaluated')

    '''All classifiers along with metrics in interest and other predictions'''

    def knn(self, CV):
        """Cross-validate the stored KNN model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.knn_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._knn, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.knn_metrics = summary
        return summary

    def predict_knn(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictKNN()")
        # Make prediction with knn
        # print(matrix[0])
        predX = self.X_test if matrix is None else matrix
        # print(predX.shape)
        predY = self._knn.predict(predX)  # => [0, 1, 0, 1, 1, 1, 1]
        # print(predY)
        return predY

    def svm(self, CV):
        """Cross-validate the stored SVM model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.svm_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._svm, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.svm_metrics = summary
        return summary

    def predict_svm(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictSVM()")
        # Make prediction with svm
        Y_pred = self._svm.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    def random_forest(self, CV):
        """Cross-validate the stored random forest model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.rff_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._rff, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.rff_metrics = summary
        return summary

    def predict_forest(self, matrix=None):  # matrix -> features
        # Make prediction with random forest
        Y_pred = self._rff.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    def lr(self, CV):
        """Cross-validate the stored logistic regression model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.lr_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._lr, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.lr_metrics = summary
        return summary

    def mlp(self, CV):
        """Cross-validate the stored MLP model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.mlp_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._mlp, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.mlp_metrics = summary
        return summary

    def predict_mlp(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictMLP()")
        # Make prediction with svm
        Y_pred = self._mlp.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    def gaupc(self, CV):
        """Cross-validate the stored Gaussian process model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.gaupc_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._gaupc, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.gaupc_metrics = summary
        return summary

    def predict_gaupc(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictGPC()")
        # Make prediction with svm
        Y_pred = self._gaupc.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    def detc(self, CV):
        """Cross-validate the stored decision tree model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.detc_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._detc, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.detc_metrics = summary
        return summary

    def predict_detc(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictDT()")
        # Make prediction with svm
        Y_pred = self._detc.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    def adab(self, CV):
        """Cross-validate the stored AdaBoost model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.adab_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._adab, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.adab_metrics = summary
        return summary

    def predict_adab(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictADA()")
        # Make prediction with svm
        Y_pred = self._adab.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    def ganb(self, CV):
        """Cross-validate the stored Gaussian naive Bayes model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.ganb_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._ganb, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.ganb_metrics = summary
        return summary

    def predict_ganb(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictGNB()")
        # Make prediction with svm
        Y_pred = self._ganb.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    def qud(self, CV):
        """Cross-validate the stored quadratic discriminant model on the full data set.

        Computes the mean 'accuracy', 'precision' and 'recall' scores with
        ``cv=CV``, stores them in ``self.qud_metrics`` and returns that dict.
        """
        summary = {
            metric: np.mean(
                cross_val_score(self._qud, self._X, self._Y, cv=CV,
                                scoring=metric))
            for metric in ('accuracy', 'precision', 'recall')
        }
        self.qud_metrics = summary
        return summary

    def predict_qud(self, matrix=None):  # matrix -> features
        print("WindowClassifier => predictQUD()")
        # Make prediction with svm
        Y_pred = self._qud.predict(self.X_test if matrix is None else matrix)
        return Y_pred

    '''Tuning'''

    def pick_the_best_knn(self):
        """Grid-search ``n_neighbors`` for a KNN classifier on the full data set.

        Candidates are the integers 1 to 24, searched with ``cv=5``.
        Returns the best parameter dict found.
        """
        search = GridSearchCV(neighbors.KNeighborsClassifier(),
                              {"n_neighbors": np.arange(1, 25)},
                              cv=5)
        search.fit(self._X, self._Y)
        return search.best_params_

    def pick_the_best_random_forest(self):
        """Run a randomized hyper-parameter search for a random forest.

        Fits the search on the full data set and prints the best parameters
        found; the visible code does not return them.

        NOTE(review): the estimator searched is a RandomForestRegressor, while
        the rest of this class uses RandomForestClassifier -- confirm which one
        is intended.
        """
        rff = RandomForestRegressor()
        # Number of trees: 10 evenly spaced values from 200 to 2000.
        n_estimators = [
            int(x) for x in np.linspace(start=200, stop=2000, num=10)
        ]
        # Number of features to consider at every split
        # NOTE(review): 'auto' is not accepted by newer scikit-learn releases
        # -- confirm against the installed version.
        max_features = ['auto', 'sqrt']
        # Tree depths 10, 20, ..., 110, plus None.
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        min_samples_split = [2, 5, 10]
        min_samples_leaf = [1, 2, 4]
        bootstrap = [True, False]
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap
        }
        # 100 sampled parameter combinations, cv=3, fixed random_state;
        # n_jobs=-1 is passed for parallel execution.
        rf_random = RandomizedSearchCV(estimator=rff,
                                       param_distributions=random_grid,
                                       n_iter=100,
                                       cv=3,
                                       verbose=2,
                                       random_state=42,
                                       n_jobs=-1)
        rf_random.fit(self._X, self._Y)
        print(rf_random.best_params_)
Beispiel #14
0
# k-nearest-neighbours classifier fitted on the selected feature columns of df.
neigh = KNeighborsClassifier(n_neighbors=nneighbors)

neigh.fit(df[features].values, df['Survived'].values.ravel())

pre = neigh.predict(test[features].values)

# The scores are printed explicitly: a bare expression is discarded when the
# file runs as a script, which left the labels without any value.
print('accuracy on training set\n')
print(neigh.score(df[features].values, df['Survived'].values.ravel()))

#%% gaussian process classifier

gp = GaussianProcessClassifier(n_jobs=-1)

gp.fit(x_train, y_train)

pre = gp.predict(x_val)

print('accuracy on training set\n')
print(gp.score(x_train, y_train))

print('accuracy on validation set\n')
print(gp.score(x_val, y_val))

#%% logistic regressor

# L2-penalised logistic regression with the lbfgs solver, C=100 and up to
# 1000 iterations.
logreg = linear_model.LogisticRegression(solver='lbfgs',
                                         penalty='l2',
                                         C=1.0e2,
                                         max_iter=1000,
                                         warm_start=True)
Beispiel #15
0
def _fit_and_report(name, clf, X, y, X_test, labelname, k_fold):
    """Fit ``clf``, print its statistics and CV scores, return predictions.

    Args:
        name: Human-readable classifier name used in the printed report.
        clf: Unfitted scikit-learn style estimator (``fit``/``predict``).
        X: Training feature matrix.
        y: Training labels.
        X_test: Test feature matrix.
        labelname: Suffix appended to ``name`` in the report title.
        k_fold: Cross-validation splitter handed to ``cross_val_score``.

    Returns:
        The predictions on ``X`` followed by the predictions on ``X_test``,
        concatenated into one array.
    """
    start_time = time.time()
    clf.fit(X, y)
    runtime = str(time.time() - start_time)

    y_train = clf.predict(X)
    y_test = clf.predict(X_test)
    print_classification_stats(name + " " + labelname, y, y_train, y_test,
                               runtime)

    # 'mean_squared_error' was renamed to 'neg_mean_squared_error' in
    # scikit-learn (the scores were already negated), and the old name has
    # since been removed.
    cv = cross_val_score(clf, X, y, cv=k_fold,
                         scoring='neg_mean_squared_error')
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("CV Score: " + str(cv))
    print("CV Average: " + str(sum(cv) / float(len(cv))))
    print_line()

    return np.concatenate((y_train, y_test))


def run_classifications(X, y, X_test, labelname, k, features, headers):
    """Train the enabled classifiers and collect their predictions.

    When ``features`` is not -1, ``X``/``X_test`` are first replaced by the
    column-wise concatenation of the chi2-selected and f_classif-selected
    feature matrices (so a column picked by both selectors appears twice).

    Args:
        X: Training feature matrix.
        y: Training labels.
        X_test: Test feature matrix.
        labelname: Text appended to each classifier name in printed reports.
        k: Number of folds for the shuffled ``KFold`` (``random_state=0``).
        features: -1 disables feature selection; any other value is passed
            through to the ``feature_selection_*`` helpers.
        headers: Passed through to the ``feature_selection_*`` helpers.

    Returns:
        dict mapping a short classifier key (currently ``'rf'`` and ``'gp'``)
        to that classifier's train predictions followed by its test
        predictions.
    """
    if features != -1:
        X1, X_test1, _ = feature_selection_chi2(X, X_test, y, features,
                                                headers)
        X2, X_test2, _ = feature_selection_f_classif(X, X_test, y, features,
                                                     headers)
        X = np.concatenate((X1, X2), axis=1)
        X_test = np.concatenate((X_test1, X_test2), axis=1)

    print('{} : {}'.format("Feature Selected X", X.shape))
    print('{} : {}'.format("Feature Selected X_test", X_test.shape))
    print_line()

    k_fold = KFold(n_splits=k, shuffle=True, random_state=0)

    # (result key, report name, estimator), in execution order.
    # Entries that were disabled in the original stay commented out;
    # uncomment a line to run that classifier as well.
    classifiers = [
        # ('lr2', "L2 Logistic Regression", LogisticRegression()),
        # ('lr1', "L1 Logistic Regression", LogisticRegression(penalty='l1')),
        ('rf', "Random Forest", RandomForestClassifier()),
        # ('knn', "KNN", KNeighborsClassifier(4)),
        # ('svc', "Linear SVM", SVC(kernel='linear', C=0.025)),
        # ('rbf', "RBF SVM", SVC(gamma=2, C=1)),
        ('gp', "Gaussian Process",
         GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)),
        # ('dt', "Decision Tree", DecisionTreeClassifier()),
        # ('mlp', "Neural Net", MLPClassifier()),
        # ('ab', "AdaBoost", AdaBoostClassifier()),
        # ('gnb', "Naive Bayes", GaussianNB()),
        # ('qda', "QDA", QuadraticDiscriminantAnalysis()),
    ]

    ret_predictions = {}
    for key, name, clf in classifiers:
        ret_predictions[key] = _fit_and_report(name, clf, X, y, X_test,
                                               labelname, k_fold)
        # Disabled in the original as well: bootstrap confidence interval.
        # lo, hi = prediction_Error_Bootstrap(clf, X, y)
        # print(".95 Confidence Interval: " + str(lo) + " - " + str(hi))

    return ret_predictions
Beispiel #16
0
def main():
    """Train nine classifiers on song features and print their accuracy.

    The data is hard-coded: 45 training and 45 test titles, each described
    by four previously normalised features (note density, i.e. average
    number of notes per second; initial tempo; bass register importance;
    amount of arpeggiation).

    Labels 0-8 stand for, in order: Blues, Classic Rock, Classical, Country,
    Electronic, Metal, Hip-Hop, Jazz, Pop (five consecutive titles each).
    The same label list ``Y`` is used as ground truth for both the training
    and the test titles, i.e. the test titles are assumed to be listed in
    the same genre order as the training titles.

    Prints one accuracy line per classifier; returns nothing.
    """
    # Training data -- 45 titles of 9 different genres, 5 per genre.
    trainList = [
        [-0.766678522671417, -1.4134894457127, 0.0122606256864645, 0.43151719310051],
        [-1.25246164516747, -1.0661631348805, 1.66340797727991, 1.21419919717718],
        [-1.40405821644502, -1.0661631348805, 0.403817590836005, -1.09638443503824],
        [-1.3577680200252, 0.971484555335123, 0.649660918402909, -0.553046197585348],
        [-1.12520463926371, -1.11247330965812, 2.62171635167137, 0.652347745189381],
        [2.43551771310301, 0.369452283225963, 0.434310848102859, -0.134141978619881],
        [0.477945144638293, -0.232579988883197, 0.0594181118767843, -0.0658562454473348],
        [0.870921477457971, -0.394665600604894, -0.400726511974132, 0.479514784984597],
        [1.45707350937701, -0.440975775382522, -0.308871070886021, 1.06130302355337],
        [0.317025254101156, 0.276831933670708, 0.0302952889775195, -0.313278979682085],
        [0.344564567366374, -0.811457173603543, 0.0058742100751454, -0.772179500134836],
        [-1.43344191386319, -0.950387697936426, 0.189561226701406, -1.63399440732091],
        [-0.509888362597354, 0.624158244502915, -1.64915124629092, -2.57380317251345],
        [-0.688984501422195, -0.440975775382522, -0.573876378367778, -0.911725215749629],
        [1.0931214550796, 0.415762458003591, -0.239951378380251, -0.647473539619574],
        [0.305762422927743, 0.0915912345601968, 0.515319672402441, 0.0357254444966523],
        [0.0624347875482635, -0.37151051321608, -0.0786751179102309, -0.50150779894908],
        [-1.15105910134567, -1.25140383399101, 0.486438451417823, -1.57481116359108],
        [-0.705237572473972, 1.89768805088768, 1.29234485530951, 0.154049060043043],
        [-0.044785112264018, -0.440975775382522, 0.14184574295472, 0.528489953548707],
        [0.890374866596739, 0.23052175889308, -0.728919490615075, -0.395906062340086],
        [0.338609204986195, 0.0915912345601968, 0.00240196126908647, 0.675915446802265],
        [0.702159802983765, 0.0452810597825691, 1.56736511491168, -0.454043070733069],
        [1.57109652803916, 0.161056496726638, -1.09636595497638, 1.14348078111971],
        [-0.37196172410778, 0.554692982336474, -0.368561234334016, 0.907415824160339],
        [0.600507499201812, 1.4345863031114, 1.00839825378751, 0.955005214604942],
        [1.10244499892636, 2.17554909955344, 1.81454080734429, 0.363035159499528],
        [0.153359658817574, -1.18193857182456, 1.6354924054874, -0.00454647255469575],
        [0.916855697221869, 1.24934560400089, 1.93419178349293, 0.244811833289804],
        [-0.0918102995735832, -0.927232610547612, 0.127335162294101, 0.0839276733003873],
        [1.37119861987878, -0.811457173603543, -1.52856876509287, 0.67106206208497],
        [0.93323082818666, -0.603061387104218, -0.400227918868385, 1.8010528221493],
        [1.07383037479694, -0.950387697936426, -0.573990235637437, 1.29493370762047],
        [-0.583104951959534, -0.834612260992357, -1.43566596814499, 1.40195387262669],
        [-1.19986159572819, -1.29771400876863, -0.12304791014598, 1.49648769523065],
        [-0.372297246211269, 1.66613717699954, -1.11177695290247, -0.910589434995976],
        [0.399464128365263, -0.834612260992357, 0.160241864875368, -0.855590862944964],
        [-1.14389931582131, -0.649371561881846, -1.38617972788128, -2.7441128533059],
        [-0.771977276287568, 0.253676846281894, -0.83487270827995, 0.255986573229018],
        [-1.36950601865057, -1.64504031960084, 0.0851051444620558, -2.01914851836497],
        [-0.233730884591378, -0.927232610547612, 0.400787709093632, 0.196017786581242],
        [0.613634713378625, -0.209424901494383, 0.171104729047728, -0.688776703429684],
        [-0.964928464375092, 1.48089647788903, -1.63760172273251, -1.13536431429426],
        [0.262997873689561, -0.880922435769985, 0.45336491398644, 0.160631572105278],
        [2.46876985514246, -1.11247330965812, -1.8129457869236, 1.20203427847653],
    ]

    # Test data -- 45 further titles, same four features.
    testList = [
        [-1.42550367089293, -0.741991911437102, 1.03907139337304, -0.524103659257335],
        [-1.4713199318185, -0.834612260992357, 0.610848921601832, -1.07153109849005],
        [0.0483469943547123, 1.06410490489038, 1.41483496062355, -0.736087518180374],
        [-0.973697981011053, 0.693623506669357, -0.180733411152442, -1.16782656220974],
        [-1.0181376270085, -1.0661631348805, -0.664143752552901, 0.270549075112729],
        [0.0740472545469077, -0.255735076272011, 0.582212294684255, -0.128785564777901],
        [0.71758688500379, 0.554692982336474, 0.133199237498739, 0.581912344088212],
        [-0.137417909220219, 1.71244735177717, 0.60518579551654, -0.154413314075152],
        [0.271454224529076, -1.25140383399101, 0.613087382121505, -0.772701130690326],
        [0.864020291135146, -0.209424901494383, -0.191872343613909, 0.217888209405153],
        [-0.810894246904719, 1.15672525444563, -0.172523673097193, -1.04070772100896],
        [3.20706768407871, 1.71244735177717, -0.643132149404525, -0.357730150445054],
        [-0.250691095660947, -0.533596124937777, -1.30676064627265, -2.08822794623238],
        [-0.795521491862785, 0.137901409337824, 0.150083129455051, -1.40763364196608],
        [0.372076487690884, -0.487285950160149, -0.849538792435035, -1.26744421775474],
        [-0.65660384990208, -1.29771400876863, -0.180568359917071, 1.42353891497765],
        [-0.77988391630925, -0.37151051321608, 1.21149957238249, -0.365286553173383],
        [-1.30604172925065, -0.139959639327942, -0.229290080643909, -1.25699057202116],
        [-1.04110227888866, -1.01985296010287, -0.330316732364811, 1.54321130870156],
        [0.61373886561153, -0.00102911499505855, -0.2808348691897, 0.53675115196638],
        [0.49875146997193, -0.0936494645503139, -0.832360238451796, 0.651893184309962],
        [2.82271114057672, -0.139959639327942, 0.0440903522786204, 0.402991197507736],
        [0.0463327895684618, 0.137901409337824, -0.523226280915796, 1.40875258840225],
        [-0.0296713299119058, 0.323142108448335, -1.02406080673175, 0.319768352634448],
        [0.781983086248411, 1.9439982256653, 0.181776380407751, 1.84028755471701],
        [0.782278454549117, -0.533596124937777, -0.276684483711903, -0.534461148257612],
        [-0.0876172574377816, 0.554692982336474, 1.72524522236284, 0.0156455721085213],
        [0.203902110434939, 1.24934560400089, 1.79628504094392, 0.437922107143642],
        [1.32483987974607, 2.63865084732972, 1.71067873124182, 0.660006834959962],
        [0.851227474200866, 2.17554909955344, 1.95118989770912, 0.772316815484984],
        [-0.403488104826915, -0.139959639327942, -2.05730647943417, 1.34945888234721],
        [-1.15063158429676, -0.950387697936426, 0.336974769326176, -1.58007717324087],
        [-0.532153920088487, 1.01779473011275, -1.56126596914715, -0.00905096105451718],
        [-0.67501661729555, -0.139959639327942, -0.952051167678154, -0.0422238467014761],
        [-0.310849370568786, 0.0915912345601968, -0.471342095626615, 1.41103820392797],
        [-0.208964531651521, 0.323142108448335, -0.361903894340191, -0.275842503944826],
        [-1.04654416417428, -0.464130862771335, -0.949945286117449, 0.840457233938729],
        [-0.951584146504103, 1.29565577877852, 0.323269958277516, -0.313860836548317],
        [0.184463802998577, -0.186269814105569, -0.672037942893625, 0.240047252985395],
        [-0.0698549841159306, 2.54603049777446, -0.956718963479093, -0.654542118681162],
        [0.191338979070559, 0.253676846281894, -0.723803925093155, -0.137064058860264],
        [0.358325281596958, -0.139959639327942, -0.277426681630464, -0.0153182994681455],
        [-1.43216073444622, -0.163114726716755, 1.53782098070242, 1.1224554353098],
        [-0.536067966587311, -0.139959639327942, 1.0180011125623, 1.47349893131731],
        [0.666671687756801, -1.4134894457127, -1.89213174857686, 0.956901667933616],
    ]

    # Genre label for row i of trainList (and, by assumption, of testList).
    Y = [
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4,
        4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8
    ]

    # NOTE: the original zipped testList with Y and immediately unzipped it
    # under the name "shuffle" without ever shuffling, so it was a no-op and
    # is omitted. Do not add a real shuffle of Y here: Y also labels the
    # (unshuffled) training rows.

    # (printed prefix, estimator) in the original construction/fit order.
    # The prefixes are kept verbatim, including their inconsistent casing.
    classifiers = [
        # With 10 neighbors, accuracy rate jumps from 0.28 to 0.35
        ("KNN Accuracy rate: ", KNeighborsClassifier(n_neighbors=10)),
        ("SVC Accuracy rate: ",
         SVC(random_state=2, decision_function_shape='ovr')),
        ("MLP Accuracy Rate: ", MLPClassifier(alpha=1)),
        ("Gaussian Accuracy Rate: ",
         GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)),
        ("Decision Tree Accuracy Rate: ", DecisionTreeClassifier()),
        ("Random Forest Accuracy Rate: ", RandomForestClassifier()),
        ("Naive Bayes Accuracy Rate: ", GaussianNB()),
        ("AdaBoost Accuracy Rate: ", AdaBoostClassifier()),
        ("Quadratic Discriminant Analysis Accuracy Rate: ",
         QuadraticDiscriminantAnalysis()),
    ]

    X = list(trainList)
    for _, model in classifiers:
        model.fit(X, Y)

    for prefix, model in classifiers:
        # Predict the whole test set in one call: estimators expect a 2-D
        # (n_samples, n_features) input, and passing a single 1-D sample is
        # rejected by current scikit-learn releases.
        predictions = [int(label) for label in model.predict(testList)]
        accuracy = sklearn.metrics.accuracy_score(Y, predictions)
        # Each classifier reports its own accuracy (the QDA line used to
        # print the MLP accuracy by mistake).
        print(prefix + str(accuracy) + "\n")
Beispiel #17
0
 print(pattern % ("Process", "Tree"))
 tree.fit(X_train, y_train)
 print(pattern % ("Tree", "Bayes"))
 bayes.fit(X_train, y_train)
 print("Finish training Bayes.")
 # make a predictions
 data = X_test
 target = y_test
 print("All classifiers trained. Start making predictions")
 pattern = "Making predictions for %s"
 print(pattern % "Forest")
 forest_results = forest.predict(X_test)
 print(pattern % "KNN")
 knn_results = knn.predict(X_test)
 print(pattern % "Process")
 process_results = process.predict(X_test)
 print(pattern % "Tree")
 tree_results = tree.predict(X_test)
 print(pattern % "Bayes")
 bayes_results = bayes.predict(X_test)
 # print metrics
 from sklearn.metrics import classification_report
 print "Forest results: "
 print classification_report(y_true=y_test, y_pred=forest_results)
 print "KNN results: "
 print classification_report(y_true=y_test, y_pred=knn_results)
 print "Gaussian process results: "
 print classification_report(y_true=y_test, y_pred=process_results)
 print "Decision Tree results: "
 print classification_report(y_true=y_test, y_pred=tree_results)
 print "Bayes results: "
def trainPredict(subjectid, makeplot=False):
    """Train a GP classifier for one subject and summarise its predictions.

    The classifier is trained on ``csvdata/<subjectid>_sim.csv`` and applied
    to ``csvdata/<subjectid>_rival.csv``; only summary statistics of the
    test predictions are returned.

    Parameters
    ----------
    subjectid : str
        Identifier used to build the two CSV file names.
    makeplot : bool, optional
        When true, ``summaryplot`` is called with the test data, the
        predictions and the fitted classifier.

    Returns
    -------
    tuple of six dicts
        ``(meanprediction, predictiondev, predictionaccuracy,
        predictioncourse, nextcourse, afterdominant)``. Every value is a
        mean/std over a boolean selection of test points, so it is NaN when
        that selection is empty.
    """
    print("testing participant " + subjectid)
    # Load training data from the file matlab generates
    traindata = np.genfromtxt('csvdata/' + subjectid + '_sim.csv',
                              delimiter=',',
                              missing_values=['NaN', 'nan'],
                              filling_values=None)
    # Clean + downsample this data
    trainx, trainy = cleandata(traindata, downsamplefactor=20)

    # Train a Gaussian Process. RBF() with default arguments has a single
    # scalar length scale (despite the variable name).
    anisokern = kernels.RBF()
    gp = GaussianProcessClassifier(kernel=anisokern)
    gp.fit(trainx, trainy)
    # Drop every reference to the training data to keep peak memory down.
    traindata = trainx = trainy = None

    # Load and clean the test data
    testdata = np.genfromtxt('csvdata/' + subjectid + '_rival.csv',
                             delimiter=',',
                             missing_values=['NaN', 'nan'],
                             filling_values=None)
    testx, testy = cleandata(testdata, downsamplefactor=4)
    testdata = None  # clear from memory

    # Work out percentage in percept for each data point
    percentages, nextpercept = assign_percentage(testy)

    # Get a prediction for all points in the test data
    predicty = gp.predict(testx)
    proby = gp.predict_proba(testx)

    if makeplot:
        # The original passed `participant`, which is not defined in this
        # function; the subject identifier is the only candidate in scope.
        summaryplot(subjectid, testx, testy, predicty, proby, gp)

    # Upper edges of the ten 0.1-wide percentage bands.
    cutoffs = np.linspace(0.1, 1, 10)

    def in_band(cutoff):
        """Mask of points whose percentage lies in (cutoff - 0.1, cutoff)."""
        return (percentages < cutoff) & (percentages > cutoff - 0.1)

    # Summarise prediction by reported percept. Column 1 of `proby` is the
    # probability of the second class in gp.classes_.
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    meanprediction = {'mean' + percept: proby[testy == value, 1].mean()
                      for percept, value in perceptindices.items()}
    predictiondev = {'stdev' + percept: proby[testy == value, 1].std()
                     for percept, value in perceptindices.items()}
    predictionaccuracy = {
        'acc' + percept:
        (predicty[testy == value] == testy[testy == value]).mean()
        for percept, value in perceptindices.items()}

    # Summarise prediction by percentage in percept
    predictioncourse = {
        'timecourse' + percept + str(cutoff):
        proby[(testy == value) & in_band(cutoff), 1].mean()
        for percept, value in perceptindices.items()
        for cutoff in cutoffs}

    # Summarise mixed percept time courses by the next percept.
    # NOTE(review): the literal 0 is presumably perceptindices['mixed'] --
    # confirm against the definition of perceptindices.
    nextcourse = {
        'nextcourse' + percept + str(cutoff):
        proby[(testy == 0) & in_band(cutoff) &
              (nextpercept == perceptindices[percept]), 1].mean()
        for percept in ['highfreq', 'lowfreq']
        for cutoff in cutoffs}

    # Time courses of each dominant percept, split by the percept following it
    afterdominant = {
        'after' + percept + "_" + after + "_" + str(cutoff):
        proby[(testy == perceptindices[percept]) & in_band(cutoff) &
              (nextpercept == perceptindices[after]), 1].mean()
        for percept, after in [('highfreq', 'mixed'),
                               ('highfreq', 'lowfreq'),
                               ('lowfreq', 'mixed'),
                               ('lowfreq', 'highfreq')]
        for cutoff in cutoffs}

    # Only return the summarised data
    return meanprediction, predictiondev, predictionaccuracy, \
        predictioncourse, nextcourse, afterdominant
Beispiel #19
0
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

# Load the financial statement table; every row is identified by
# (ticker, fiscal year, fiscal period).
data = pd.read_csv(
    'finance_data.csv',
    index_col=['Ticker', 'Fiscal Year', 'Fiscal Period'],
)
print(data.columns)

# Target column and standardised feature matrix (target and the two
# non-feature columns are removed before scaling).
Y = data['pos_neg']
X = scale(data.drop(columns=['pos_neg', 'shifted_chg', 'report_date']).values)

# shuffle=False keeps the original row order: the last 20% of the rows
# form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=.2, shuffle=False)
h = .02  # step size in the mesh (not used in this section)

# Gaussian process classifier with a scaled RBF kernel.
kernal = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernal)
gpc.fit(X_train, y_train)

# Hold-out accuracy, followed by the first ten true and predicted labels.
Z = gpc.predict(X_test)
acc = accuracy_score(y_test, Z)
print(acc)
print(y_test[:10])
print(Z[:10])
Beispiel #20
0
     [181, 85, 43]]
# Target labels, one per row of the feature list X defined above.
Y = [
    'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
    'female', 'male', 'male'
]
# Fit every classifier on the same training data.
clf = clf.fit(X, Y)
clf1 = clf1.fit(X, Y)
clf2 = clf2.fit(X, Y)
clf3 = clf3.fit(X, Y)
clf4 = clf4.fit(X, Y)
# clf5=clf5.fit(X,Y)
# Predict the label of one unseen sample with each fitted classifier.
prediction = clf.predict([[190, 70, 43]])
prediction1 = clf1.predict([[190, 70, 43]])
prediction2 = clf2.predict([[190, 70, 43]])
prediction3 = clf3.predict([[190, 70, 43]])
prediction4 = clf4.predict([[190, 70, 43]])
# prediction5=clf5.predict([[190,70,43]])
# # print(prediction)
# print(prediction1)
# print(prediction2)
# print(prediction3)
# print(prediction4)
# NOTE(review): despite its name, max_acc is the maximum over the predicted
# label arrays themselves, not over accuracy scores, so the branch below does
# not select the most accurate model. Computing a real accuracy per classifier
# is probably what was intended -- confirm with the author.
max_acc = max(prediction, prediction1, prediction2, prediction3, prediction4)
# Print the name attached to the first classifier whose prediction equals
# max_acc. NOTE(review): there is no branch for prediction4, so nothing is
# printed when only clf4 matches.
if max_acc == prediction:
    print("SVM")
elif max_acc == prediction1:
    print("LogisticRegression")
elif max_acc == prediction2:
    print("DecisionTreeClassifier")
elif max_acc == prediction3:
    print("GaussianNB")
Beispiel #21
0
                 #for st in range(mikos):
                    # print('The gene',name[st],'is according to our algorithm',tava[st])
                    
                 teliko=pd.DataFrame(list(zip(name,tava)),columns=['GENES','TARGET VALIDATION '])
                 teliko.to_excel('output_svc.xlsx', engine='xlsxwriter')
                 if classifiers == 6  or classifiers == 0:
 
                     #------------------------------------------------------------------------
                     #--------------------------PREDICTION PART RF---------------------------------
                     #-------------------------------------------------------------------------
                     
                     pred=[]
                     for st in range(mikos):
                         t=tixera[st]
                         ti=[int(s) for s in t.split(',')]
                         ni=gausian.predict([ti])
                         pred.append(ni[0])
                     #------------------------------------------------------------------------
                     #--------------------------Create final matrix ---------------------------------
                     #-------------------------------------------------------------------------
                     # given that in training 1 was as good target and as a bad one 
                     tava=[]
                     for pr in pred :
                         if pr > 0.95 :
                             t=' GOOD potential target'
                             tava.append(t)
                         else :
                             t=' BAD potential target'
                             tava.append(t)
                     name=[]
                     for st in range(mikos):
        ax.text(1.0, 2.5, r"$\Delta t_2 = %.1f$"%i[1], fontsize=18)
        #pl.draw()
        #nsnIa = np.array([len(tmp2[i-2]) for i in range(4)]).sum()
        
        #label = np.zeros(len(color))
        #label[:nsnIa] = 1

        if REFIT:
            clf.fit(X, y)
            pickle.dump(clf, open(thisdir+"/GPmodel.pkl", 'wb'))
        else:
            # load the model from disk
            clf = pickle.load(open(thisdir+"/GPmodel.pkl", 'rb'))
            

        y_pred = clf.predict(X)
        #in sample accuracy
        accuracy = accuracy_score(y, y_pred)
        print("Accuracy (train) for %0.1f%% " % (accuracy * 100))

        #calculate probability everywhere in the phase space
        '''
        xx = np.linspace(phasespace_complete[:,0].min()-0.1,
                         phasespace_complete[:,0].max()+0.1, 100)
        yy = np.linspace(phasespace_complete[:,1].min()-0.1,
                         phasespace_complete[:,1].max()+0.1, 100).T
        '''
        resolution = 50
        xx = np.linspace(xmin, xmax, resolution)
        yy = np.linspace(ymin, ymax, resolution)
        xx, yy = np.meshgrid(xx, yy)
Beispiel #23
0
class Classifier(object):
    """Train, persist and apply a benign ("ben") / malicious ("mal") classifier.

    The class keeps the fitted model in ``self.clf`` and accumulates per-file
    outcomes in ``self.files``, ``self.status`` and ``self.result`` so they can
    be written out with :meth:`dump_json`. Every message passed to :meth:`log`
    is printed and also appended to ``self.global_log``.

    Label convention used throughout: the model predicts ``1`` for benign and
    ``0`` for malicious samples.
    """

    def __init__(self,
                 sample_dir,
                 crossval=False,
                 action="",
                 name="",
                 model=""):
        """Initialise an empty classifier.

        Args:
            sample_dir: Directory handed to ``load_csv_files`` when the
                training data is loaded.
            crossval: When truthy, :meth:`ret_report` reports cross-validated
                predictions instead of training-set predictions.
            action: Accepted for backward compatibility; not used.
            name: Run name stored in the output dictionary. The value
                ``"tra"`` triggers loading of the training data right away.
            model: Model identifier written to the JSON output.
        """
        self.sample_dir = sample_dir
        self.ben_train_pd = None
        self.mal_train_pd = None
        self.all_train_pd = None
        self.groundtruth_tr_pd = None
        self.clf = None
        self.two_gram = None
        self.crossval = crossval

        self.name = name
        self.model = model

        self.outdict = {}
        self.outdict["name"] = name
        self.result = []
        self.status = []
        self.files = []
        self.num_files = 0

        self.global_log = ""

        if self.name == "tra":
            self.load_groundtruth()

    @staticmethod
    def _label_from_prediction(value):
        """Map a numeric prediction to "ben" (1) / "mal" (0), else None."""
        if value == 1:
            return "ben"
        if value == 0:
            return "mal"
        return None

    def log(self, msg):
        """Print ``msg`` with a timestamp and append it to the global log."""
        time_str = time.strftime('%X %x %Z')
        log_msg = "[%s] %s\n" % (time_str, msg)
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(log_msg)
        self.global_log += log_msg

    def dump_log(self, out_pn):
        """Append everything logged so far to the file ``out_pn``."""
        with open(out_pn, 'a') as f:
            f.write(self.global_log)

    def classify_one(self, in_pn):
        """Classify the sample stored in the header-less CSV file ``in_pn``.

        Returns:
            ``(label, dataframe)`` where ``label`` is "ben" or "mal" and
            ``dataframe`` is the loaded sample; ``None`` when the model
            predicts a value other than 0 or 1.
        """
        current_sample_pd = pd.read_csv(in_pn, header=None, engine='python')

        label = self._label_from_prediction(
            self.clf.predict(current_sample_pd)[0])
        if label is None:
            return None
        return label, current_sample_pd

    def dump_json(self, out_pn, input_pn, null_file=False):
        """Write the collected results to ``out_pn`` as JSON.

        Args:
            out_pn: Path of the JSON file to write.
            input_pn: Path of the input JSON; only the length of its
                ``"files"`` list is used.
            null_file: When true, emit one ``None`` file entry per input file
                and repeat the first recorded status/result for each of them.
                Otherwise emit the recorded files, status and results as-is.
        """
        with open(input_pn, 'r') as fh:
            input_dict = json.load(fh)
        fls = [None for _ in input_dict["files"]]

        if null_file:
            self.outdict["files"] = fls
            self.outdict["num_files"] = len(fls)
            # Evaluated per entry, so an empty file list never touches
            # self.status[0] / self.result[0].
            self.outdict["status"] = [self.status[0] for _ in fls]
            self.outdict["result"] = [self.result[0] for _ in fls]
        else:
            self.outdict["result"] = self.result
            self.outdict["files"] = self.files
            self.outdict["status"] = self.status
            self.outdict["num_files"] = len(self.files)

        self.outdict["model"] = self.model

        with open(out_pn, 'w') as f:
            f.write(json.dumps(self.outdict, indent=4, separators=(',', ': ')))

    # should read json file to know the groundtruth
    def classify_all(self, json_file):
        """Classify every file listed in ``json_file`` and record the outcome.

        The JSON must provide ``"files"`` and a parallel ``"tags"`` list whose
        ``"tag_b"`` entries hold the ground truth ("ben" / "mal"). Samples are
        read from ``/tmp/output/<groundtruth>csv/<file>.csv``.

        Raises:
            ValueError: If the model predicts a value other than 0 or 1.
        """
        basedir = "/tmp/output"
        with open(json_file, 'r') as fh:
            input_dict = json.load(fh)
        filelist = input_dict["files"]
        tag = input_dict["tags"]

        for i, filename in enumerate(filelist):
            groundtruth = tag[i]["tag_b"]
            in_pn = os.path.join(basedir, "%scsv" % groundtruth,
                                 filename + ".csv")
            current_sample_pd = pd.read_csv(in_pn,
                                            header=None,
                                            engine='python')

            result_num = self.clf.predict(current_sample_pd)[0]
            result = self._label_from_prediction(result_num)
            if result is None:
                # Previously `result` stayed unbound (or kept the value of the
                # previous file) in this case.
                raise ValueError("unexpected prediction %r for %s" %
                                 (result_num, filename))

            self.log("** Classifying %s:" % filename)

            status = "correct" if result == groundtruth else "incorrect"
            tag_dict = {"tag_a": groundtruth, "tag_b": result}
            msg = "GroundTruth: %s, Predict: %s, result: %s" % \
              (groundtruth, result, status.upper())
            self.dump_indv_result(filename, msg)

            self.log("  >> classified as %s. This is %s result" %
                     (result.upper(), status.upper()))

            self.files.append(filename)
            self.status.append(status)
            self.result.append(tag_dict)

    def classify_one_array(self, in_array):
        """Classify an in-memory sample.

        Returns:
            ``(label, in_array)`` with ``label`` "ben" or "mal"; ``None`` when
            the model predicts a value other than 0 or 1.
        """
        label = self._label_from_prediction(self.clf.predict(in_array)[0])
        if label is None:
            return None
        return label, in_array

    def dump_indv_result(self, filename, msg):
        """Write ``msg`` to ``/mnt/output/<filename>.log.txt``."""
        out_dir = os.path.join("/mnt", "output")
        out_pn = os.path.join(out_dir, filename + ".log.txt")
        with open(out_pn, 'w') as f:
            f.write(msg + "\n")

    def perturb_candidate(self, json_file):
        """Generate a perturbed binary for every sample classified as "mal".

        For each file in ``json_file["files"]`` the sample is classified; for
        malicious ones the closest benign training sample is looked up, the
        two-grams of the most differing features are collected, and the
        binary in ``/mnt/input`` is patched into ``/mnt/output/<file>_pert``.
        The outcome ("success" when the patched file is non-empty, otherwise
        "fail") is recorded in ``files`` / ``status`` / ``result``.
        """
        with open(json_file, 'r') as fh:
            input_dict = json.load(fh)
        filelist = input_dict["files"]

        ben_sample_pd = self.ben_train_pd

        # let's classify this sample
        for filename in filelist:
            pert_candidate = []

            self.log("[*] Extracting information from %s" % filename)
            csv_pn = os.path.join("/tmp", "output", "malcsv",
                                  filename + ".csv")
            result, current_array = self.classify_one(csv_pn)

            if result == "ben":
                self.log("  >>> doesn't necessary to perturb %s" % filename)
                continue

            perturbed_array = current_array.copy().values[0]
            closest_benign = find_closest_benign(ben_sample_pd,
                                                 perturbed_array)
            sortedlist = get_highest_negdiff_feature(
                closest_benign, perturbed_array, 100)

            self.log(" >>> should minimize this index")
            for item in sortedlist:
                two_gram = ret_twogram_from_idx(self.two_gram, item[0])
                self.log("idx %d, value %d, two-gram: %s" %
                         (item[0], item[1], two_gram))

                # Collect both halves of "a|b" two-grams, without duplicates
                # and in first-seen order.
                if two_gram is not None and "|" in two_gram:
                    gram1, gram2 = two_gram.split("|")
                    for gram in (gram1, gram2):
                        if gram not in pert_candidate:
                            pert_candidate.append(gram)

            patch_candidate, caller_callee = self.extract_addr(
                filename, pert_candidate)
            dump_addr(caller_callee, PATCH_ADDR_FILE)

            # nullify call by addr
            in_pn = os.path.join("/mnt", "input", filename)
            out_pn = os.path.join("/mnt", "output", filename + "_pert")

            self.log(" >>> generating perturbed file %s" % filename)
            patch_bin(in_pn, out_pn, PATCH_ADDR_FILE)

            # record it at the json
            out_size = os.path.getsize(out_pn)
            status = "success" if out_size > 0 else "fail"
            tag_dict = {"tag_a": filename, "tag_b": str(out_size)}
            self.dump_indv_result(filename, status)

            self.files.append(filename + "_pert")
            self.status.append(status)
            self.result.append(tag_dict)

    def extract_addr(self, filename, pert_candidate):
        """Extract call sites from the binary ``/mnt/input/<filename>``.

        Args:
            filename: Name of the binary inside ``/mnt/input``.
            pert_candidate: Accepted for interface compatibility; not used.

        Returns:
            ``(addr_to_call, caller_callee)``: a dict mapping each extracted
            address to its call, and the caller/callee data as returned by
            ``extract_caller``.
        """
        file_pn = os.path.join("/mnt", "input", filename)

        # find line with "call" command
        output = _objdump_extract_calls(file_pn)
        call_list, addr_list, caller_callee = extract_caller(output)

        out = {addr_list[i]: call_list[i] for i in range(len(call_list))}
        return out, caller_callee

    # deprecated
    def gen_perturb_one(self, in_pn, out_pn):
        """Iteratively perturb one sample until it is classified as "ben".

        Deprecated. Repeatedly bumps the feature with the highest difference
        to the closest benign sample and re-classifies; stops on success or
        after more than 300 iterations. ``out_pn`` is not used.
        """
        perturbation_count = 0

        result, current_array = self.classify_one(in_pn)
        self.log("  >>> current sample classified as %s" % result)

        if result == "ben":
            self.log("  >>> doesn't necessary to perturb")
            return

        ben_sample_pd = load_csv_files(BEN_SAMPLE_DIR, "ben", sample=True)

        perturbed_array = current_array.copy().values[0]
        closest_benign = find_closest_benign(ben_sample_pd, perturbed_array)

        while True:
            perturbation_count += 1
            target_perturb_idx, diff = get_highest_diff_feature(
                closest_benign, perturbed_array)
            self.log(ret_distance(closest_benign, perturbed_array))

            # increase count
            perturbed_array[target_perturb_idx] += diff
            self.log(" >>> perturbing %dth index (current feature diff %d)" %
                     (target_perturb_idx, diff))
            result = self.classify_one_array([perturbed_array])[0]

            if result == "ben":
                self.log(" >>> found successful perturbation")
                break

            if perturbation_count > 300:
                self.log("count over")
                break

    def load_groundtruth(self):
        """Load the training samples and build the label vector.

        Benign rows are stacked before malicious rows; NaNs are replaced via
        ``np.nan_to_num``. Labels are 1 for benign and 0 for malicious.
        """
        self.mal_train_pd = load_csv_files(self.sample_dir, "malcsv")
        self.ben_train_pd = load_csv_files(self.sample_dir, "bencsv")

        self.all_train_pd = pd.DataFrame(
            np.vstack([self.ben_train_pd, self.mal_train_pd]))
        self.all_train_pd = np.nan_to_num(self.all_train_pd)

        self.groundtruth_tr_pd = pd.Series([1] * len(self.ben_train_pd) +
                                           [0] * len(self.mal_train_pd))

    def train(self, model, argument=None):
        """Build the estimator selected by ``model`` and fit it.

        Args:
            model: One of "nn", "rf", "neural", "gaussian" or "svm".
            argument: Model-specific settings:
                "nn" -> number of neighbours;
                "rf" -> ``(max_depth, random_state)``;
                "neural" -> ``(solver, hidden_layer_sizes, random_state)``;
                "gaussian" -> ``(kernel_scale, rbf_length_scale)``;
                "svm" -> ignored.

        Raises:
            ValueError: If ``model`` is unknown and no estimator is set yet.
        """
        if model == "nn":
            n_neighbors = argument
            self.clf = neighbors.KNeighborsClassifier(n_neighbors)

        elif model == "rf":
            max_depth = argument[0]
            random_state = argument[1]
            self.clf = RandomForestClassifier(max_depth=max_depth,
                                              random_state=random_state)

        elif model == "neural":
            solver = argument[0]
            hidden_size = argument[1]
            random_state = argument[2]
            self.clf = MLPClassifier(solver=solver, alpha=1e-5, \
              hidden_layer_sizes=hidden_size, random_state=random_state)

        elif model == 'gaussian':
            kernel_val = argument[0]
            RBF_val = argument[1]
            self.clf = GaussianProcessClassifier(kernel=kernel_val *
                                                 RBF(length_scale=RBF_val),
                                                 optimizer=None)

        elif model == "svm":
            self.clf = svm.SVC()

        if self.clf is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("unknown model %r" % (model,))

        self.clf.fit(self.all_train_pd, y=self.groundtruth_tr_pd)

    def ret_report(self):
        """Log the confusion matrix and classification report.

        Uses 3-fold cross-validated predictions when ``self.crossval`` is
        truthy, otherwise predictions on the training data itself.
        """
        if self.crossval:
            self.log("Cross-validation result")
            # NOTE(review): `cross_validation` is the pre-0.20 scikit-learn
            # module name; newer releases expose this as `model_selection`.
            predict_tr = cross_validation.cross_val_predict(self.clf, self.all_train_pd,\
                                                          y=self.groundtruth_tr_pd, cv=3, n_jobs=8)
        else:
            predict_tr = self.clf.predict(self.all_train_pd)

        cm = confusion_matrix(self.groundtruth_tr_pd, predict_tr)
        report = metrics.classification_report(self.groundtruth_tr_pd,
                                               predict_tr)

        # Printing the matrix is best-effort; the report is logged regardless.
        try:
            print_matrix(cm)
            self.log(cm)
        except Exception as err:
            self.log("could not print confusion matrix: %s" % err)
        self.log(report)

    def load_model(self, in_pn):
        """Restore model, two-gram table and benign samples from ``in_pn``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load model
        files that come from a trusted source.
        """
        with open(in_pn, 'rb') as f:
            pickle_obj = pickle.load(f)
            self.clf = pickle_obj["model"]
            self.two_gram = pickle_obj["two-gram"]
            self.ben_train_pd = pickle_obj["benign"]

    def save_model(self, out_pn):
        """Pickle model, two-gram table and benign samples to ``out_pn``.

        The two-gram table is read from ``/mnt/output/two_gram_mini.pkl``,
        which is assumed to have been generated beforehand. The outcome is
        recorded in ``files`` / ``status`` / ``result``.
        """
        self.log("[*] Saving model now!")
        # Pickle data is binary: the file must be opened in 'rb' mode.
        with open("/mnt/output/two_gram_mini.pkl", 'rb') as f_two:
            two_grams = pickle.load(f_two)

        model_name = out_pn

        out = {}
        out["model"] = self.clf
        out["two-gram"] = two_grams
        out["benign"] = self.ben_train_pd

        with open(model_name, 'wb') as f:
            pickle.dump(out, f)

        # take care of output.json file
        self.result.append({
            "tag_a": "%s classifier" % self.model,
            "tag_b": "NP"
        })
        self.files.append(os.path.basename(out_pn))
        # NOTE(review): dump_indv_result joins its argument onto /mnt/output;
        # when out_pn is absolute the log file lands next to the model instead.
        if os.path.getsize(model_name) > 0:
            self.status.append("success")
            self.dump_indv_result(model_name, "model generation success!")
        else:
            self.status.append("fail")
            self.dump_indv_result(model_name, "model generation fail!")
Beispiel #24
0
N, d = X.shape

# Fixed experiment sizes. Plain ints replace the `np.int` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24.
N = 1797
Ntrain = 800
Ntest = 250

# First Ntrain samples for training. The previous upper bound `Ntrain - 1`
# silently dropped one sample, because Python slices already exclude the
# end index. The last Ntest samples are held out for testing.
Xtrain = X[:Ntrain, :]
ytrain = y[:Ntrain]
Xtest = X[N - Ntest:N, :]
ytest = y[N - Ntest:N]

# Alternative kernels that were tried:
#kernel = 1.0 * RBF([1.0]) #isotropic kernel
#kernel = DotProduct(1.0)
kernel = Matern(0.5)
gpc_rbf = GaussianProcessClassifier(kernel=kernel).fit(Xtrain, ytrain)

# Error rate = fraction of samples whose prediction differs from the label.
yp_train = gpc_rbf.predict(Xtrain)
train_error_rate = np.mean(np.not_equal(yp_train, ytrain))
yp_test = gpc_rbf.predict(Xtest)
test_error_rate = np.mean(np.not_equal(yp_test, ytest))
#print('Training error rate')
#print(train_error_rate)
print('Test error rate')
print(test_error_rate)

#testing set 100
# tsize = [500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500]
# radial = [90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90]
# dot = [7, 11, 8, 8, 7, 9, 8, 8, 7, 8, 7]
# matern = [32, 31, 27, 25, 22, 19, 19, 18, 17, 17, 17]
#
# plt.figure(2)
Beispiel #25
0
            outputs = params.n_max_target_attr

            start_time = time.time()
            if n_problem_type == 'Classification':
                try:
                    kernel = 1.0 * RBF(1.0)
                    model = GaussianProcessClassifier(kernel=kernel,
                                                      random_state=0)

                    # Train model
                    model.fit(X_train, y_train)

                    #prediction
                    #Time computation start
                    start_timeFun = time.time()
                    y_pred = model.predict(X_train)
                    errorTrn = model.score(X_train, y_train)
                    funcEvltime = (time.time() - start_timeFun) / len(X_test)

                    #Report
                    #y_pred = model.predict(X_test)
                    errorTst = model.score(X_test, y_test)

                    now = datetime.now()
                    current_time = now.strftime("%H:%M:%S")
                    print(current_time, ' ', exp_num, data_file, ' ', runOpt,
                          ' test accuracy:', errorTst)
                    #collection
                    error = [errorTrn, errorTst, 'none', funcEvltime, 'empty']
                    data_results_coll.update({
                        str(data_file.split('.')[0]) + "_" + str(exp_num) + "_" + runOpt:
Beispiel #26
0
    print(
        "Negative log predictive density of validation set with rbf kernel %.3f"
        % neg_lpd_rbf_v)
    neg_lpd_matern_v = -np.mean(
        np.log(
            gp_matern_fix.predict_proba(X_test)[np.arange(len(X_test)),
                                                y_test]))
    print(
        "Negative log predictive density of validation set with matern kernel %.3f"
        % neg_lpd_matern_v)
    nlpd_rbf_t[i] = neg_lpd_rbf_t
    nlpd_matern_t[i] = neg_lpd_matern_t
    nlpd_rbf_v[i] = neg_lpd_rbf_v
    nlpd_matern_v[i] = neg_lpd_matern_v

    accuracy_rbf[i] = accuracy_score(y_train, gp_rbf_fix.predict(X_train))
    print("Accuracy for X_train with rbf kernel: %.5f" % accuracy_rbf[i])
    print("Accuracy for X_test with rbf kernel: %.5f" %
          accuracy_score(y_test, gp_rbf_fix.predict(X_test)))

    accuracy_matern[i] = accuracy_score(y_train,
                                        gp_matern_fix.predict(X_train))
    print("Accuracy for X_train with matern kernel: %.5f" % accuracy_matern[i])
    print("Accuracy for X_test with matern kernel: %.5f\n" %
          accuracy_score(y_test, gp_matern_fix.predict(X_test)))

print("Average accuracy with rbf kernel: %.5f" % np.mean(accuracy_rbf))
print("Average accuracy with matern kernel: %.5f" % np.mean(accuracy_matern))
print(
    "Average negative log predictive density of training set with rbf kernel: %.5f"
    % np.mean(nlpd_rbf_t))
Beispiel #27
0
#X_train = X
#Y_train = Y_enc

#X_test = data_test[:,1:6]
#Y_test = data_test[]

# Each estimator is fitted on the same training split and then used to
# predict the test split. scikit-learn's fit() returns the estimator
# itself, so construction and fitting can be chained.

# Decision tree
clf = tree.DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

# Gradient boosting
clf2 = GradientBoostingClassifier(
    learning_rate=0.1, n_estimators=200, max_depth=3).fit(X_train, Y_train)
Y_pred2 = clf2.predict(X_test)

# Cross-validated multinomial logistic regression
clf3 = LogisticRegressionCV(
    multi_class='multinomial', max_iter=500, Cs=30).fit(X_train, Y_train)
Y_pred3 = clf3.predict(X_test)

# Linear model trained with stochastic gradient descent
clf4 = SGDClassifier(alpha=0.02).fit(X_train, Y_train)
Y_pred4 = clf4.predict(X_test)

# Gaussian process classifier with default settings
clf5 = GaussianProcessClassifier().fit(X_train, Y_train)
Y_pred5 = clf5.predict(X_test)

# Only the gradient boosting accuracy is reported.
print(metrics.accuracy_score(Y_test, Y_pred2))

# clf5.fit(X,Y)
# Y_pred = clf5.predict()
Beispiel #28
0
# Collect the names of the features kept by the selection mask. The loop
# variable used to be called `bool`, which rebound the builtin for the rest
# of the module.
for is_selected, feature in zip(mask, df.columns[1:].tolist()):
    if is_selected:
        new_features.append(feature)

#print(new_features)

stats.text = str(new_features)

x_train_original, x_test_original, y_train_original, y_test_original = train_test_split(
    X_new, y, test_size=0.25)
#For standardizing data

#clf = svm.LinearSVC(random_state=0)
clf = GaussianProcessClassifier()
clf.fit(x_train_original, y_train_original)
predictions = clf.predict(x_test_original)
#print("Accuracy =", accuracy_score(y_test_original,predictions))
#print(np.unique(predictions))

# For a binary problem the flattened confusion matrix is ordered
# (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test_original, predictions).ravel()

# Bar labels and their counts; both lists use the order TP, FP, TN, FN.
fruits = ['True Positive', 'False Positive', 'True Negative', 'False Negative']
counts = [tp, fp, tn, fn]

source = ColumnDataSource(data=dict(fruits=fruits, counts=counts))

# NOTE(review): `plot_height` was replaced by `height` in newer Bokeh
# releases -- confirm against the Bokeh version this project pins.
p = figure(x_range=fruits,
           plot_height=350,
           toolbar_location=None,
           title="Counts")
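'''
    Gaussian process classification:
        predictions are given in the form of class probabilities;
        otherwise it is quite similar to Gaussian process regression.

'''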

# Constructor arguments spelled out explicitly for reference.
gpc_options = dict(
    kernel=None,
    optimizer='fmin_l_bfgs_b',
    n_restarts_optimizer=0,
    max_iter_predict=100,
    warm_start=False,
    copy_X_train=True,
    random_state=None,
    multi_class='one_vs_rest',
    n_jobs=1,
)
rlf = GaussianProcessClassifier(**gpc_options)

# Fit on the training split, then evaluate and predict on the test split.
rlf.fit(trainX, trainY)
rlf.score(testX, testY)  # mean accuracy; the value is not stored
preY = rlf.predict(testX)
rlf.log_marginal_likelihood_value_  # attribute of the fitted model; not stored
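'''
    kernel                          kernel (covariance) function
    optimizer                       optimizer used to tune the kernel's hyperparameters
    n_restarts_optimizer            number of optimizer restarts, each starting from hyperparameters sampled at random from the allowed range
    max_iter_predict                maximum number of Newton iterations used to approximate the posterior during prediction
    warm_start                      reuse the solution of the previous Newton iteration as the initialisation for the next call
    copy_X_train                    keep a persistent copy of the training data in the object
    random_state                    seed / random number generator
    multi_class                     how multi-class problems are handled: one-vs-one or one-vs-rest
    n_jobs                          number of jobs (CPUs) used for the computation
'''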
Beispiel #30
0
# Summary of the test table.
print(test_data.info())

# In[57]:

# Apply the already-fitted scaler to the selected feature columns of the
# test data; every model below predicts on this matrix.
X_predict = scaler.transform(test_data[feature_names])

# In[58]:

# Predictions of the model stored in `svm`.
# NOTE(review): the original comment called this "linear discriminant
# analysis", while the variable name suggests a support vector machine --
# confirm which estimator `svm` actually holds.
y_predict = svm.predict(X_predict)
print(y_predict)

# In[59]:

# Predictions of the model stored in `GPC`.
y_pre_gpc = GPC.predict(X_predict)
print(y_pre_gpc)

# In[60]:

# Predictions of the model stored in `gpc_rbf`.
y_pre_gpc_RBF = gpc_rbf.predict(X_predict)
print(y_pre_gpc_RBF)

# In[61]:

# Predictions of the model stored in `svm3`.
y_pre_SVM3 = svm3.predict(X_predict)
print(y_pre_SVM3)

# In[62]:

tdata_ori = pd.read_csv(
Beispiel #31
0
    pca = PCA(n_components=different_components[i])
    principal_component_train.append(pca.fit_transform(train_data))
    principal_component_test.append(pca.transform(test_data))

#Then it is needed to train a Gaussian Classifier using the data in each subspace.

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# One RBF kernel instance is passed to every classifier below.
kernel = RBF(1.0)
predict = []
score_train = []
score_test = []

# Train one Gaussian process classifier per PCA subspace and record its
# test predictions plus its mean accuracy on both splits.
for i in range(different_components.size):
    print(i)
    train_subspace = principal_component_train[i]
    test_subspace = principal_component_test[i]

    gpc = GaussianProcessClassifier(kernel=kernel, random_state=0)
    gpc.fit(train_subspace, train_labels)

    predict.append(gpc.predict(test_subspace))
    score_train.append(gpc.score(train_subspace, train_labels))
    score_test.append(gpc.score(test_subspace, test_labels))

#1.4
# Task: plot the classification error against the number of components used
# for each subspace, for both the training set and the test set (training is
# always done on the training set), as two separate plots.
# NOTE(review): the plotted values come from score(), i.e. accuracy rather
# than error (1 - accuracy) -- confirm which one is wanted.
for subspace_scores in (score_train, score_test):
    plt.pyplot.plot(different_components, subspace_scores)
    plt.pyplot.show()
Beispiel #32
0
gpc.fit(X, y)

# Cross-validation scores. The first three keep the per-fold arrays (they are
# averaged when printed); the remaining four are reduced to their mean here.
scores, scores1, scores2 = (
    cross_val_score(model, X, y) for model in (dtc, rfc, etc))
scores3, scores4, scores5, scores6 = (
    cross_val_score(model, X, y).mean() for model in (abc, gbc, vcf, gpc))

# Prediction check; the corresponding labels are 0, 1, 2, 1
test = [
    [4.0, 3.1, 1.1, 0.1],
    [6.7, 3.1, 4.1, 1.4],
    [7.1, 3.2, 6.1, 1.9],
    [6.3, 2.9, 4.2, 1.4],
]

# Predict the four samples with every model, in the same order as above.
y_pred0, y_pred, y_pred1, y_pred2, y_pred3, y_cvf, y_gpc = (
    model.predict(test) for model in (dtc, rfc, etc, abc, gbc, vcf, gpc))

print(y_pred0, y_pred, y_pred1, y_pred2, y_pred3)
print(scores.mean(), scores1.mean(), scores2.mean(), scores3, scores4,
      abc.feature_importances_)
print(y_cvf, scores5, y_gpc, scores6)
Beispiel #33
0
# Two Gaussian process classifiers are fitted on the first `train_size`
# samples: one keeps the initial kernel hyperparameters (optimizer=None),
# the other optimizes them during fit().
X_tr, y_tr = X[:train_size], y[:train_size]

gp_fix = GaussianProcessClassifier(optimizer=None,
                                   kernel=1.0 * RBF(length_scale=1.0))
gp_fix.fit(X_tr, y_tr)

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X_tr, y_tr)

# Log marginal likelihood of each model at its own fitted hyperparameters.
lml_fix = gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)
print("Log Marginal Likelihood (initial): %.3f" % lml_fix)
lml_opt = gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)
print("Log Marginal Likelihood (optimized): %.3f" % lml_opt)

# Training-set accuracy and log-loss of both models.
acc_fix = accuracy_score(y_tr, gp_fix.predict(X_tr))
acc_opt = accuracy_score(y_tr, gp_opt.predict(X_tr))
print("Accuracy: %.3f (initial) %.3f (optimized)" % (acc_fix, acc_opt))

loss_fix = log_loss(y_tr, gp_fix.predict_proba(X_tr)[:, 1])
loss_opt = log_loss(y_tr, gp_opt.predict_proba(X_tr)[:, 1])
print("Log-loss: %.3f (initial) %.3f (optimized)" % (loss_fix, loss_opt))


# Plot posteriors
plt.figure(0)
plt.scatter(X_tr[:, 0], y_tr, c='k', label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
            edgecolors=(0, 0, 0))
X_ = np.linspace(0, 5, 100)
proba_fix = gp_fix.predict_proba(X_[:, np.newaxis])[:, 1]
plt.plot(X_, proba_fix, 'r', label="Initial kernel: %s" % gp_fix.kernel_)
    def voting_svm(self):
        """Majority-vote classification over single-neuron classifiers.

        For every neuron in ``self.features_index`` a separate ``GPC``
        classifier is fitted on that neuron's feature for the current epoch
        and used to predict the test tasks. The per-task result is the mode
        of the per-neuron predictions. Progress is printed along the way.

        Returns:
            Fraction of test tasks whose voted label equals ``self.y_test``.
        """
        # Layout of per_neuron_prediction:
        #   one entry per key neuron
        #     -> a single-element list
        #       -> predictions for every test task (~100)
        per_neuron_prediction = []

        print("test")
        for neuron in self.features_index:    # for good neurons
            # Column holding this neuron's feature within the current epoch.
            column = self.epoch*self.neuron_num + neuron

            # One single-feature row per task, for the train and test split.
            X_for_neuron = [[self.X_train[task][column]]
                            for task in range(len(self.X_train))]
            X_test = [[self.X_test[task][column]]
                      for task in range(len(self.X_test))]

            # Fit and predict on this neuron alone.
            clf = GPC()
            clf.fit(X_for_neuron, self.y_train)
            per_neuron_prediction.append([clf.predict(X_test)])

        accuracy = 0
        print(per_neuron_prediction[0])
        print(len(self.X_test))

        features_num = len(self.features_index)

        print(per_neuron_prediction)

        # For each test task, take the most frequent label among the
        # per-neuron predictions and compare it with the true label.
        for test_task in range(len(self.X_test)):
            temp_task = [per_neuron_prediction[voter][0][test_task]
                         for voter in range(features_num)]
            vote_result = mode(temp_task)
            if vote_result == self.y_test[test_task]:
                accuracy += 1
            # Running accuracy over the tasks processed so far.
            print("ACCURACY {}".format(accuracy/(test_task+1)))

        accuracy = accuracy/len(self.X_test)

        return accuracy