def log_fit(X, y, n_iter=5):
    ## get X in the correct shape for the sklearn API
    if len(X.shape) == 1:
        X = X.reshape(-1, 1)
    ## init the model class
    lr = gaussian_process.GaussianProcessClassifier()
    # linear_model.LogisticRegression(penalty='l2', fit_intercept=True,
    #                                 solver='liblinear', max_iter=100,
    #                                 n_jobs=1, class_weight='balanced')
    # tree.DecisionTreeClassifier(class_weight='balanced')
    # neural_network.MLPClassifier(solver='lbfgs')
    # svm.LinearSVC(class_weight='balanced')
    accuracy = np.zeros(n_iter)
    for i in range(n_iter):
        ## split the data into train and test sets; vary the seed per iteration,
        ## otherwise every split (and therefore every score) is identical
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)
        ## make sure both classes appear in the training and test sets
        if np.unique(y_train).size < 2 or np.unique(y_test).size < 2:
            print("Re-splitting cross val data; only one class type in current set")
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        ## now fit to the training data
        lr.fit(X_train, y_train)
        ## now try to predict the test data
        y_pred = lr.predict(X_test)
        ## lastly, compute the accuracy of the prediction
        accuracy[i] = (y_pred == y_test).sum() / float(y_test.size)
    return accuracy.mean()
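# A minimal usage sketch for log_fit (not part of the original snippet); the
# imports and the toy binary problem below are illustrative assumptions.
import numpy as np
from sklearn import gaussian_process
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=5, random_state=0)
print("mean accuracy over 5 re-splits: %.3f" % log_fit(X_demo, y_demo, n_iter=5))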
def get_classifiers():
    clfs = {}
    # clfs['bag'] = {'clf': ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(),
    #                                                  max_samples=0.5, max_features=0.5),
    #                'name': "BaggingClassifier"}
    # clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100),
    #                                                    alpha=1e-5, solver='lbfgs', max_iter=500),
    #                'name': 'MultilayerPerceptron'}
    clfs['logreg'] = {'clf': linear_model.LogisticRegression(),
                      'params': {'C': [(2 ** x) for x in np.arange(-5, 15, step=3)]}}
    clfs['sgd'] = {'clf': linear_model.SGDClassifier(),
                   'params': {'loss': ['perceptron'],
                              # grid values must be iterable, so the single random
                              # draw is wrapped in a list
                              'alpha': [10 ** np.random.uniform(-6, 1)]}}
    clfs['knc'] = {'clf': neighbors.KNeighborsClassifier(),
                   'params': {'n_neighbors': np.arange(3, 15)}}
    clfs['rfc'] = {'clf': ensemble.RandomForestClassifier(),
                   'params': {'n_estimators': np.arange(64, 1024, step=64)}}
    clfs['svc'] = {'clf': svm.SVC(),
                   'params': {'kernel': ['linear', 'sigmoid', 'poly', 'rbf'],
                              # gamma must be strictly positive, so the grid starts at 0.1
                              'gamma': np.linspace(0.1, 2.0, num=20),
                              'C': np.linspace(0.5, 1.5, num=11)}}
    clfs['abc'] = {'clf': ensemble.AdaBoostClassifier(),
                   'params': {'n_estimators': np.arange(64, 1024, step=64)}}
    clfs['gbc'] = {'clf': ensemble.GradientBoostingClassifier(),
                   'params': {'n_estimators': np.arange(64, 1024, step=64)}}
    clfs['gauss_class'] = {'clf': gaussian_process.GaussianProcessClassifier(), 'params': {}}
    clfs['gauss_nb'] = {'clf': naive_bayes.GaussianNB(), 'params': {}}
    # LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()
    return clfs
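# An assumed usage sketch (not in the original): feed each entry's 'clf' and
# 'params' into GridSearchCV. The dataset and cv=3 are illustrative choices.
from sklearn import datasets
from sklearn.model_selection import GridSearchCV

X_iris, y_iris = datasets.load_iris(return_X_y=True)
for name, entry in get_classifiers().items():
    search = GridSearchCV(entry['clf'], entry['params'], cv=3)
    search.fit(X_iris, y_iris)
    print("{}: best CV accuracy {:.3f}".format(name, search.best_score_))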
def get_algorithms():
    MLA_dict = {
        # Ensemble methods
        "ada": ensemble.AdaBoostClassifier(),
        "bc": ensemble.BaggingClassifier(),
        "etc": ensemble.ExtraTreesClassifier(),
        "gbc": ensemble.GradientBoostingClassifier(),
        "rfc": ensemble.RandomForestClassifier(),
        # Gaussian processes
        "gpc": gaussian_process.GaussianProcessClassifier(),
        # Linear models
        "lr": linear_model.LogisticRegressionCV(),
        "pac": linear_model.PassiveAggressiveClassifier(),
        "rcc": linear_model.RidgeClassifierCV(),
        "sgd": linear_model.SGDClassifier(),
        "per": linear_model.Perceptron(),
        # Naive Bayes
        "bnb": naive_bayes.BernoulliNB(),
        "gnb": naive_bayes.GaussianNB(),
        # Nearest neighbours
        "knn": neighbors.KNeighborsClassifier(),
        # SVM
        "svc": svm.SVC(probability=True),
        "nvc": svm.NuSVC(probability=True),
        "lvc": svm.LinearSVC(),
        # Trees
        "dtc": tree.DecisionTreeClassifier(),
        "ets": tree.ExtraTreeClassifier(),
        # Discriminant analysis
        "lda": discriminant_analysis.LinearDiscriminantAnalysis(),
        "qda": discriminant_analysis.QuadraticDiscriminantAnalysis(),
    }
    return MLA_dict
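# A short evaluation loop over the dictionary above; this usage (dataset
# choice, cv=5) is an assumption added for illustration, not from the original.
from sklearn import datasets
from sklearn.model_selection import cross_val_score

X_bc, y_bc = datasets.load_breast_cancer(return_X_y=True)
for key, est in get_algorithms().items():
    scores = cross_val_score(est, X_bc, y_bc, cv=5)
    print("{:>4}: {:.3f} (+/- {:.3f})".format(key, scores.mean(), scores.std() * 2))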
def gp_clf_iris():
    # Follow the example from the sklearn docs, and only use the
    # first two features, so we can visualize the predicted
    # probabilities in 2D.
    X = iris_dataset.data[:, :2]
    y = iris_dataset.target
    y_names = iris_dataset.target_names
    print("Feature names: ", iris_dataset.feature_names)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=RANDOM_SEED)

    # Make the RBF kernel anisotropic for maximum flexibility
    # (length_scale must be a 1-d array with one entry per feature).
    kernel = gp.kernels.RBF(np.ones(X.shape[1])) \
        * gp.kernels.ConstantKernel() \
        + gp.kernels.WhiteKernel()
    clf = gp.GaussianProcessClassifier(kernel, n_restarts_optimizer=0)

    print("Fitting Gaussian Process on input of shape {0}...".format(X_train.shape))
    clf.fit(X_train, y_train)
    print("Learned kernel: {0}".format(str(clf.kernel_)))
    print("Fit complete.")

    y_pred = clf.predict(X_test)
    print(y_pred)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {0:.2f}%".format(acc * 100.0))

    # Plot class probabilities in 2D, with the coordinates being the
    # values of the first and second features (f0, f1, i.e., sepal
    # length and sepal width).
    f0_min = X[:, 0].min() - 1
    f0_max = X[:, 0].max() + 1
    f1_min = X[:, 1].min() - 1
    f1_max = X[:, 1].max() + 1
    step = 0.02
    f0, f1 = np.meshgrid(np.arange(f0_min, f0_max, step),
                         np.arange(f1_min, f1_max, step))
    grid_data = np.c_[f0.ravel(), f1.ravel()]
    prob_grid = clf.predict_proba(grid_data)
    # one RGB channel per class probability
    prob_grid = prob_grid.reshape((f0.shape[0], f0.shape[1], 3))

    plt.figure(figsize=(6, 6))
    plt.imshow(prob_grid, extent=(f0_min, f0_max, f1_min, f1_max), origin='lower')
    plt.scatter(X[y == 0, 0], X[y == 0, 1], s=30, c='red', edgecolors='black')
    plt.scatter(X[y == 1, 0], X[y == 1, 1], s=30, c='green', edgecolors='black')
    plt.scatter(X[y == 2, 0], X[y == 2, 1], s=30, c='blue', edgecolors='black')
    plt.show()
def GaussianProcessTrain(featureMatrix, labelMatrix):
    clf = gaussian_process.GaussianProcessClassifier()
    # print(clf.fit(featureMatrix, labelMatrix))
    # cross validation (note: cross_val_score fits internal clones only,
    # so the returned clf is still unfitted -- fit it before predicting)
    scores = cross_val_score(clf, featureMatrix, labelMatrix, cv=10)
    print("Accuracy (Cross-V): %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return clf
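# Hypothetical usage of GaussianProcessTrain (added for illustration): since
# cross_val_score only fits clones, the returned estimator must still be
# fitted before it can predict.
from sklearn import datasets

X_iris, y_iris = datasets.load_iris(return_X_y=True)
clf = GaussianProcessTrain(X_iris, y_iris)
clf.fit(X_iris, y_iris)
print(clf.predict(X_iris[:5]))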
def fit_gaussian_process_classifier(self, data_interval=None, verbose=False):
    """fit a Gaussian Process classifier

    implementation from: scikit-learn
    (https://scikit-learn.org/stable/modules/gaussian_process.html)

    Parameters
    ----------
    data_interval : array_int, optional
        Array indices of data used to train (training on a subset).
        If None - train on the whole data set.
    verbose : bool, optional
        Print statements with more information while training.

    Returns
    -------
    binary_classifier_holder : dict
        Each value is a trained GaussianProcessClassifier object.
        N entries for N classes.
    """
    if verbose:
        if data_interval is None:
            print("N points: %i" % (len(self.input_data)))
        else:
            print("N points: %i" % (len(data_interval)))

    start_time = time.time()
    binary_classifier_holder = dict()

    for i, cls_data in enumerate(self.class_data):
        kernel = gp.kernels.RBF([1, 1, 1], [(1e-3, 1e3), (1e-3, 1e3), (1e-3, 1e3)])
        gpc = gp.GaussianProcessClassifier(kernel=kernel)

        # for running with a subset of the data
        if data_interval is None:
            line = gpc.fit(self.input_data, cls_data)
        else:
            di = np.array(data_interval)
            line = gpc.fit(self.input_data[di], cls_data[di])

        binary_classifier_holder[self.class_names[i]] = line

        time_print = time.time() - start_time
        if verbose:
            if i == 0:
                # extrapolate the first fit time to all classes
                len_classes = len(self.class_ids)
                print("Estimated time to fit %s classifiers ~ %.3f\n"
                      % (len_classes, time_print * len_classes))
            print("GaussianProcessClassifier class %s -- current time: %.3f" % (i, time_print))

    return binary_classifier_holder
def predict(pickle_prefix, is_regression):
    kernel = ConstantKernel() + Matern(length_scale=2, nu=3 / 2) + WhiteKernel(noise_level=1)
    if is_regression:
        pickle_file = pickle_prefix + "probabilities.pickle"
        gp = gaussian_process.GaussianProcessRegressor(kernel=kernel)
    else:
        pickle_file = pickle_prefix + "selects.pickle"
        gp = gaussian_process.GaussianProcessClassifier(kernel=kernel)

    ys_by_image_id = {}
    with open(pickle_file, 'rb') as handle:
        d = pickle.load(handle)
        # .iteritems() updated to .items() for Python 3
        for image_id, ys in d.items():
            ys_by_image_id[image_id] = ys

    max_ids = 10
    num_test = 20
    mses = []

    for i, (image_id, ys) in enumerate(ys_by_image_id.items()):
        ys_train = ys[:-num_test]
        ys_test = ys[-num_test:]
        if i > max_ids:
            break

        X = np.array(range(len(ys_train))).reshape(-1, 1)
        gp.fit(X, ys_train)
        print("================={}=====================".format(image_id))
        print("Fit image-{} with {} samples with {} average probability".format(
            image_id, len(ys_train), round(np.mean(ys_train), 4)))

        xs_test = np.linspace(len(ys_train), len(ys_train) + num_test - 1,
                              num=num_test).reshape(-1, 1)
        y_pred = gp.predict(xs_test)
        pp.pprint(list(zip([x[0] for x in xs_test], y_pred, ys_test)))

        mse = get_mse(y_pred, ys_test)
        mses.append(mse)
        print("MSE: {0:5f}".format(mse))

    return np.average(mses)
def ModelSelection(train_data, features, label):
    # (parameter renamed from 'test_data' to 'train_data' to match its use below)
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    x_train, x_test, y_train, y_test = train_test_split(
        train_data[features], train_data[label], test_size=0.2)

    row_index = 0
    MLA_predict = train_data[label].copy()  # copy to avoid mutating train_data
    for alg in MLA:
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
def all_classifiers():
    # Model Data
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]
    return MLA
def make_image_classifier_model_example():
    """
    This is an example of how the image classifier will be made.
    We will need to get actual features from the image classification team.
    Creates "Image_Classifier_Model.p" using pickle dump ("wb")
    :return: None
    """
    FEATURE_EXTRACTOR = lambda image: [
        image[:, :, 0].mean(),
        image[:, :, 1].mean(),
        image[:, :, 2].mean()
    ]
    redish_images = np.random.normal([100, 100, 160], [20, 20, 20], (200, 100, 100, 3))
    blueish_images = np.random.normal([160, 100, 100], [20, 20, 20], (200, 100, 100, 3))
    x = np.vstack(([FEATURE_EXTRACTOR(img) for img in redish_images],
                   [FEATURE_EXTRACTOR(img) for img in blueish_images]))
    y = np.ones(400)
    y[:200] = 0  # red=0, blue=1

    GPC_images_classifier = gp.GaussianProcessClassifier(
        2.0 * gp.kernels.RBF([2.0, 1.0, 2.0]))
    GPC_images_classifier.fit(x, y)
    pickle.dump(GPC_images_classifier, open("Image_Classifier_Model.p", "wb"))

    # Build a synthetic map of 100x100 red-/blue-ish tiles (N and M are
    # assumed to be grid dimensions defined at module scope).
    MAP = np.zeros((N * 100, M * 100, 3))
    cmap = []
    for i in range(0, N):
        i = i + 1 if i < M else N - i
        column = (M - i) * "b" + i * "r"
        cmap.append(column)
    for j, row in enumerate(cmap):
        for i, color in enumerate(row):
            position_x = j * 100
            position_y = i * 100
            color = [100, 100, 160] if color == "r" else [160, 100, 100]
            picture = np.random.normal(color, [20, 20, 20], (100, 100, 3))
            MAP[position_x:position_x + 100, position_y:position_y + 100] = picture
    cv2.imwrite("MAP.png", MAP)
def classifier(rand, fts, file_set, cl_type):
    # Define classifier ('file_set' renamed from 'set' to avoid shadowing the builtin)
    if cl_type == 'rf':
        clf = ensemble.RandomForestClassifier(random_state=rand)
    elif cl_type == 'svm':
        clf = svm.SVC(probability=True, random_state=rand)
    elif cl_type == 'gauss':
        clf = gaussian_process.GaussianProcessClassifier(random_state=rand)
        # clf = tree.DecisionTreeClassifier(splitter='best', random_state=rand)
    else:
        raise ValueError("unknown classifier type: %s" % cl_type)

    # Fit the model and predict (train_fts, train_classes and dataset are
    # assumed to be defined at module scope)
    clf.fit(train_fts, train_classes)
    predictions = clf.predict(fts)
    probs = clf.predict_proba(fts)

    # Count positive predictions
    positives = 0
    for prediction, file in zip(predictions, file_set):
        if int(prediction) == dataset[file]:
            positives += 1
    return positives, probs, clf.classes_
# [height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 31], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male',
     'female', 'female', 'female', 'male', 'male']

predict_data = [[190, 70, 43], [180, 50, 30]]

clf = discriminant_analysis.QuadraticDiscriminantAnalysis()
clf = clf.fit(X, Y)
print("Quadratic Discriminant Analysis: ")
print(clf.predict(predict_data))

clf = neighbors.KNeighborsClassifier(n_neighbors=5)
clf = clf.fit(X, Y)
print("KNeighbors Classifier: ")
print(clf.predict(predict_data))

clf = gaussian_process.GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=0)
clf = clf.fit(X, Y)
print("Gaussian Process Classifier: ")
print(clf.predict(predict_data))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

from sklearn import gaussian_process
clf = gaussian_process.GaussianProcessClassifier(random_state=0)
clf.fit(X_train, y_train)

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

print(cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy'))

y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, y_train_pred)
print(cm)

from sklearn.metrics import precision_score, recall_score
print("precision score = {0:.4f}".format(precision_score(y_train, y_train_pred)))
print("recall score = {0:.4f}".format(recall_score(y_train, y_train_pred)))
from sklearn import tree
from sklearn import neural_network
from sklearn import svm
from sklearn import gaussian_process
from sklearn.metrics import accuracy_score

dt_clf = tree.DecisionTreeClassifier()

# CHALLENGE - create 3 more classifiers...
# 1
mlp_clf = neural_network.MLPClassifier()
# 2
svc_clf = svm.SVC()
# 3
gauss_clf = gaussian_process.GaussianProcessClassifier()

classifiers = {
    'decision_tree': dt_clf,
    'MLP': mlp_clf,
    'SVC': svc_clf,
    'gaussian_process': gauss_clf
}

# [height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male',
     'female', 'female', 'female', 'male', 'male']
# (tail of a splitData helper; its opening lines are not part of this snippet)
    data = _data
    labels = _labels
    return _data, _labels, trainingData, trainingLabel

data, labels, trainingData, trainingLabel = splitData(data, labels, 0.5)
# print(trainingData)

numTests = 20
treeResults = []
gaussianResults = []
neuralNetResults = []

for i in range(numTests):
    # Classifiers
    clf_tree = tree.DecisionTreeClassifier()
    clf_gaussian = gaussian_process.GaussianProcessClassifier()
    clf_neuralNet = neural_network.MLPClassifier(hidden_layer_sizes=(100,),
                                                 alpha=0.0001,
                                                 learning_rate_init=0.0001)

    # Training
    treeFit = clf_tree.fit(trainingData, trainingLabel)
    gaussianFit = clf_gaussian.fit(trainingData, trainingLabel)
    neuralNetFit = clf_neuralNet.fit(trainingData, trainingLabel)

    # Predictions and accuracy
    prediction_tree = treeFit.predict(data)
    accuracy_tree = accuracy_score(labels, prediction_tree) * 100  # in percent
    treeResults.append(accuracy_tree)

    prediction_gaussian = gaussianFit.predict(data)
    accuracy_gaussian = accuracy_score(labels, prediction_gaussian) * 100
    gaussianResults.append(accuracy_gaussian)
def dict_method_clf(): dict_method = {} # 1st part """4KNC""" me4 = neighbors.KNeighborsClassifier(n_neighbors=5) cv4 = StratifiedKFold(5, shuffle=False, random_state=0) scoring4 = 'balanced_accuracy' param_grid4 = [ { 'n_neighbors': [3, 4, 5, 6, 7], "weight": ['uniform', "distance"], "leaf_size=30": [10, 20, 30], 'metric': ['seuclidean', "manhattan"] }, ] dict_method.update({"KNC-set": [me4, cv4, scoring4, param_grid4]}) """1SVC""" me1 = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight='balanced', verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None) cv1 = StratifiedKFold(5, shuffle=False) scoring1 = 'accuracy' param_grid1 = [{ 'C': [1.0e8, 1.0e6, 10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01], 'kernel': ker }] dict_method.update({'SVC-set': [me1, cv1, scoring1, param_grid1]}) """5GPC""" me5 = gaussian_process.GaussianProcessClassifier(kernel=kernel) cv5 = StratifiedKFold(5, shuffle=False) scoring5 = 'balanced_accuracy' param_grid5 = [ { "kernel": ker }, ] dict_method.update({'GPC-set': [me5, cv5, scoring5, param_grid5]}) # 2nd part '''TreeC''' me6 = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight="balanced", presort=False) cv6 = StratifiedKFold(5, shuffle=False) scoring6 = 'accuracy' param_grid6 = [{ 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4] }] dict_method.update({'TreeC-em': [me6, cv6, scoring6, param_grid6]}) '''GBC''' me7 = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto') cv7 = StratifiedKFold(5, shuffle=False) scoring7 = 'balanced_accuracy' param_grid7 = [{ 'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4], 'learning_rate': [0.1, 0.05] }] dict_method.update({'GBC-em': [me7, cv7, scoring7, param_grid7]}) '''RFC''' me8 = RandomForestClassifier(n_estimators=100, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=None, bootstrap=True, oob_score=False, random_state=None, verbose=0, warm_start=False, class_weight="balanced") cv8 = StratifiedKFold(5, shuffle=False) scoring8 = 'accuracy' param_grid8 = [{ 'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4], 'learning_rate': [0.1, 0.05] }] dict_method.update({"RFC-em": [me8, cv8, scoring8, param_grid8]}) "AdaBC" dt = DecisionTreeRegressor(criterion="mse", splitter="best", max_features=None, max_depth=5, min_samples_split=4) me9 = AdaBoostClassifier(dt, n_estimators=100, learning_rate=1., algorithm='SAMME.R', random_state=0) cv9 = StratifiedKFold(5, shuffle=False) scoring9 = 'accuracy' param_grid9 = [{ 'n_estimators': [50, 100, 200, 500], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4], 'learning_rate': [0.1, 0.05] }] dict_method.update({"AdaBC-em": [me9, cv9, scoring9, param_grid9]}) # 3nd 
"Per" me14 = Perceptron(penalty="l1", alpha=0.0001, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, eta0=1.0, random_state=0, class_weight=None, warm_start=False, n_iter=None) cv14 = StratifiedKFold(5, shuffle=False) scoring14 = 'accuracy' param_grid14 = [ { 'alpha': [0.0001, 0.001, 0.01] }, ] dict_method.update({"Per-L1": [me14, cv14, scoring14, param_grid14]}) """LogRL1""" me15 = LogisticRegression(penalty='l1', solver='liblinear', dual=False, tol=1e-3, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='balanced', random_state=0) cv15 = StratifiedKFold(5, shuffle=False) scoring15 = 'accuracy' param_grid15 = [ { 'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2], 'penalty': ["l1", "l2"] }, ] dict_method.update({"LogR-L1": [me15, cv15, scoring15, param_grid15]}) """3SGDCL2""" me3 = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, random_state=0, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight="balanced", warm_start=False, average=False, n_iter=None) cv3 = StratifiedKFold(5, shuffle=False) scoring3 = 'accuracy' param_grid3 = [ { 'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05], 'loss': ['squared_loss', "huber"], "penalty": ["l1", "l2"] }, ] dict_method.update({"SGDC-set": [me3, cv3, scoring3, param_grid3]}) return dict_method
# (tail of a oneToTwo helper that wraps each scalar feature in its own list;
#  the opening lines of the function are not part of this snippet)
        for x__ in x_:
            tmp_x_.append([x__])
        tmp_x.append(np.array(tmp_x_))
    return np.array(tmp_x)

if __name__ == '__main__':
    path = './blod.data'
    data = np.loadtxt(path, delimiter=' ', skiprows=1)
    # print(data[0])
    train_data = data[90 * 20:]
    test_data = data[:90 * 20]
    # test_data = test_data[90:]
    x_data, y_data = np.split(train_data, (10,), axis=1)
    # x_data = oneToTwo(x_data)
    x_test_data, y_test_data = np.split(test_data, (10,), axis=1)
    # x_test_data = oneToTwo(x_test_data)
    y_data = np.split(y_data, (1,), axis=1)[0]
    y_test_data = np.split(y_test_data, (1,), axis=1)[0]
    # print(x_data)
    # print(y_data.T[0])
    # print(x_test_data)
    # print(y_test_data.T[0])

    model = gaussian_process.GaussianProcessClassifier()
    model.fit(x_data, y_data.T[0])
    diabetes_y_pred = model.predict(x_test_data)
    result = y_test_data.T[0] == diabetes_y_pred
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))
# X = data1_x_bin
# y = Target
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),
X_train = X_selected.head(num_train)
X_test = X_selected.tail(num_test)
# note: X_test should be scaled with the same statistics as X_train
# (e.g. via a fitted StandardScaler) before prediction
X_train = preprocessing.scale(X_train)

# Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(learning_rate=0.1, n_estimators=300, random_state=0),
    ensemble.BaggingClassifier(max_samples=0.25, n_estimators=300, random_state=0),
    ensemble.ExtraTreesClassifier(criterion='entropy', max_depth=6, n_estimators=100,
                                  random_state=0),
    ensemble.GradientBoostingClassifier(learning_rate=0.05, max_depth=2,
                                        n_estimators=300, random_state=0),
    ensemble.RandomForestClassifier(criterion='entropy', max_depth=6, n_estimators=100,
                                    oob_score=True, random_state=0),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(max_iter_predict=10, random_state=0),

    # GLM
    linear_model.LogisticRegressionCV(fit_intercept=True, random_state=0,
                                      solver='liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(alpha=0.1),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(algorithm='brute', n_neighbors=7, weights='uniform'),
# models_f1.append(f1)
# models_performances.append(performance)

classifiers = [
    ("KNN", None, KNeighborsClassifier(2)),
    ("Linear SVM", None, SVC(kernel="linear")),
    ("RBF SVM", None, SVC(gamma=2, C=1)),
    ("DT", None, DecisionTreeClassifier(min_samples_split=1024, max_depth=20)),
    ("RF", None, RandomForestClassifier(n_estimators=10, min_samples_split=1024,
                                        max_depth=20)),
    ("AB", None, AdaBoostClassifier(random_state=13370)),
    # ("GP ARD", ["MFCC"], gp.GaussianProcessClassifier(
    #     kernel=ard_kernel(sigma=1.2, length_scale=np.array([1] * 1)))),
    ("GP-DP", ["MFCC", "All", "CIFE", "CFS"],
     gp.GaussianProcessClassifier(kernel=gp.kernels.DotProduct()))
    # output the confidence level and the predictive variance for the dot
    # product (the only one that we keep in the end)
    # GP beats SVM in our experiment (qualitative advantages)
    # only keep RBF, dot product and matern on the chart
    # add a paragraph 'Processed Data'
    # 1) generate the dataset with 526 features
    # 2) the predictive variance and predictive mean (best and worst) of some
    #    vectors from the dot product.
]

# classify(X_train[:, bitVec], X_dev[:, bitVec])
models_f1, models_performances = getClassifieresPerformances(
    classifiers, models_f1, models_performances)
# models_f1, models_performances = getClassifieresPerformancesByDefinedX(
#     classifiers, 'predict', models_f1, models_performances, newTrainX, y_bin_train, newDevX)
models_f1, models_performances = addRelatedWork(models_f1, models_performances)

models_f1 = sorted(models_f1, key=lambda l: l[1])
models_performances = sorted(models_performances, key=lambda l: l[1])
def dict_method_clf(): dict_method = {} # 1st part """1SVC""" me1 = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight='balanced', verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None) cv1 = StratifiedKFold(5, shuffle=True, random_state=0) scoring1 = 'accuracy' param_grid1 = [{'C': [10, 5, 2.5, 1, 0.5], 'gamma': [0.001, 0.01, 0.0001]}] dict_method.update({'SVC-set': [me1, cv1, scoring1, param_grid1]}) """2LogRL2""" me2 = LogisticRegression(penalty='l2', solver='liblinear', dual=False, tol=1e-3, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='balanced', random_state=0) cv2 = StratifiedKFold(5, shuffle=True, random_state=0) scoring2 = 'accuracy' param_grid2 = [{'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2]}, ] dict_method.update({"LogRL2-set": [me2, cv2, scoring2, param_grid2]}) """3SGDCL2""" me3 = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, random_state=0, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight="balanced", warm_start=False, average=False, n_iter=None) cv3 = StratifiedKFold(5, shuffle=True, random_state=0) scoring3 = 'accuracy' param_grid3 = [{'alpha': [0.0001, 0.001, 0.01]}, ] dict_method.update({"SGDCL2-set": [me3, cv3, scoring3, param_grid3]}) """4KNC""" me4 = neighbors.KNeighborsClassifier(n_neighbors=5) cv4 = StratifiedKFold(5, shuffle=True, random_state=0) scoring4 = 'balanced_accuracy' param_grid4 = [{'n_neighbors': [3, 4, 5]}, ] dict_method.update({"KNC-set": [me4, cv4, scoring4, param_grid4]}) """5GPC""" kernel = 1.0 * RBF(1.0) me5 = gaussian_process.GaussianProcessClassifier(kernel=kernel) cv5 = StratifiedKFold(5, shuffle=True, random_state=0) scoring5 = 'balanced_accuracy' param_grid5 = [{'max_iter_predict': [100, ]}, ] dict_method.update({'GPC-set': [me5, cv5, scoring5, param_grid5]}) # 2nd part '''TreeC''' me6 = DecisionTreeClassifier( criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight="balanced", presort=False) cv6 = StratifiedKFold(5, shuffle=True, random_state=0) scoring6 = 'accuracy' param_grid6 = [{'max_depth': [3, 4, 5, 6]}] dict_method.update({'TreeC-em': [me6, cv6, scoring6, param_grid6]}) '''GBC''' me7 = ensemble.GradientBoostingClassifier( loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto') cv7 = StratifiedKFold(5, shuffle=True, random_state=0) scoring7 = 'balanced_accuracy' param_grid7 = [{'max_depth': [3, 4, 5, 6]}] dict_method.update({'GBC-em': [me7, cv7, scoring7, param_grid7]}) '''RFC''' me8 = ensemble.RandomForestClassifier(n_estimators=100, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=None, bootstrap=True, oob_score=False, random_state=None, verbose=0, warm_start=False, class_weight="balanced") cv8 = StratifiedKFold(5, shuffle=True, random_state=0) scoring8 = 'accuracy' 
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({"RFC-em": [me8, cv8, scoring8, param_grid8]})

    "AdaBC"
    me9 = AdaBoostClassifier(n_estimators=100, learning_rate=1., algorithm='SAMME.R',
                             random_state=0)
    cv9 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring9 = 'accuracy'
    param_grid9 = [{'base_estimator': [DecisionTreeClassifier(max_depth=1),
                                       DecisionTreeClassifier(max_depth=2),
                                       DecisionTreeClassifier(max_depth=3)]}]
    dict_method.update({"AdaBC-em": [me9, cv9, scoring9, param_grid9]})

    # 3rd part

    'SGDCL1'
    me12 = linear_model.SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001,
                                      l1_ratio=0.15, fit_intercept=True, max_iter=1000,
                                      tol=1e-3, shuffle=True, verbose=0, epsilon=0.1,
                                      random_state=0, learning_rate='optimal', eta0=0.0,
                                      power_t=0.5, class_weight="balanced",
                                      warm_start=False, average=False)
    cv12 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring12 = 'accuracy'
    param_grid12 = [{'alpha': [0.0001, 0.001, 0.01]}, ]
    dict_method.update({"SGDC-L1": [me12, cv12, scoring12, param_grid12]})

    "Per"
    me14 = Perceptron(penalty="l1", alpha=0.0001, fit_intercept=True, max_iter=1000,
                      tol=1e-3, shuffle=True, verbose=0, eta0=1.0, random_state=0,
                      class_weight=None, warm_start=False)
    cv14 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring14 = 'accuracy'
    param_grid14 = [{'alpha': [0.0001, 0.001, 0.01]}, ]
    dict_method.update({"Per-L1": [me14, cv14, scoring14, param_grid14]})

    """LogRL1"""
    me15 = LogisticRegression(penalty='l1', solver='liblinear', dual=False, tol=1e-3,
                              C=1.0, fit_intercept=True, intercept_scaling=1,
                              class_weight='balanced', random_state=0)
    cv15 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring15 = 'accuracy'
    param_grid15 = [{'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2]}, ]
    dict_method.update({"LogR-L1": [me15, cv15, scoring15, param_grid15]})

    return dict_method
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# initialization algorithms (a list, since estimators are kept in order)
algorithms = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # Generalized Linear Models
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),
# Step 5: Model Data

# Machine Learning Algorithm (MLA) Selection and initialization
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(n_estimators=100),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(n_neighbors=3),

    # SVM
def learn(self, data_train, target_train):
    self.model = gaussian_process.GaussianProcessClassifier()
    self.model.fit(
        preprocessing.normalize(self.maybe_reshape(data_train), axis=1),
        target_train)
    return
def get_skl_estimator(self, **default_parameters):
    return gaussian_process.GaussianProcessClassifier(**default_parameters)
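# A hypothetical call site for get_skl_estimator (added for illustration;
# 'wrapper' stands in for an instance of the class that owns the method):
# keyword arguments pass straight through to the sklearn constructor.
gpc = wrapper.get_skl_estimator(random_state=0, max_iter_predict=100)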
def compare_algorithm(data, target):
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier()
    ]

    MLA_columns = []
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    row_index = 0
    for alg in MLA:
        predicted = alg.fit(x_train, y_train).predict(x_cross)
        # note: the ROC curve / AUC here are computed from hard class
        # predictions rather than probability scores
        fp, tp, th = roc_curve(y_cross, predicted)
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Train Accuracy'] = round(alg.score(x_train, y_train), 4)
        MLA_compare.loc[row_index, 'MLA Test Accuracy'] = round(alg.score(x_cross, y_cross), 4)
        MLA_compare.loc[row_index, 'MLA Precision'] = precision_score(y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA Recall'] = recall_score(y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA AUC'] = auc(fp, tp)
        row_index = row_index + 1

    MLA_compare.sort_values(by=['MLA Test Accuracy'], ascending=False, inplace=True)
    print(MLA_compare)
# print("Negative Log Likelihood: %.3f\n" % (mul_gp1.log_marginal_likelihood(theta=None))) # print("Computing 10-fold CV...\n") # tcv = time.time() # cv_gp1 = cross_val_score(mul_gp1, data1[data_features[0:11]], data1["eval"], cv=10) # elapsed = time.time() - tcv # print('CV computation time :: %.3f\n' % (elapsed)) ##print('CV-prediction error rate :: {}'.format(cv_gp1)) ##mean cv and the 95% confidence interval of the cv's estimate # print("Accuracy(Mean CV): %0.2f (+/- %0.2f)\n" % (cv_gp1.mean(), cv_gp1.std() * 2)) # print('---------------------------------------------') # Multiclass as One-vs-One t2 = time.time() #kernel=1.0 * RBF(length_scale=1.0) mul_gp2 = gaussian_process.GaussianProcessClassifier( multi_class='one_vs_one').fit(train_x, train_y) trainTime = time.time() - t2 # print('Multiclass (1-vs-1) computation time :: %.3f\n' % (elapsed)) trainTestStartTime = time.time() print('Multiclass (1-vs-1) Gaussian Process Train Accuracy :: %.3f\n' % (metrics.accuracy_score(train_y, mul_gp2.predict(train_x)))) trainTestTime = time.time() - trainTestStartTime testTestStartTime = time.time() print('Multiclass (1-vs-1) Gaussian Process Test Accuracy :: %.3f\n' % (metrics.accuracy_score(test_y, mul_gp2.predict(test_x)))) testTestTime = time.time() - testTestStartTime print(trainTime) print(trainTestTime) print(testTestTime) # print("Negative Log Likelihood: %.3f\n" % (mul_gp1.log_marginal_likelihood(theta=None)))
# coding=utf-8
"""Gaussian Process Classifier applied on the Iris dataset."""
from sklearn import datasets, model_selection, gaussian_process, metrics

if __name__ == "__main__":
    print("Loading data...")
    X, y = datasets.load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

    print("Fitting model...")
    gpc = gaussian_process.GaussianProcessClassifier(
        kernel=gaussian_process.kernels.RBF([1.0]))
    gpc.fit(X_train, y_train)

    print("Evaluating model...")
    print(metrics.classification_report(y_test, gpc.predict(X_test)))
    print(metrics.confusion_matrix(y_test, gpc.predict(X_test)))
# (continuation of an if/else that reloads cached test data when available)
    test_features = pd.read_csv('test_features.csv', index_col=0)
else:
    # Split the training and testing data sets; save this test data set for later use too
    train_labels, test_labels, train_features, test_features = train_test_split(
        preprocessed_labels, preprocessed_features, test_size=0.2)
    test_features.to_csv("test_features.csv")
    test_labels.to_csv("test_labels.csv")

# Create one of the following classifiers:
if classifier_type == "Tree":
    # Create Classifier, doesn't even need any of the params changed
    classifier = tree.DecisionTreeClassifier()
elif classifier_type == "DNN":
    classifier = neural_network.MLPClassifier(verbose=True, max_iter=max_iter,
                                              early_stopping=early_stopping,
                                              hidden_layer_sizes=(100, 50))
elif classifier_type == "Gaussian":
    classifier = gaussian_process.GaussianProcessClassifier(kernel=1.0 * RBF(1.0))
elif classifier_type == "Cal_Class_Test":
    classifier = neural_network.MLPClassifier(verbose=True, max_iter=max_iter,
                                              hidden_layer_sizes=(100, 50))
    classifier = CalibratedClassifierCV(classifier, cv=5, method="isotonic")
    use_calibrator = False  # already calibrated
else:
    # Default to DNN
    print("Classifier Unselected, Defaulting to DNN")
    print("----------------------------------------")
    classifier = neural_network.MLPClassifier(verbose=True, max_iter=max_iter,
                                              hidden_layer_sizes=(100, 50))

classifier.fit(train_features, train_labels)  # Fit Model

if use_calibrator:
    # Calibrate Classifier to adjust label probabilities
    print('calibrating...')
    classifier = CalibratedClassifierCV(classifier, cv="prefit",
                                        method=calibration_type)  # Defaults to sigmoid
    classifier.fit(train_features, train_labels)
def multi_classifier_voting_predication(data1, data1_x_bin, cv_split, Target):
    # why choose one model, when you can pick them all with voting classifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    # removed models w/o attribute 'predict_proba' required for vote classifier
    # and models with a 1.0 correlation to another model
    vote_est = [
        # Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostClassifier()),
        ('bc', ensemble.BaggingClassifier()),
        ('etc', ensemble.ExtraTreesClassifier()),
        ('gbc', ensemble.GradientBoostingClassifier()),
        ('rfc', ensemble.RandomForestClassifier()),

        # Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
        ('gpc', gaussian_process.GaussianProcessClassifier()),

        # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        ('lr', linear_model.LogisticRegressionCV()),

        # Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
        ('bnb', naive_bayes.BernoulliNB()),
        ('gnb', naive_bayes.GaussianNB()),

        # Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
        ('knn', neighbors.KNeighborsClassifier()),

        # SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVC(probability=True)),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBClassifier())
    ]

    # Hard Vote or majority rules
    vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    vote_hard_cv = model_selection.cross_validate(vote_hard, data1[data1_x_bin],
                                                  data1[Target], cv=cv_split,
                                                  return_train_score=True)
    vote_hard.fit(data1[data1_x_bin], data1[Target])

    print("Hard Voting Training w/bin score mean: {:.2f}".format(
        vote_hard_cv['train_score'].mean() * 100))
    print("Hard Voting Test w/bin score mean: {:.2f}".format(
        vote_hard_cv['test_score'].mean() * 100))
    print("Hard Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_hard_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    # Soft Vote or weighted probabilities
    vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    vote_soft_cv = model_selection.cross_validate(vote_soft, data1[data1_x_bin],
                                                  data1[Target], cv=cv_split,
                                                  return_train_score=True)
    vote_soft.fit(data1[data1_x_bin], data1[Target])

    print("Soft Voting Training w/bin score mean: {:.2f}".format(
        vote_soft_cv['train_score'].mean() * 100))
    print("Soft Voting Test w/bin score mean: {:.2f}".format(
        vote_soft_cv['test_score'].mean() * 100))
    print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_soft_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    return vote_hard, vote_soft
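# An assumed call sketch (not in the original): 'data1' is a DataFrame whose
# columns include the feature list 'data1_x_bin' plus a target column; the
# column name 'Survived' and the ShuffleSplit settings are hypothetical.
from sklearn import model_selection

cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=0.3,
                                        train_size=0.6, random_state=0)
vote_hard, vote_soft = multi_classifier_voting_predication(
    data1, data1_x_bin, cv_split, 'Survived')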