def leave_one_out_cv(gram_matrix, labels, alg='SVM'):
    """ leave-one-out cross-validation """
    scores = []
    preds = []
    loo = sklearn.cross_validation.LeaveOneOut(len(labels))
    for train_index, test_index in loo:
        X_train, X_test = gram_matrix[train_index][:, train_index], gram_matrix[test_index][:, train_index]
        y_train, y_test = labels[train_index], labels[test_index]
        if alg == 'SVM':
            svm = sklearn.svm.SVC(kernel='precomputed')
            svm.fit(X_train, y_train)
            preds += svm.predict(X_test).tolist()
            score = svm.score(X_test, y_test)
        elif alg == 'kNN':
            knn = sklearn.neighbors.KNeighborsClassifier()
            knn.fit(X_train, y_train)
            preds += knn.predict(X_test).tolist()
            score = knn.score(X_test, y_test)
        scores.append(score)
    print "Mean accuracy: %f" % (np.mean(scores))
    print "Stdv: %f" % (np.std(scores))
    return preds, scores
def svm_iterkernel(train_data, train_labels, test_data, test_labels, op_name_dir):
    label_set = np.unique(train_labels)
    # note: the original test "op_name_dir != ('None' or 'none')" only ever compared
    # against 'None'; a membership test covers both spellings
    if op_name_dir not in ('None', 'none'):
        fo = open(op_name_dir, 'a')
    predict_list = {}
    for kernel in ['linear']:  # , 'poly', 'rbf']:
        t0 = time.time()
        svm = SVC(C=1., kernel=kernel, cache_size=10240)
        svm.fit(train_data, train_labels)
        prediction = svm.predict(test_data)
        predict_list[kernel] = prediction
        pred_acc_tot = (float(np.sum(prediction == test_labels))) / len(test_labels)
        print time.time() - t0, ',kernel = ' + kernel, ',pred acc = ' + str(round(pred_acc_tot * 100))
        if op_name_dir not in ('None', 'none'):
            fo.write('time=' + str(time.time() - t0) + 'sec,kernel=' + kernel + ',pred acc=' + str(round(pred_acc_tot * 100)) + '\n')
        for lab_unq in label_set:
            pred_acc = (prediction == lab_unq) & (test_labels == lab_unq)
            pred_acc = float(pred_acc.sum()) / (len(test_labels[test_labels == lab_unq]))
            print 'pred_' + str(lab_unq) + ',' + str(round(pred_acc * 100))
            if op_name_dir not in ('None', 'none'):
                fo.write('pred_' + str(lab_unq) + ',' + str(round(pred_acc * 100)) + '\n')
    if op_name_dir not in ('None', 'none'):
        fo.close()
    return predict_list
def trainSVM(filteredFaces, labels, subjects, e):
    uniqueSubjects = np.unique(subjects)
    accuracies = []
    masterK = filteredFaces.dot(filteredFaces.T)
    for testSubject in uniqueSubjects:
        idxs = np.nonzero(subjects != testSubject)[0]
        someFilteredFacesTrain = filteredFaces[idxs]
        someLabels = labels[idxs]
        y = someLabels == e
        K = masterK[idxs, :]
        K = K[:, idxs]
        svm = sklearn.svm.SVC(kernel="precomputed")
        svm.fit(K, y)
        idxs = np.nonzero(subjects == testSubject)[0]
        someFilteredFaces = filteredFaces[idxs]
        someLabels = labels[idxs]
        y = someLabels == e
        yhat = svm.decision_function(someFilteredFaces.dot(someFilteredFacesTrain.T))
        if len(np.unique(y)) > 1:
            auc = sklearn.metrics.roc_auc_score(y, yhat)
        else:
            auc = np.nan
        print "{}: {}".format(testSubject, auc)
        accuracies.append(auc)
    accuracies = np.array(accuracies)
    accuracies = accuracies[np.isfinite(accuracies)]
    print np.mean(accuracies), np.median(accuracies)
def train():
    training_set = []
    training_labels = []
    os.chdir("/Users/muyunyan/Desktop/EC500FINAL/logo/")
    counter = 0
    a = os.listdir(".")
    for i in a:
        os.chdir(i)
        print(i)
        for d in os.listdir("."):
            img = cv2.imread(d)
            res = cv2.resize(img, (250, 250))
            gray_image = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
            xarr = np.squeeze(np.array(gray_image).astype(np.float32))
            m, v = cv2.PCACompute(xarr)
            arr = np.array(v)
            flat_arr = arr.ravel()
            training_set.append(flat_arr)
            training_labels.append(i)
        os.chdir("..")
    trainData = training_set
    responses = training_labels
    # rebinding the name "svm" inside the function would shadow the sklearn svm module
    # and raise UnboundLocalError, so the classifier gets its own name
    clf = svm.SVC()
    clf.fit(trainData, responses)
    return clf
def run_model(train_data, train_labels, test_data, test_labels):
    '''
    Takes a set of training text and labels and trains a bag-of-words model.
    This model is then used with a support vector machine to predict the labels
    for a second set of text.

    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

    Args:
        train_data: Text training set. Needs to be iterable
        train_labels: Training set labels
        test_data: The text to predict labels for
        test_labels: Labels of the test set, used for the performance metrics

    Returns:
        pred_labels: The predicted labels as determined by the classifier
    '''
    # train a support vector machine on the training data
    svm = SVC()
    svm.fit(train_data, train_labels)

    # now that we have something trained we can check if it is accurate with the test set
    pred_labels = svm.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(test_labels, pred_labels)

    # perform_results is a dictionary, so we should add other pertinent information to the run
    perform_results['vector'] = 'Bag_of_Words'
    perform_results['alg'] = 'Support_Vector_Machine'

    return pred_labels, perform_results
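# A minimal, self-contained sketch of the bag-of-words + SVM pipeline described in the
# docstring above. The toy corpus and the CountVectorizer settings are assumptions for
# illustration only, not the author's setup; run_model itself expects already-vectorized text.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

example_train_text = ["good movie", "bad movie", "great film", "awful film"]
example_train_labels = [1, 0, 1, 0]
example_test_text = ["good film", "bad plot"]

example_vectorizer = CountVectorizer()
example_train_data = example_vectorizer.fit_transform(example_train_text)
example_test_data = example_vectorizer.transform(example_test_text)

example_svm = SVC()
example_svm.fit(example_train_data, example_train_labels)
print(example_svm.predict(example_test_data))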
def trainOneSVM(masterK, y, subjects): Cs = 1.0 / np.array([0.1, 0.5, 2.5, 12.5, 62.5, 312.5]) # Cs = 10. ** np.arange(-5, +6)/2. uniqueSubjects, subjectIdxs = np.unique(subjects, return_inverse=True) highestAccuracy = -float("inf") NUM_MINI_FOLDS = 4 for C in Cs: # For each regularization value # print "C={}".format(C) accuracies = [] for i in range(NUM_MINI_FOLDS): # For each test subject testIdxs = np.nonzero(subjectIdxs % NUM_MINI_FOLDS == i)[0] trainIdxs = np.nonzero(subjectIdxs % NUM_MINI_FOLDS != i)[0] if len(np.unique(y[testIdxs])) > 1: K = masterK[trainIdxs, :] K = K[:, trainIdxs] svm = sklearn.svm.SVC(kernel="precomputed", C=C) svm.fit(K, y[trainIdxs]) K = masterK[testIdxs, :] K = K[:, trainIdxs] # I.e., need trainIdxs dotted with testIdxs accuracy = sklearn.metrics.roc_auc_score(y[testIdxs], svm.decision_function(K)) # print accuracy accuracies.append(accuracy) if np.mean(accuracies) > highestAccuracy: highestAccuracy = np.mean(accuracies) bestC = C svm = sklearn.svm.SVC(kernel="precomputed", C=bestC) svm.fit(masterK, y) return svm
def main(): data = pickle.load(open('../submodular_20.pickle')) train, train_labels, test, test_labels = Load20NG() vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False) vectorizer.fit(train + test) train_vectors = vectorizer.transform(train) test_vectors = vectorizer.transform(test) svm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10,gamma=0.001) svm.fit(train_vectors, train_labels) json_ret = {} json_ret['class_names'] = ['Atheism', 'Christianity'] json_ret['instances'] = [] explanations = data['explanations']['20ng']['svm'] idxs = data['submodular_idx']['20ng']['svm'][:10] for i in idxs: json_obj = {} json_obj['id'] = i idx = i instance = test_vectors[idx] json_obj['true_class'] = test_labels[idx] json_obj['c1'] = {} json_obj['c1']['predict_proba'] = list(svm.predict_proba(test_vectors[idx])[0]) exp = explanations[idx] json_obj['c1']['exp'] = exp json_obj['c1']['data'] = get_pretty_instance(test[idx], exp, vectorizer) json_ret['instances'].append(json_obj) import json open('static/exp2_local.json', 'w').write('data = %s' % json.dumps(json_ret))
def q20():
    X, y = load_data('/Users/pjhades/code/lab/ml/train.dat')
    y = set_binlabel(y, 0)

    # init hit counts
    gammas = [1, 10, 100, 1000, 10000]
    hits = {}
    for gamma in gammas:
        hits[gamma] = 0

    repeat = 100
    for round in range(repeat):
        print('round {0}/{1}'.format(round, repeat), end=', ')
        err_min = 1
        gamma_min = max(gammas) + 1
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=1000)
        for gamma in gammas:
            svm = sklearn.svm.SVC(C=0.1, kernel='rbf', gamma=gamma)
            svm.fit(X_train, y_train)
            err = get_error(svm, X_val, y_val)
            if err < err_min or (err == err_min and gamma < gamma_min):
                err_min = err
                gamma_min = gamma
        hits[gamma_min] += 1
        print('gamma={0}'.format(gamma_min))

    for gamma in gammas:
        print('{0} hits {1} times'.format(gamma, hits[gamma]))
def q15():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')
    y = set_binlabel(y_train, 0)
    svm = sklearn.svm.SVC(C=0.01, kernel='linear')
    svm.fit(X_train, y)
    print(linalg.norm(svm.coef_))
def k_fold_cv(gram_matrix, labels, folds=10, alg='SVM', shuffle=True):
    """ K-fold cross-validation """
    pdb.set_trace()
    scores = []
    preds = []
    loo = sklearn.cross_validation.KFold(len(labels), folds, shuffle=shuffle, random_state=random.randint(0, 100))
    # loo = sklearn.cross_validation.LeaveOneOut(len(labels))
    for train_index, test_index in loo:
        X_train, X_test = gram_matrix[train_index][:, train_index], gram_matrix[test_index][:, train_index]
        y_train, y_test = labels[train_index], labels[test_index]
        if alg == 'SVM':
            svm = sklearn.svm.SVC(kernel='precomputed')
            svm.fit(X_train, y_train)
            preds += svm.predict(X_test).tolist()
            score = svm.score(X_test, y_test)
        elif alg == 'kNN':
            knn = sklearn.neighbors.KNeighborsClassifier()
            knn.fit(X_train, y_train)
            preds += knn.predict(X_test).tolist()
            score = knn.score(X_test, y_test)
        scores.append(score)
    print "Mean accuracy: %f" % (np.mean(scores))
    print "Stdv: %f" % (np.std(scores))
    return preds, scores
def svm_train(X, y, k):
    C_range = 10.0 ** np.arange(-2, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedKFold(y=y, n_folds=k)
    svm = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
    svm.fit(X, y)
    return svm
def svm_liblinear_solver(X, y, C, tol=1e-6, max_iter=100, verbose=False):
    svm = sklearn.svm.LinearSVC(loss='hinge', tol=tol, C=C, verbose=verbose,
                                intercept_scaling=10, max_iter=max_iter)
    now = time.clock()
    svm.fit(X, y)
    res_time = time.clock() - now
    return {'w0': svm.intercept_[0], 'w': svm.coef_.copy()[0], 'time': res_time}
def trainSVM(svm, sv, y):
    print "\ntraining SVM"
    # cross validate 5 times
    scores = cross_val_score(svm, sv, y, cv=5)
    print scores
    # fit the data to the labels
    svm.fit(sv, y)
    return svm
def q16_17():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')
    for goal in [0, 2, 4, 6, 8]:
        y = set_binlabel(y_train, goal)
        svm = sklearn.svm.SVC(C=0.01, kernel='poly', degree=2, coef0=1, gamma=1)
        svm.fit(X_train, y)
        ein = get_error(svm, X_train, y)
        print('{0} vs not {0}, ein={1}'.format(goal, ein), end=', ')
        print('sum of alphas={0}'.format(np.sum(np.abs(svm.dual_coef_))))
def q19():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')
    X_test, y_test = load_data('/Users/pjhades/code/lab/ml/test.dat')
    y_train = set_binlabel(y_train, 0)
    y_test = set_binlabel(y_test, 0)
    for gamma in [10000, 1000, 1, 10, 100]:
        svm = sklearn.svm.SVC(C=0.1, kernel='rbf', gamma=gamma)
        svm.fit(X_train, y_train)
        print('gamma={0:<10}, Eout={1}'.format(gamma, get_error(svm, X_test, y_test)))
def hw1q18(): print "----------------------------------------" print " Homework 1 Question 18 " print "----------------------------------------" Y_train_0 = (Y_train == 0).astype(int) Y_test_0 = (Y_test == 0).astype(int) print "in the training set:" print "n(+) =", np.count_nonzero(Y_train_0 == 1), "n(-) =", np.count_nonzero(Y_train_0 == 0) print "in the test set:" print "n(+) =", np.count_nonzero(Y_test_0 == 1), "n(-) =", np.count_nonzero(Y_test_0 == 0) for C in (0.001, 0.01, 0.1, 1, 10): svm = sklearn.svm.SVC(C=C, kernel="rbf", gamma=100, tol=1e-7, shrinking=True, verbose=False) svm.fit(X_train, Y_train_0) print "----------------------------------------" print "C =", C support = svm.support_ coef = svm.dual_coef_[0] b = svm.intercept_[0] print "nSV =", len(support) Y_predict = svm.predict(X_test) print "in the prediction:" print "n(+) =", np.count_nonzero(Y_predict == 1), "n(-) =", np.count_nonzero(Y_predict == 0) print "E_out =", np.count_nonzero(Y_test_0 != Y_predict) print fig = plt.figure() plt.suptitle("C =" + str(C)) plt.subplot(311) plt.title("Training data: green +, red -") plot_01(X_train, Y_train_0) plt.tick_params(axis="x", labelbottom="off") plt.subplot(312) plt.title("Prediction on test data: green +, red -") plot_01(X_test, Y_predict) plt.tick_params(axis="x", labelbottom="off") plt.subplot(313) plt.title("Support vectors: blue") plt.plot(X_train[:, 0], X_train[:, 1], "r.") plt.plot(X_train[support, 0], X_train[support, 1], "b.") plt.show()
def q18():
    X_train, y_train = load_data('/Users/pjhades/code/lab/ml/train.dat')
    X_test, y_test = load_data('/Users/pjhades/code/lab/ml/test.dat')
    y_train = set_binlabel(y_train, 0)
    y_test = set_binlabel(y_test, 0)
    for C in [0.001, 0.01, 0.1, 1, 10]:
        svm = sklearn.svm.SVC(C=C, kernel='rbf', gamma=100)
        svm.fit(X_train, y_train)
        print('C={0}'.format(C))
        print('# support vectors =', np.sum(svm.n_support_))
        print('Eout =', get_error(svm, X_test, y_test))
def runSVM(self):
    """ Runs the SVM on 5 different splits of cross validation data """
    for train, test in self.kf:
        svm = self.models["SVM"]
        train_set, train_labels = self.getCurrFoldTrainData(train)
        test_set, test_labels = self.getCurrFoldTestData(test)
        svm.fit(train_set, train_labels)
        preds = svm.predict(test_set)
        acc = self.getAccuracy(test_labels, preds)
        print "(SVM) Percent correct is", acc
def hw1q15():
    svm = sklearn.svm.SVC(C=0.01, kernel="linear", shrinking=False, verbose=True)
    X_train_0 = X_train
    Y_train_0 = (Y_train == 0).astype(int)
    svm.fit(X_train_0, Y_train_0)
    w = svm.coef_[0]
    b = svm.intercept_[0]
    print "w =", w
    print "norm(w) =", np.linalg.norm(w, ord=2)
    print "b =", b
def trainTest():
    data2010, labels2010 = read_tac('2010')
    data2011, labels2011 = read_tac("2011")

    # classifiers
    # note: binding the classifier to the name "svm" would shadow the imported svm module
    # inside this function (UnboundLocalError), so the classifier gets its own name
    gnb = naive_bayes.GaussianNB()
    svm_clf = svm.SVC(kernel="linear")
    logReg = linear_model.LogisticRegression()

    gnb.fit(data2010, labels2010)
    svm_clf.fit(data2010, labels2010)
    logReg.fit(data2010, labels2010)

    gnbPrediction = gnb.predict(data2011)
    svmPrediction = svm_clf.predict(data2011)
    logRegPrediction = logReg.predict(data2011)

    gnbAccuracy = accuracy(labels2011, gnbPrediction)
    svmAccuracy = accuracy(labels2011, svmPrediction)
    logRegAccuracy = accuracy(labels2011, logRegPrediction)

    confusionMatrix = metrics.confusion_matrix(labels2011, logRegPrediction)

    print "Results:"
    print "Gaussian Naive Bayes: "
    print gnbAccuracy
    print "Support Vector Machine: "
    print svmAccuracy
    print "Logistic Regression: "
    print logRegAccuracy
    print confusionMatrix

    fh.write("Results:" + "\n")
    fh.write("Gaussian Naive Bayes: " + "\n")
    fh.write(gnbAccuracy + "\n")
    fh.write("Support Vector Machine: " + "\n")
    fh.write(svmAccuracy + "\n")
    fh.write("Logistic Regression: " + "\n")
    fh.write(logRegAccuracy + "\n")
    for i in confusionMatrix:
        fh.write(str(i))
        fh.write("\n")
    fh.write("-------------------------------------------------\n")
    fh.write("\n\n")
def hw1q16(): print "----------------------------------------" print " Homework 1 Question 16 " print "----------------------------------------" # polynomial kernel: (coef0 + gamma * x1.T * x2) ** degree for idx in (0, 2, 4, 6, 8): svm = sklearn.svm.SVC( C=0.01, kernel="poly", degree=2, gamma=1, coef0=1, tol=1e-4, shrinking=True, verbose=False ) Y_train_i = (Y_train == idx).astype(int) svm.fit(X_train, Y_train_i) Y_predict_i = svm.predict(X_train) support = svm.support_ coef = svm.dual_coef_[0] b = svm.intercept_[0] E_in = np.count_nonzero(Y_train_i != Y_predict_i) print "For class %d:" % (idx) print "sum(alpha) =", np.sum(np.abs(coef)) print "b =", b print "E_in =", E_in fig = plt.figure() # plt.suptitle('%d vs rest' % (idx)) plt.subplot(311) plt.title("Training data: green +, red -") plot_01(X_train, Y_train_i) plt.tick_params(axis="x", labelbottom="off") plt.subplot(312) plt.title("Prediction: green +, red -") plot_01(X_train, Y_predict_i) plt.tick_params(axis="x", labelbottom="off") plt.subplot(313) plt.title("Support vectors: blue") plt.plot(X_train[:, 0], X_train[:, 1], "r.") plt.plot(X_train[support, 0], X_train[support, 1], "b.") plt.show()
def trainRBM_SVM(features, Cparam, nComponents):
    [X, Y] = listOfFeatures2Matrix(features)
    rbm = BernoulliRBM(n_components=nComponents, n_iter=30, learning_rate=0.2, verbose=True)
    rbm.fit(X, Y)
    newX = rbm.transform(X)
    # colors = ["r", "g", "b"]
    # for i in range(1, Y.shape[0], 5):
    #     plt.plot(newX[i, :], colors[int(Y[i])])
    # plt.show()
    classifier = {}
    classifier["rbm"] = rbm
    svm = sklearn.svm.SVC(C=Cparam, kernel='linear', probability=True)
    svm.fit(newX, Y)
    classifier["svm"] = svm
    return classifier
def svm_libsvm_solver(X, y, C, tol=1e-6, max_iter=100, verbose=False, gamma=0):
    if gamma == 0:
        svm = sklearn.svm.SVC(C=C, kernel='linear', tol=tol, verbose=verbose, max_iter=max_iter)
    else:
        svm = sklearn.svm.SVC(C=C, kernel='rbf', gamma=gamma, tol=tol, verbose=verbose, max_iter=max_iter)
    now = time.clock()
    svm.fit(X, y)
    res_time = time.clock() - now
    A = np.zeros(X.shape[0])
    A[svm.support_] = np.abs(svm.dual_coef_)
    return {'w0': svm.intercept_[0], 'w': compute_w(X, y, A), 'A': A, 'time': res_time}
def svm_test():
    X_train = np.array([[0, 0], [1, 0], [0, 2], [-2, 0]])
    Y_train = np.array([1, 1, 0, 0])
    svm = sklearn.svm.SVC(C=100000, kernel="linear", shrinking=False, verbose=False)
    svm.fit(X_train, Y_train)
    Y_predict = svm.predict(X_train)
    print Y_predict
    b = svm.intercept_[0]
    print b
    plt.figure()
    plt.suptitle("svm test")
    plt.subplot(211)
    plot_01(X_train, Y_train)
    plt.subplot(212)
    plot_01(X_train, Y_predict)
    plt.plot(X_train[Y_predict == 0, 0], X_train[Y_predict == 0, 1], "ro")
    plt.plot(X_train[Y_predict == 1, 0], X_train[Y_predict == 1, 1], "go")
    plt.show()
def outlier_detection_with_SVM(dataframe, kernel, gamma, outlier_percentage):
    """
    Note that the SVM parameters are highly sensitive to the dataset,
    so they have to be manually selected for each dataset
    """
    assert isinstance(dataframe, DataFrame), "Expected pandas DataFrame, but got %s." % type(dataframe)
    from scipy.stats import scoreatpercentile
    from sklearn import svm

    svm = svm.OneClassSVM(kernel=kernel, gamma=gamma)
    points = dataframe.values
    svm.fit(points)
    assignment = svm.decision_function(points)
    score = scoreatpercentile(assignment.ravel(), 1 - outlier_percentage)
    inliers_idx, dummy = np.where(assignment <= score)
    outliers_idx, dummy = np.where(assignment > score)
    print "%s inliers and %s outliers" % (len(inliers_idx), len(outliers_idx))
    return inliers_idx, outliers_idx
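# A hedged, standalone sketch of the same idea as the function above: score points with a
# one-class SVM and cut at a percentile of the decision values. The synthetic data, the
# parameter values, and the simplified cutoff convention (lowest 5% of scores flagged as
# outliers) are assumptions for illustration, not the author's exact thresholding.
import numpy as np
from scipy.stats import scoreatpercentile
from sklearn import svm as sk_svm

rng = np.random.RandomState(0)
points = np.vstack([rng.randn(200, 2), rng.uniform(-6.0, 6.0, size=(10, 2))])

oc_svm = sk_svm.OneClassSVM(kernel="rbf", gamma=0.1)
oc_svm.fit(points)
scores = oc_svm.decision_function(points).ravel()
cutoff = scoreatpercentile(scores, 5)
outliers_idx = np.where(scores < cutoff)[0]
inliers_idx = np.where(scores >= cutoff)[0]
print(len(inliers_idx), "inliers,", len(outliers_idx), "outliers")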
def train_svm_from_saved_dataset():
    dataset = get_thing_from_file("training_dataset.txt")
    svm = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, n_iter=5, random_state=42))])
    save_thing_to_file(svm, "svm.txt")
    svm = svm.fit(dataset.data, dataset.target)
    save_thing_to_file(svm, "svm_model.txt")
def hw1q19():
    print "----------------------------------------"
    print " Homework 1 Question 19 "
    print "----------------------------------------"
    Y_train_0 = (Y_train == 0).astype(int)
    Y_test_0 = (Y_test == 0).astype(int)
    for gamma in (1, 10, 100, 1000, 10000):
        svm = sklearn.svm.SVC(C=0.1, kernel="rbf", gamma=gamma, tol=1e-7, shrinking=True, verbose=False)
        svm.fit(X_train, Y_train_0)
        print "----------------------------------------"
        print "gamma =", gamma
        Y_predict_0 = svm.predict(X_test)
        print "in the prediction:"
        print "n(+) =", np.count_nonzero(Y_predict_0 == 1), "n(-) =", np.count_nonzero(Y_predict_0 == 0)
        print "E_out =", np.count_nonzero(Y_test_0 != Y_predict_0)
        print
def trainSVM_RBF(features, Cparam):
    '''
    Train a multi-class probabilistic SVM classifier.
    Note:     This function is simply a wrapper to the sklearn functionality for SVM training
              See function trainSVM_feature() to use a wrapper on both the feature extraction
              and the SVM training (and parameter tuning) processes.
    ARGUMENTS:
        - features:     a list ([numOfClasses x 1]) whose elements contain numpy matrices of features
                        each matrix features[i] of class i is [numOfSamples x numOfDimensions]
        - Cparam:       SVM parameter C (cost of constraints violation)
    RETURNS:
        - svm:          the trained SVM variable

    NOTE:
        This function trains an RBF-kernel SVM for a given C value. For a different kernel,
        other types of parameters should be provided.
    '''
    [X, Y] = listOfFeatures2Matrix(features)
    svm = sklearn.svm.SVC(C=Cparam, kernel='rbf', probability=True)
    svm.fit(X, Y)
    return svm
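# A hedged usage sketch for the wrapper above. listOfFeatures2Matrix belongs to the
# surrounding module, so this standalone version reproduces the same sklearn call on
# assumed synthetic per-class feature matrices (the shapes and C value are illustrative).
import numpy as np
import sklearn.svm

class_a = np.random.randn(40, 5)          # [numOfSamples x numOfDimensions] for class 0
class_b = np.random.randn(40, 5) + 2.0    # [numOfSamples x numOfDimensions] for class 1
X = np.vstack([class_a, class_b])
Y = np.concatenate([np.zeros(40), np.ones(40)])

rbf_svm = sklearn.svm.SVC(C=1.0, kernel='rbf', probability=True)
rbf_svm.fit(X, Y)
print(rbf_svm.predict_proba(class_b[:3]))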
def train_svm_from_scratch():
    dataset = datasets.load_files(training_data, encoding='utf-8', decode_error='ignore')
    svm = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, n_iter=5, random_state=42))])
    save_thing_to_file(svm, "svm.txt")
    svm = svm.fit(dataset.data, dataset.target)
    save_thing_to_file(svm, "svm_model.txt")
def hw1q20(): print "----------------------------------------" print " Homework 1 Question 20 " print "----------------------------------------" Y_train_0 = (Y_train == 0).astype(int) C = 0.1 m = len(Y_train_0) gammas = [1, 10, 100, 1000, 10000] counts = [0] * len(gammas) for nrun in range(10): print "run", nrun # generate a random order of m indices arr = np.arange(m) np.random.shuffle(arr) # pick 1000 for cross validation X_curval_0 = X_train[arr[:1000]] Y_curval_0 = Y_train_0[arr[:1000]] X_curtrain_0 = X_train[arr[1000:]] Y_curtrain_0 = Y_train_0[arr[1000:]] E_vals = [0.0] * len(gammas) for i in range(len(gammas)): gamma = gammas[i] svm = sklearn.svm.SVC(C=C, kernel="rbf", gamma=gamma, tol=1e-3, shrinking=True, verbose=False) svm.fit(X_curtrain_0, Y_curtrain_0) Y_curpredict_0 = svm.predict(X_curval_0) E_val = np.count_nonzero(Y_curval_0 != Y_curpredict_0) E_vals[i] = E_val counts[np.argmin(E_vals)] += 1 for i in range(len(gammas)): print "gamma", gammas[i], "got picked", counts[i], "times"
def main(args): # Get the arguments. filename_lars = args.f algoname = args.n alpha = args.a group_splits = args.g assert os.path.isfile(filename_lars) warnings.simplefilter("ignore", ConvergenceWarning) # Read the files. with h5py.File(filename_lars) as f: values = f["values"].value col_index = f["col_index"].value row_ptr = f["row_ptr"].value labels = f["labels"].value m = scipy.sparse.csr_matrix((values, col_index, row_ptr)) if algoname == "forest_garrote" and len(group_splits) == 0: # Do the lasso. coefs = sklearn.linear_model.lasso_path(m, labels, positive=True, max_iter=100, alphas=[alpha])[1] coefs = coefs[:, -1] elif algoname == "l2_svm": # Use an l2 svm. svm = sklearn.svm.LinearSVC(C=1.0, penalty="l2") svm.fit(m, labels) coefs = svm.coef_[0, :] elif algoname == "l1_svm": # Use an l1 svm. svm = sklearn.svm.LinearSVC(C=1.0, penalty="l1", dual=False) svm.fit(m, labels) coefs = svm.coef_[0, :] elif algoname == "forest_garrote" and len(group_splits) > 0: # Make groups and run the forest garrote on each group. group_splits = [0] + group_splits + [m.shape[1]] n_groups = len(group_splits) - 1 if args.n_threads < 1: args.n_threads = cpu_count() n_threads = min(n_groups, args.n_threads) if n_threads == 1: coef_list = [] for i in xrange(n_groups): begin = group_splits[i] end = group_splits[i + 1] sub_m = m[:, begin:end] coefs = sklearn.linear_model.lasso_path(sub_m, labels, positive=True, max_iter=100, alphas=[alpha])[1] coefs = coefs[:, -1] coef_list.append(coefs) coefs = numpy.concatenate(coef_list) coefs = coefs / n_groups else: in_qu = Queue() out_qu = Queue() procs = [ Process(target=lars_worker, args=(in_qu, out_qu, m, labels, alpha)) for _ in xrange(n_threads) ] for i in xrange(n_groups): begin = group_splits[i] end = group_splits[i + 1] in_qu.put((i, begin, end)) for p in procs: in_qu.put(None) p.start() coef_list = [None] * n_groups for i in xrange(n_groups): k, coefs = out_qu.get() coef_list[k] = coefs for p in procs: p.join() coefs = numpy.concatenate(coef_list) coefs = coefs / n_groups # Use an additional l2 svm to refine the weights. nnz = coefs.nonzero()[0] m_sub = m[:, nnz] svm = sklearn.svm.LinearSVC(C=1.0, penalty="l2") svm.fit(m_sub, labels) new_coefs = svm.coef_[0, :] coefs = numpy.zeros(m.shape[1]) coefs[nnz] = new_coefs else: raise Exception("Unknown algorithm: " + algoname) # Save the results. nnz = coefs.nonzero()[0] nnz_coefs = coefs[nnz] with h5py.File(filename_lars) as f: if "result_nnz" in f: del f["result_nnz"] if "result_nnz_coefs" in f: del f["result_nnz_coefs"] f.create_dataset("result_nnz", data=nnz, compression="gzip", compression_opts=5) f.create_dataset("result_nnz_coefs", data=nnz_coefs, compression="gzip", compression_opts=5)
newRows = len(X_train) newCols = len(X_train[0]) newRowst = len(X_test) newColst = len(X_test[0]) newRowsL = len(y_train) Features = chiSqr(X_train, y_train, feat) allFeatures.append(Features) argument = copy.deepcopy(Features) data_fea = CreateDataSet(argument, X_train) # print("New Data Made, rows= ",len(data_fea)," cols= ",len(data_fea[0])) svm.fit(data_fea, y_train) logReg.fit(data_fea, y_train) NaiveB.fit(data_fea, y_train) NearestCen.fit(data_fea, y_train) TestFeatures = chiSqr(X_test, y_test, feat) test_fea = CreateDataSet(TestFeatures, X_test) len_test_fea = len(test_fea) count_svm = 0 count_log = 0 count_nb = 0 count_nc = 0 count = 0 for j in range(0, len_test_fea, 1): predLab_svm = int(svm.predict([test_fea[j]]))
# Find the accuracy on Testing Dataset predicted_label = blrPredict(W, test_data) print('\n Testing set Accuracy:' + str(100 * np.mean((predicted_label == test_label).astype(float))) + '%') print(confusion_matrix(test_label, predicted_label, labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) stop_time_LR = time.time() - start_time_LR print("Time taken for Logistic Regression {}.seconds\n".format(str(stop_time_LR))) # Code for SVM print("Learning SVM Using Linear Kernel") svm = SVC(kernel = 'linear') #train_label = train_label.flatten() indexes = np.random.randint(50000, size = 10000) sample_data = train_data[indexes, :] sample_label = train_label[indexes, :] svm.fit(sample_data, sample_label.flatten()) traning_accuracy = svm.score(train_data, train_label) traning_accuracy = str(100*traning_accuracy) print("Traning data Accuracy for Linear Kernel: {}%\n".format(traning_accuracy)) validation_accuracy = svm.score(validation_data, validation_label) validation_accuracy = str(100*validation_accuracy) print("Validation data Accuracy for Linear Kernel: {}%\n".format(validation_accuracy)) test_accuracy = svm.score(test_data, test_label) test_accuracy = str(100*test_accuracy) print("Test data Accuracy for Linear Kernel: {}%\n".format(test_accuracy)) time_linear_kernel = time.time() - start_time_linear_kernel print("Time taken for SVM using Linear Kernel {}.seconds\n\n\n".format(str(time_linear_kernel)))
print('Accuracy of GNB classifier on training set: {:.2f}'.format( gnb.score(X_train, Y_train))) print('Accuracy of GNB classifier on test set: {:.2f}'.format( gnb.score(X_test, Y_test))) def svc_param_selection(X, y, nfolds): from sklearn import svm import numpy as np GridSearchCV = sklearn.model_selection.GridSearchCV Cs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] # gammas = [0.001, 0.01, 0.1, 1] kernels = ['linear', 'rbf'] param_grid = {'C': Cs, 'kernel': kernels} grid_search = GridSearchCV(svm.SVC(), param_grid, cv=nfolds) grid_search.fit(X, y) return grid_search.best_params_ # Learning params = svc_param_selection(X_train, Y_train, 3) print params svm = SVC(**params) # svm = SVC() svm.fit(X_train, Y_train) print('Accuracy of SVM classifier on training set: {:.2f}'.format( svm.score(X_train, Y_train))) print('Accuracy of SVM classifier on test set: {:.2f}'.format( svm.score(X_test, Y_test)))
def run(subj_id, acq_date, subj_data=None): conf = ul_sens_fmri.config.get_conf() conf.ana = ul_sens_analysis.config.get_conf() cm = np.zeros(( len(conf.ana.roi_names), 2, # pres loc (upper, lower), conf.exp.n_img, conf.exp.n_src_locs, # (above, below) 2 # (above, below) predicted )) for (i_vf, vf) in enumerate(("upper", "lower")): # get the data for this VF subj_data = ul_sens_analysis.mvpa.data.get_mvpa_data( subj_id, acq_date, vf) for (i_roi, roi_name) in enumerate(conf.ana.roi_names): beta_data = subj_data[0][roi_name] loc_data = subj_data[1][roi_name] # beta_data needs to be z-scored # combine the images and source locations together so we can get a # mean and std for each run temp_beta = np.concatenate((beta_data[:, 0, ...], beta_data[:, 1, ...])) run_mean = np.mean(temp_beta, axis=0) run_std = np.std(temp_beta, axis=0) # do the z-scoring beta_data = ((beta_data - run_mean[np.newaxis, np.newaxis, ...]) / run_std[np.newaxis, np.newaxis, ...]) node_k = len(loc_data) for i_test_run in xrange(conf.exp.n_runs): # exclude the current 'test' run i_train_runs = np.setdiff1d(range(conf.exp.n_runs), [i_test_run]) train_data = np.empty( (len(i_train_runs) * conf.exp.n_src_locs * conf.exp.n_img, node_k)) train_data.fill(np.NAN) train_labels = np.empty(train_data.shape[0]) train_labels.fill(np.NAN) i_flat = 0 for i_train_run in i_train_runs: for i_img in xrange(conf.exp.n_img): for (i_sl, sl_label) in enumerate([-1, 1]): train_data[i_flat, :] = beta_data[i_img, i_sl, i_train_run, :] train_labels[i_flat] = sl_label i_flat += 1 svm = sklearn.svm.SVC(kernel="linear") svm.fit(train_data, train_labels) # testing for i_img in xrange(conf.exp.n_img): curr_pred = svm.predict(beta_data[i_img, :, i_test_run, :]) for (true_val, pred_val) in zip([-1, 1], curr_pred): if true_val == -1: i_true = 0 else: i_true = 1 if pred_val == -1: i_pred = 0 else: i_pred = 1 cm[i_roi, i_vf, i_img, i_true, i_pred] += 1 return cm
# from sklearn import KNeighborsClassifier
from sklearn import neighbors

knn = neighbors.KNeighborsClassifier(n_neighbors=100)
knn.fit(x_train, y_train)
prediction = knn.predict(x_test)
print("knn score:", knn.score(x_test, y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, prediction))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, prediction))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, prediction)))

#%% svm
from sklearn import svm

svm = svm.SVC(random_state=1)
svm.fit(x_train, y_train)
prediction_svm = svm.predict(x_test)
print("svm accuracy: ", svm.score(x_test, y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, prediction_svm))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, prediction_svm))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, prediction_svm)))

#%% rf classification
from sklearn import ensemble

rf = ensemble.RandomForestClassifier(n_estimators=10, random_state=1)
rf.fit(x_train, y_train)
prediction_rf = rf.predict(x_test)
print("rf accuracy: ", rf.score(x_test, y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, prediction_rf))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, prediction_rf))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, prediction_rf)))
temp[0][0] = 1 else: temp[0][1] = 1 x_test.append(temp[0]) X_test = [(x, np.empty((0, 2), dtype=np.int)) for x in x_test] print len(x_test) for i in range(len(test_labels)): test_labels = test_labels.astype(int) """ print len(test_labels) pbl = GraphCRF(inference_method='ad3') svm = NSlackSSVM(pbl, C=1,n_jobs = 1,verbose = 1) start = time() print len(X_valid) print len(valid_Y) svm.fit(X_valid, valid_Y) print "fit finished" time_svm = time() - start print X_test[i][0].shape print svm.score(X_valid,valid_Y) print svm.score(X_test,test_Y) y_pred = np.vstack(svm.predict(np.array(X_valid))) print("Score with pystruct crf svm: %f (took %f seconds)" % (np.mean(y_pred == valid_Y), time_svm)) y_predt = np.vstack(svm.predict(np.array(X_test))) print("Score with pystruct crf svm: %f (took %f seconds)" % (np.mean(y_predt == test_Y), time_svm)) #we throw away void superpixels and flatten everything #y_pred, y_true = np.hstack(y_pred), np.hstack(valid_Y)
accuracy_score(test_target, knn_pred)

##################
# Random Forest  #
##################
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500)
rf.fit(train, train_target)
rf_pred = rf.predict(test)

# Confusion Matrix and Accuracy Score
print(confusion_matrix(test_target, rf_pred))
accuracy_score(test_target, rf_pred)

###################
##      SVM      ##
###################
from sklearn import svm

svm = svm.SVC()
svm.fit(train, train_target)
svm_pred = svm.predict(test)

# Confusion Matrix and Accuracy Score
print(confusion_matrix(test_target, svm_pred))
accuracy_score(test_target, svm_pred)
def main(): mnist = fetch_openml(name='mnist_784') echantillon = np.random.randint(70000, size=5000) data = mnist.data[echantillon] target = mnist.target[echantillon] xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.7) classifier = svm.SVC(kernel='linear') classifier.fit(xtrain, ytrain) error = 1 - classifier.score(xtest, ytest) print(f"Score SVM linéaire : {error}") kernels = [] print("Modification du kernel : ") for kernel in ['linear', 'poly', 'rbf', 'sigmoid']: classifier = svm.SVC(kernel=kernel) start_training = time.time() classifier.fit(xtrain, ytrain) final_training = time.time() - start_training start_prediction = time.time() ypred = classifier.predict(xtest) final_prediction = time.time() - start_prediction error = metrics.zero_one_loss(ytest, ypred) kernels.append((kernel, final_training, final_prediction, error)) print(f"\t {kernels[-1]}") kernels_liste = list(zip(*kernels)) plot_fig(kernels_liste) tol = [] print("Evolution de la tolérance : ") for tolerance in np.linspace(0.1, 1.0, num=5): svm = svm.SVC(C=tolerance) start_training = time.time() svm.fit(xtrain, ytrain) final_training = time.time() - start_training start_prediction = time.time() ypred = svm.predict(xtest) final_prediction = time.time() - start_prediction error = metrics.zero_one_loss(ytest, ypred) error_training = svm.score(xtrain, ytrain) tol.append((tolerance, final_training, final_prediction, error, error_training)) print(f"\t {tol[-1]}") tol_list = list(zip(*tol)) plot_fig(tol_list) plt.figure(figsize=(19, 9)) plt.plot(tol_list[0], tol_list[3], 'x-', color='blue') # erreur de test plt.plot(tol_list[0], tol_list[-1], 'x-', color='orange') # erreur d'entrainement plt.grid(True) plt.show() best_kernel = 'rbf' best_tolerance = 1.0 best_svm = svm.SVC(kernel=best_kernel, C=best_tolerance) start_training = time.time() best_svm.fit(xtrain, ytrain) best_final_entrainement = time.time() - start_training start_prediction = time.time() ypred = best_svm.predict(xtest) best_final_prediction = time.time() - start_prediction cross_val = model_selection.cross_val_score(best_svm, data, target, cv=10) meilleure_erreur = 1 - np.mean(cross_val) print(f"Durée de l'entraînement : {best_final_entrainement}") print(f"Durée de la prédiction : {best_final_prediction}") print(f"Erreur : {meilleure_erreur}") cm = confusion_matrix(ytest, ypred) df_cm = pd.DataFrame(cm, columns=np.unique(ytest), index=np.unique(ytest)) df_cm.index.name = 'Valeur réelle' df_cm.columns.name = 'Valeur prédite' plt.figure(figsize=(16, 9)) sn.heatmap(df_cm, cmap="Blues", annot=True) plt.show()
def generate_roc_curves(tr_data, tr_labels, split_number, te_data, te_labels): # zero-one losses for naive bayes (nb) and support vector machine (svm) params_to_keep = set() for j in range(num_features): best_feature = 0 best_loss = 1.0 for k in range(tr_data.shape[1]): if k in params_to_keep: continue svm = LinearSVC(penalty='l2', C=0.5, dual=False) params_to_keep.add(k) lparams = list(params_to_keep) svm.fit(tr_data[:, lparams], tr_labels) preds = svm.predict(tr_data[:, lparams]) loss = zero_one_loss(preds, tr_labels) params_to_keep.discard(k) if (loss <= best_loss): best_feature = k best_loss = loss params_to_keep.add(best_feature) # We now have the best features lparams = list(params_to_keep) nb1 = ClassifierResult('Naive Bayes (L1 features)', [], []) svm1 = SVMClassifierResult('svm:_c_=_1.0', [], [], []) svm2 = SVMClassifierResult('svm:_c_=_0.75', [], [], []) svm3 = SVMClassifierResult('svm:_c_=_0.50', [], [], []) svm4 = SVMClassifierResult('svm:_c_=_0.25', [], [], []) naive = ClassifierResult('Naive Classifier', [], []) ## SVM DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION classifiers = { 'svm:_c_=_1': svm1, 'svm:_c_=_.75': svm2, 'svm:_c_=_.50': svm3, 'svm:_c_=_.25': svm4 } random_state = np.random.RandomState(0) for model_type in classifiers: train_data = tr_data test_data = te_data if model_type == "svm:_c_=_1": model = LinearSVC(penalty='l1', C=1, dual=False) elif model_type == "svm:_c_=_.75": model = LinearSVC(penalty='l1', C=0.75, dual=False) elif model_type == "svm:_c_=_.50": model = LinearSVC(penalty='l1', C=0.50, dual=False) elif model_type == "svm:_c_=_.25": model = LinearSVC(penalty='l1', C=0.25, dual=False) # elif model_type == "naive bayes": # model = MultinomialNB() model.fit(train_data, tr_labels) y_score = model.decision_function(test_data) print(y_score) print("-=-=-") print(te_labels) fpr, tpr, _ = roc_curve(te_labels - 1, y_score) print(fpr, tpr) roc_auc = auc(fpr, tpr) plt.figure() lw = 2 plt.plot(fpr, tpr, color="darkorange", lw=lw, label="ROC Curve Area = {:.4f}".format(roc_auc)) plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") fn = "figures/ROC_" + model_type + ".png" plt.savefig(fn, bbox_inches='tight') plt.clf()
def greedy_subset_svm(tr_data, tr_labels, num_features, split_number, te_data, te_labels): nb1 = ClassifierResult('GS Naive Bayes', [], []) svm1 = GSSVMClassifierResult('GS svm: c = 0.5', [], [], []) naive = ClassifierResult('Naive Classifier', [], []) greed = GreedyResult('Greedy SVM', num_features) for i in range(split_number): print('Fold: ' + str(i)) cv_tr_data,cv_tr_labels,cv_te_data,cv_te_labels\ = split_data(tr_data,tr_labels,split_number,i) # We want to use some of the training data as 'validation' data for picking # the best subset. We will use 90% of the data for training, 10% for validation. cv_training_data = np.array(cv_tr_data[:int(len(cv_tr_data) * .9)]) cv_training_labels = cv_tr_labels[:int(len(cv_tr_labels) * .9)] cv_validation_data = np.array(cv_tr_data[int(len(cv_tr_data) * .9):]) cv_validation_labels = cv_tr_labels[int(len(cv_tr_labels) * .9):] params_to_keep = set() for j in range(num_features): # print('Feature: ' + str(j)) best_feature = 0 best_loss = 1.0 for k in range(cv_training_data.shape[1]): if k in params_to_keep: continue svm = LinearSVC(penalty='l2', C=0.5, dual=False) params_to_keep.add(k) lparams = list(params_to_keep) svm.fit(cv_training_data[:, lparams], cv_training_labels) preds = svm.predict(cv_validation_data[:, lparams]) loss = zero_one_loss(preds, cv_validation_labels) params_to_keep.discard(k) if (loss <= best_loss): best_feature = k best_loss = loss params_to_keep.add(best_feature) greed.losses[j] = greed.losses[j] + [best_loss] # We now have the best features lparams = list(params_to_keep) svm = LinearSVC(penalty='l2', C=0.5, dual=False) svm.fit(cv_training_data[:, lparams], cv_training_labels) # Use the real cross validation testing data now to get an accurate loss preds = svm.predict(cv_te_data[:, lparams]) loss = zero_one_loss(preds, cv_te_labels) svm1.zero_one_loss += [loss] params = [0 for x in range(cv_training_data.shape[1])] coefs = svm.coef_.ravel() for i in range(0, len(lparams)): params[lparams[i]] = coefs[i] svm1.params += [params] svm1.columns += [lparams] preds = svm.predict(te_data[:, lparams]) loss = zero_one_loss(preds, te_labels) svm1.test_loss += [loss] nb = MultinomialNB() nb.fit(cv_training_data[:, lparams], cv_training_labels) preds = nb.predict(cv_te_data[:, lparams]) loss = zero_one_loss(preds, cv_te_labels) nb1.zero_one_loss += [loss] preds = nb.predict(te_data[:, lparams]) loss = zero_one_loss(preds, te_labels) nb1.test_loss += [loss] # Naive preds = [2 for x in range(len(cv_te_labels))] loss = zero_one_loss(preds, cv_te_labels) naive.zero_one_loss += [loss] preds = [2 for x in range(len(te_labels))] loss = zero_one_loss(preds, te_labels) naive.test_loss += [loss] return nb1, svm1, naive, greed
def k_fold_cross_validation(tr_data, tr_labels, split_number, te_data, te_labels): # zero-one losses for naive bayes (nb) and support vector machine (svm) nb1 = ClassifierResult('Naive Bayes (L1 features)', [], []) svm1 = SVMClassifierResult('svm: c = 1.0', [], [], []) svm2 = SVMClassifierResult('svm: c = 0.75', [], [], []) svm3 = SVMClassifierResult('svm: c = 0.50', [], [], []) svm4 = SVMClassifierResult('svm: c = 0.25', [], [], []) naive = ClassifierResult('Naive Classifier', [], []) for i in range(split_number): cv_tr_data,cv_tr_labels,cv_te_data,cv_te_labels\ = split_data(tr_data,tr_labels,split_number,i) ## SVM DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION svm = LinearSVC(penalty='l1', C=1.0, dual=False) svm.fit(cv_tr_data, cv_tr_labels) preds = svm.predict(cv_te_data) loss = zero_one_loss(preds, cv_te_labels) svm1.zero_one_loss += [loss] svm1.params += [svm.coef_.ravel()] preds = svm.predict(te_data) loss = zero_one_loss(preds, te_labels) svm1.test_loss += [loss] ## SVM_C1 DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION svm_c1 = LinearSVC(penalty='l1', C=0.75, dual=False) svm_c1.fit(cv_tr_data, cv_tr_labels) preds = svm_c1.predict(cv_te_data) loss = zero_one_loss(preds, cv_te_labels) svm2.zero_one_loss += [loss] svm2.params += [svm.coef_.ravel()] preds = svm_c1.predict(te_data) loss = zero_one_loss(preds, te_labels) svm2.test_loss += [loss] ## SVM_C2 DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION svm_c2 = LinearSVC(penalty='l1', C=.50, dual=False) svm_c2.fit(cv_tr_data, cv_tr_labels) preds = svm_c2.predict(cv_te_data) loss = zero_one_loss(preds, cv_te_labels) svm3.zero_one_loss += [loss] svm3.params += [svm.coef_.ravel()] preds = svm_c2.predict(te_data) loss = zero_one_loss(preds, te_labels) svm3.test_loss += [loss] ## SVM_C3 DECLARED HERE TO BE INIT-ed WITH THE DATA FOR CROSS VALIDATION svm_c3 = LinearSVC(penalty='l1', C=0.25, dual=False) svm_c3.fit(cv_tr_data, cv_tr_labels) preds = svm_c3.predict(cv_te_data) loss = zero_one_loss(preds, cv_te_labels) svm4.zero_one_loss += [loss] svm4.params += [svm.coef_.ravel()] preds = svm_c3.predict(te_data) loss = zero_one_loss(preds, te_labels) svm4.test_loss += [loss] nb = MultinomialNB() params_to_use = [ i for i, x in enumerate(svm_c2.coef_.ravel()) if x != 0 ] nb.fit(cv_tr_data[:, list(params_to_use)], cv_tr_labels) preds = nb.predict(cv_te_data[:, list(params_to_use)]) loss = zero_one_loss(preds, cv_te_labels) nb1.zero_one_loss += [loss] preds = nb.predict(te_data[:, list(params_to_use)]) loss = zero_one_loss(preds, te_labels) nb1.test_loss += [loss] # Naive preds = [2 for x in range(len(cv_te_labels))] loss = zero_one_loss(preds, cv_te_labels) naive.zero_one_loss += [loss] preds = [2 for x in range(len(te_labels))] loss = zero_one_loss(preds, te_labels) naive.test_loss += [loss] return nb1, svm1, svm2, svm3, svm4, naive
        # decision boundary vector hyperplane
        db1 = hyperplane(hyp_x_min, self.w, self.b, 0)
        db2 = hyperplane(hyp_x_max, self.w, self.b, 0)
        self.ax.plot([hyp_x_min, hyp_x_max], [db1, db2], 'y--')

        plt.show()


data_dict = {-1: np.array([[1, 7], [2, 8], [3, 8]]),
             1: np.array([[5, 1], [6, -1], [7, 3]])}

svm = Support_Vector_Machine()
svm.fit(data=data_dict)

predict_us = [[0, 10], [1, 3], [3, 4], [3, 5], [5, 5], [5, 6], [6, -5], [5, 8]]

for p in predict_us:
    print(svm.predict(p))

svm.visualize()
def PCA(x):
    cov_x = np.cov(x.T)
    u, s, v = np.linalg.svd(cov_x)
    k = 2
    proj = u[:, 0:k]
    pca_x = np.matmul(x, proj)
    return pca_x


x_train_std = PCA(x_train_std)
x_test_std = PCA(x_test_std)

svm = svm.SVC(kernel='linear', probability=True)
svm.fit(x_train_std, y_train)
predict_y = svm.predict(x_test_std)
label_y = y_test
print(predict_y)
print(label_y)
correct = (label_y == predict_y).astype(int)
correct_rate = np.mean(correct)
print('Correct test rate', correct_rate)


def plot_decision_boundary(X, y, clf, test_ind=None, resolution=0.02):
    '''
    x: 2D array, size [batch, features] , features = 2
y = self.inputs["in1"] self.results["out0"] = (x == y) # Load iris dataset iris = datasets.load_iris() X = iris.data[:, :2] y = iris.target # Setup data feeder component feeder = brica1.ConstantComponent() feeder.make_out_port("out0", 2) # Setup components svm = SVMComponent(2) svm.fit(X, y) RFC = RandomForestClassifierComponent(2) RFC.fit(X, y) SR = SVMvsRFC_Component(1) # Connect the components brica1.connect((feeder, "out0"), (svm, "in0")) brica1.connect((feeder, "out0"), (RFC, "in0")) brica1.connect((svm, "out0"), (SR, "in0")) brica1.connect((RFC, "out0"), (SR, "in1")) # Add components to module mod = brica1.Module() mod.add_component("feeder", feeder)
test_tfidf = tfidf_transformer.transform(test_data)

# 2 train Naive Bayes
multi_naive_bayes = MultinomialNB()
multi_naive_bayes.fit(train_tfidf, train_labels)

# 3 test Naive Bayes
test_predicted_nb = multi_naive_bayes.predict(test_tfidf)

# 4 Print results split by category (false/true positives, false/true negatives)
# confusion matrix
print_results("Naive Bayes", test_predicted_nb, test_labels)

# 2 train SVM
svm = svm.SVC(C=1000)
svm.fit(train_tfidf, train_labels)

# 3 test SVM
test_predicted_svm = svm.predict(test_tfidf)

# 4 Print results split by category (false/true positives, false/true negatives)
print_results("SVM", test_predicted_svm, test_labels)

# 2 train Passive Aggressive
pa = PassiveAggressiveClassifier(C=0.5, random_state=5)
pa.fit(train_tfidf, train_labels)

# 3 test Passive Aggressive
test_predicted_pa = pa.predict(test_tfidf)

# 4 Print results split by category (false/true positives, false/true negatives)
iris = datasets.load_iris() X = iris.data y = iris.target ##变为2分类 X, y = X[y != 2], y[y != 2] #print(X) # Add noisy features to make the problem harder random_state = np.random.RandomState(0) n_samples, n_features = X.shape X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] # shuffle and split training and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,random_state=0) # Learn to predict each class against the other svm = svm.SVC(kernel='linear', probability=True,random_state=random_state) ###通过decision_function()计算得到的y_score的值,用在roc_curve()函数中 y_score = svm.fit(X_train, y_train).decision_function(X_test) # Compute ROC curve and ROC area for each class fpr,tpr,threshold = roc_curve(y_test, y_score) ###计算真正率和假正率 roc_auc = auc(fpr,tpr) ###计算auc的值 plt.figure() lw = 2 plt.figure(figsize=(10,10)) plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假正率为横坐标,真正率为纵坐标做曲线 plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right")
print '- loading the vocabulary' voc = joblib.load(CONST_DICTIONARY_PATH) print '- setting vocabulary to extractor' extract_bow.setVocabulary(voc) if CONST_RETRAIN_MODEL == 1: print '- creating feature_extraction data' traindata, trainlabels = [], [] for i in range(213): # 20 traindata.extend(bow_features(path(pos, i))); trainlabels.append(1) traindata.extend(bow_features(path(neg, i))); trainlabels.append(-1) print '- feature_extraction the model' svm = svm.NuSVC(nu=0.5, kernel='rbf', gamma=0.1, probability=True) svm.fit(np.array(traindata), np.array(trainlabels)) joblib.dump(svm, CONST_SVM_MODEL_PATH, compress=3) else: print '- loading the model' svm = joblib.load(CONST_SVM_MODEL_PATH) predictions = [] tot_geral_pos, tot_geral_neg = 0, 0 tot_pos_pos, tot_pos_neg = 0, 0 tot_neg_pos, tot_neg_neg = 0, 0 #pos=0 print '- testing the model (pos) -> ' + CONST_TEST_DATA_POS_PATH for file in os.listdir(CONST_TEST_DATA_POS_PATH): if file != '.DS_Store': tot_geral_pos += 1 obj_predict = predict(CONST_TEST_DATA_POS_PATH + file)
X, y = X[y != 2], y[y != 2] # Add noisy features to make the problem harder random_state = np.random.RandomState(0) n_samples, n_features = X.shape X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] # shuffle and split training and test sets X_train, X_test, y_train, y_test = cross_validation.train_test_split( X, y, test_size=.3, random_state=0) # Learn to predict each class against the other svm = svm.SVC(kernel='linear', probability=True, random_state=random_state) # 通过decision_function() 返回wx+b 计算得到的y_score的值,用在roc_curve()函数中 svm_model = svm.fit(X_train, y_train) y_score = svm_model.decision_function(X_test) # Compute ROC curve and ROC area for each class # fp rate:原本是错的,预测是对的比例(越小越好,0是理想) # tp rate:原本是对的,预测为对的比例(越大越好,1是理想) # 分类别 0,1 0.36 < 0.5 => 0 fpr, tpr, threshold = roc_curve(y_test, y_score) # 计算真正率和假正率 roc_auc = auc(fpr, tpr) # 计算auc的值 print(roc_auc) plt.figure() lw = 2 plt.figure(figsize=(10, 10)) plt.plot(fpr, tpr,
def _evaluate(self, Gs, Gs_kwargs, num_gpus): minibatch_size = num_gpus * self.minibatch_per_gpu # Construct TensorFlow graph for each GPU. result_expr = [] for gpu_idx in range(num_gpus): with tf.device('/gpu:%d' % gpu_idx): Gs_clone = Gs.clone() # Generate images. latents = tf.random_normal([self.minibatch_per_gpu] + Gs_clone.input_shape[1:]) labels = self._get_random_labels_tf(self.minibatch_per_gpu) dlatents = Gs_clone.components.mapping.get_output_for( latents, labels, **Gs_kwargs) images = Gs_clone.get_output_for(latents, None, **Gs_kwargs) # Downsample to 256x256. The attribute classifiers were built for 256x256. if images.shape[2] > 256: factor = images.shape[2] // 256 images = tf.reshape(images, [ -1, images.shape[1], images.shape[2] // factor, factor, images.shape[3] // factor, factor ]) images = tf.reduce_mean(images, axis=[3, 5]) # Run classifier for each attribute. result_dict = dict(latents=latents, dlatents=dlatents[:, -1]) for attrib_idx in self.attrib_indices: classifier = misc.load_pkl(classifier_urls[attrib_idx]) logits = classifier.get_output_for(images, None) predictions = tf.nn.softmax( tf.concat([logits, -logits], axis=1)) result_dict[attrib_idx] = predictions result_expr.append(result_dict) # Sampling loop. results = [] for begin in range(0, self.num_samples, minibatch_size): self._report_progress(begin, self.num_samples) results += tflib.run(result_expr) results = { key: np.concatenate([value[key] for value in results], axis=0) for key in results[0].keys() } # Calculate conditional entropy for each attribute. conditional_entropies = defaultdict(list) for attrib_idx in self.attrib_indices: # Prune the least confident samples. pruned_indices = list(range(self.num_samples)) pruned_indices = sorted( pruned_indices, key=lambda i: -np.max(results[attrib_idx][i])) pruned_indices = pruned_indices[:self.num_keep] # Fit SVM to the remaining samples. svm_targets = np.argmax(results[attrib_idx][pruned_indices], axis=1) for space in ['latents', 'dlatents']: svm_inputs = results[space][pruned_indices] try: svm = sklearn.svm.LinearSVC() svm.fit(svm_inputs, svm_targets) svm.score(svm_inputs, svm_targets) svm_outputs = svm.predict(svm_inputs) except: svm_outputs = svm_targets # assume perfect prediction # Calculate conditional entropy. p = [[ np.mean([ case == (row, col) for case in zip(svm_outputs, svm_targets) ]) for col in (0, 1) ] for row in (0, 1)] conditional_entropies[space].append(conditional_entropy(p))
'Delhi Capitals', 'Chennai Super Kings', 'Gujarat Lions', 'Rising Pune Supergiant', 'Pune Warriors India', 'Kochi Tuskers Kerala', 'Deccan Chargers'] teams.sort() teams_le = le.fit_transform(teams) #Test Teain Split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.30, random_state=1008) #--------------------------------------SVM ''' svm = svm.SVC(gamma = 'scale') svm.fit(x_train, y_train) y_pred = svm.predict(x_test) print(f1_score(y_test,y_pred, average='micro')) #--------------------------------------Random Forest classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0) classifier.fit(x_train, y_train) y_pred1 = classifier.predict(x_test) print(f1_score(y_test,y_pred1,average='micro')) ''' #--------------------------------------XGBoosr xg = XGBClassifier()
pred = model.predict(x_cols) nb_max = pd.crosstab(pred, y_cols) print('NB 분류정확도') print(nb_max) acc = (nb_max.ix[0,0]+nb_max.ix[1,1])/len(test_set) print("acc = ",acc) # SVM model x_cols = train_set[cols[1:]] # terms y_cols = train_set[cols[0]] # sms_type svm = svm.SVC(kernel = 'linear') model = svm.fit(x_cols, y_cols) x_cols = test_set[cols[1:]] # terms y_cols = test_set[cols[0]] # sms_type pred = model.predict(x_cols) svm_max = pd.crosstab(pred, y_cols) print("SVM 분류정확도") print(svm_max) acc=(svm_max.ix[0,0]+svm_max.ix[1,1])/len(test_set) print("acc = ",acc) ''' [1 rows x 6823 columns] NB 분류정확도
print '' print 'Report Logistic Regression:' print(classification_report(y_val, y_pred)) # 2.- Support Vector Machine: (https://es.wikipedia.org/wiki/M%C3%A1quinas_de_vectores_de_soporte) #Import the model: from sklearn import svm #If you want to change some of this hiperparameters see: #http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html svm = svm.SVC() print 'Hiperparameters defined in SVM:' print '' print svm print '' #Train the model: svm.fit(X_train, y_train) #Make predictions in validation set: y_pred = svm.predict(X_val) #Create the confussion Matrix: #[[True Negatives, False Negatives]] #[[False Positives, True Positives]] print 'Confusion Matrix SVM:' print '' print confusion_matrix(y_pred, y_val) print '' print 'Report SVM:' print '' print(classification_report(y_val, y_pred))
def train_svm_regression(features, labels, c_param, kernel='linear'):
    svm = sklearn.svm.SVR(C=c_param, kernel=kernel)
    svm.fit(features, labels)
    train_err = np.mean(np.abs(svm.predict(features) - labels))
    return svm, train_err
vectorizer = CountVectorizer(ngram_range=args.ngrange, stop_words=stop_words, vocabulary=vectorizer.vocabulary_, binary=args.onehot, analyzer='word', token_pattern=r'\b[^\W\d]+\b') Trained_Vectors = vectorizer.fit_transform(training_corpus).toarray() print "Features Used in this iteration were:" print vectorizer.vocabulary_.keys() print "\n" test_vectors = vectorizer.transform(test_corpus).toarray() #Training each Classifier svm_clf = svm.fit(Trained_Vectors, train_labels) knn_clf = knn.fit(Trained_Vectors, train_labels) dt_clf = dt.fit(Trained_Vectors, train_labels) gnb_clf = gnb.fit(Trained_Vectors, train_labels) lr_clf = lr.fit(Trained_Vectors, train_labels) #Making Predictions svm_predictions = svm_clf.predict(test_vectors) knn_predictions = knn_clf.predict(test_vectors) dt_predictions = dt_clf.predict(test_vectors) gnb_predictions = gnb_clf.predict(test_vectors) lr_predictions = lr_clf.predict(test_vectors) svm_accuracy_list.append(calc_acc(svm_predictions)) knn_accuracy_list.append(calc_acc(knn_predictions)) dt_accuracy_list.append(calc_acc(dt_predictions))
for image_path, descriptor in des_list_test[1:]: descriptors_test = np.vstack((descriptors_test, descriptor)) #Initialize an SVM classifier svm = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, probability=True, degree=2, gamma='auto', kernel='rbf', verbose=False) clf = svm.fit(descriptors_train, y[train]) ##Accuracy score = clf.score(descriptors_test, y[test]) cvscores.append(score) print score ##Confusion matrix conf1 = confusion_matrix(y[test], clf.predict(descriptors_test)) conf = conf + conf1 ####ROC curve probas_ = clf.fit(descriptors_train, y[train]).predict_proba(descriptors_test) fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
X_test.loc[seg_id, 'kurt'] = kurtosis(x) X = X_train.copy() y = y_train.copy() train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1) # 正则化X scaler = StandardScaler() scaler.fit(X_train) X_train_scaled = scaler.transform(X_train) X_test_scaled = scaler.transform(X_test) print("Feature engineering ends.") print("SVM...") svm = NuSVR() svm.fit(X_train_scaled, y_train.values.flatten()) y_pred_svm = svm.predict(X_train_scaled) score = mean_absolute_error(y_train.values.flatten(), y_pred_svm) print(f'Score: {score:0.3f}') y_pred_svm = svm.predict(X_test_scaled) submission['time_to_failure'] = y_pred_svm submission.to_csv('submission_svm.csv') print("SVM ends.") print("LightGBM...") folds = KFold(n_splits=5, shuffle=True, random_state=42) params = { 'objective': "regression", 'boosting': "gbdt", 'metric': "mae", 'boost_from_average': "false",
def trainSVMregression_rbf(Features, Y, Cparam):
    svm = sklearn.svm.SVR(C=Cparam, kernel='rbf')
    svm.fit(Features, Y)
    train_err = numpy.mean(numpy.abs(svm.predict(Features) - Y))
    return svm, train_err
# Get a numpy ndarray of the pixels
labels = dataset_train[[0]].values.ravel()
train = dataset_train.iloc[:, 1:].values
test = dataset_test.values

# Get PCA
start_time_pca = time.time()
# Want PCA to have explained_variance_ratio_ > 0.9 (as 0 < num < 1)
pca = PCA(n_components=0.8, whiten=True)
train_pca = pca.fit_transform(train)
end_time_pca = time.time()

# Create and train the SVM
start_time_svm = time.time()
svm = svm.SVC(C=10.0, verbose=True)
svm.fit(train_pca, labels)
end_time_svm = time.time()

# Make the prediction on test, but first apply the fitted PCA
test_pca = pca.transform(test)
pred = svm.predict(test_pca)

# Save as a dataset
np.savetxt('./data/solution_svm.csv',
           np.c_[range(1, len(test) + 1), pred],
           delimiter=',',
           header='ImageId,Label',
           comments='',
           fmt='%d')
#res = cv2.resize(img,(250,250)) res = img gray_image = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY) xarr = np.squeeze(np.array(gray_image).astype(np.float32)) m, v = cv2.PCACompute(xarr, mean=np.array([])) arr = np.array(v) flat_arr = arr.ravel() training_set.append(flat_arr) training_labels.append(label) print 'done ', root trainData = np.float32(training_set) responses = training_labels #svm = cv2.SVM() svm = sklearn.svm.LinearSVC(C=1.0, random_state=0) svm.fit(trainData, responses) #svm.save('svm_data.dat') print 'training done!' print 'testing...' path = 'test/' testing_set = [] testing_labels = [] for root, dirs, files in os.walk(path): for name in files: if name.endswith((".png")): if (os.path.getsize(root + str('/') + name)) != 0: label = root.split('/')[1] img = cv2.imread(root + str('/') + name)