def main():
    tweets_fname = 'your_tweets.txt'
    labels_fname = 'your_labels.txt'  # fill in with your own tweets and labels
    dictionary = get_words(tweets_fname)
    feature_vectors = get_features(tweets_fname, dictionary)
    labels = get_vectors(labels_fname)
    first_few_features = feature_vectors[:560]
    first_few_labels = labels[:560]
    last_few_features = feature_vectors[560:]
    last_few_labels = labels[560:]
    k = 5
    # this finds how accurate your measure is
    # metrics = ["accuracy", "f1_score", "auroc", "precision",
    #            "sensitivity", "specificity"]
    # for metric in metrics:
    #     print(str(metric) + ": ")
    #     c, gamma = get_rbf(first_few_features, first_few_labels, k, metric)
    #     print("best c: " + str(c) + ", best gamma: " + str(gamma))
    #     rbf_clf = SVC(kernel='rbf', C=c, gamma=gamma)
    #     rbf_clf.fit(first_few_features, first_few_labels)
    #     perf, lower, upper = get_confidence_interval(rbf_clf, last_few_features,
    #                                                  last_few_labels, metric)
    #     print("performance: " + str(perf) + ", lower: " + str(lower) + ", upper: " + str(upper))
    X_test = get_features('your_other_tweets.txt', dictionary)
    clf = SVC(kernel='rbf', gamma=0.01, C=100, probability=True)
    clf.fit(feature_vectors, labels)
    y_pred = np.sign(clf.decision_function(X_test))
    print(y_pred)
def condition_on_grades(user="******"):
    c = new_conn.cursor()
    models = [None, None, None, None, None, None]
    for i in range(6):
        c.execute('SELECT easiness, ret_reps, ret_reps_since_lapse, lapses, pred_grade, acq_reps from discrete_log where user_id="%s" and grade=%d' % (user, i))
        x_train = np.array(c.fetchall())
        c.execute('SELECT interval_bucket from discrete_log where user_id="%s" and grade=%d' % (user, i))
        y_train = np.array(c.fetchall())[:, 0]
        clf = SVC()
        clf.fit(x_train, y_train)
        print clf.score(x_train, y_train)
        models[i] = clf
    print "====================="
    c.execute('SELECT user_id from (select user_id, count(distinct grade) as cnt from discrete_log group by user_id) where cnt = 6 limit 5')
    users = [row[0] for row in c.fetchall()]
    scores = [0, 0, 0, 0, 0, 0]
    for user in users:
        for i in range(6):
            c.execute('SELECT easiness, ret_reps, ret_reps_since_lapse, lapses, pred_grade, acq_reps from discrete_log where user_id="%s" and grade=%d' % (user, i))
            x_train = np.array(c.fetchall())
            c.execute('SELECT interval_bucket from discrete_log where user_id="%s" and grade=%d' % (user, i))
            y_train = np.array(c.fetchall())[:, 0]
            scores[i] += models[i].score(x_train, y_train)
    for i in range(6):
        scores[i] /= len(users)
        print scores[i]
def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target
    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert_equal(len(rfe.ranking_), X.shape[1])
    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)
    assert_equal(X_r.shape, iris.data.shape)
    assert_array_almost_equal(X_r[:10], iris.data[:10])
    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert_equal(rfe.score(X, y), clf.score(iris.data, iris.target))
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
def multi_SVM(needcv=False):
    NeedReFetch = NEED_REFETCH
    OnlyNeedReGenerate = ONLY_NEED_REGENERATE
    allGenreSongsTrain, allGenreSongsTest = fetchData_CM(NUM_NEED_PER_GENRE, GENRES, NeedReFetch, OnlyNeedReGenerate, USED_GENRES)
    # allGenreSongsTrain, allGenreSongsTest = featureSelection(allGenreSongsTrain, allGenreSongsTest, method='mean', testmode=True, n_features_to_select=4)
    # assert(len(allGenreSongsTrain[0][0]) == 106)
    TrainX = []
    TrainY = []
    TestX = []
    TestY = []
    for i in range(sum(USED_GENRES)):
        for j in allGenreSongsTrain[i]:
            TrainX.append(j)
            TrainY.append(i)
        for k in allGenreSongsTest[i]:
            TestX.append(k)
            TestY.append(i)
    confuseMat = [[0 for i in range(sum(USED_GENRES))] for j in range(sum(USED_GENRES))]
    if not needcv:
        print "Start SVM training ... "
        model = SVC(probability=True, decision_function_shape='ovo', kernel='rbf', gamma=0.0078125, C=8)
        model.fit(TrainX, TrainY)
        print "Start SVM predicting ... "
        PredY = model.predict(TestX)
        for i in range(len(TestY)):
            confuseMat[TestY[i]][PredY[i]] += 1
        print(clfr(TestY, PredY))
    else:
        tuned_parameters = [  # parameter grids left to play with
            {'kernel': ['rbf'], 'gamma': [2**i for i in range(-8, 8)], 'C': [2**i for i in range(-8, 8)]},
            # {'kernel': ['linear'], 'C': [2**i for i in range(-8, 9, 2)]},
            # {'kernel': ['poly'], 'gamma': [2**i for i in range(-8, 9, 2)], 'C': [2**i for i in range(-8, 9, 2)], 'degree': [2, 3, 4]},
        ]
        print "Start SVM CV ... "
        clf = GSCV(SVC(decision_function_shape='ovo'), tuned_parameters, cv=5)
        clf.fit(TrainX, TrainY)
        print("Best parameters set found on development set:")
        print(clf.best_params_)
        # print("Grid scores on development set:")
        # print()
        # for params, mean_score, scores in clf.grid_scores_:
        #     print("%0.4f (+/-%0.03f) for %r" % (mean_score, scores.std(), params))
        # print()
        print "Start SVM predicting ... "
        PredY = clf.predict(TestX)
        print(clfr(TestY, PredY))
        for i in range(len(TestY)):
            confuseMat[TestY[i]][PredY[i]] += 1
    return confuseMat
def buildAndEvaluateSvm(p_kernel):
    model = SVC(kernel=p_kernel)
    model.fit(features, target)
    expected = target
    predicted = model.predict(features)
    print("SCORE: %f" % metrics.roc_auc_score(expected, predicted))
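# A minimal follow-up sketch (not from the original source): roc_auc_score is
# normally fed continuous scores rather than hard class predictions, otherwise
# the ROC "curve" collapses to a single operating point. SVC exposes such
# scores via decision_function; make_classification supplies stand-in data.
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
demo_model = SVC(kernel='rbf').fit(X_demo, y_demo)
demo_scores = demo_model.decision_function(X_demo)  # signed margin distances
print("SCORE (from decision values): %f" % metrics.roc_auc_score(y_demo, demo_scores))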
class SVMPredictor(object):
    """
    A simple application of SVM classifier
    @author: Shaun
    """
    def __init__(self):
        self.clf = SVC(probability=True)

    @abstractmethod
    def fit(self, X, y):
        """
        Method to fit the model.
        Parameters:
            X - 2d numpy array of training data
            y - 1d numpy array of training labels
        """
        self.clf = self.clf.fit(X, y)

    @abstractmethod
    def predict(self, X):
        """
        Method to apply the model to data.
        Parameters:
            X - 2d numpy array of test data
        """
        return self.clf.predict_proba(X)[:, 1]
def main():
    print 'MIFS'
    filename = ['../data/arcene.mat', '../data/gisette.mat', '../data/madelon.mat']
    for f_num in range(len(filename)):
        print filename[f_num]
        mat = scipy.io.loadmat(filename[f_num])
        X = mat['X']  # data
        y = mat['Y']  # label
        y = y[:, 0]
        X = X.astype(float)
        n_sample, n_features = X.shape
        # split data
        ss = cross_validation.KFold(n_sample, n_folds=10, shuffle=True)
        # choose SVM as the classifier
        clf = SVC()
        # cast to int so the values can be used as slice bounds
        num_fea = np.linspace(5, 300, 60).astype(int)
        correct = np.zeros(len(num_fea))
        for train, test in ss:
            # select features
            F = MIFS.mifs(X[train], y[train], n_selected_features=300)
            for n in range(len(num_fea)):
                fea_idx = F[0:num_fea[n]]
                features = X[:, fea_idx]
                clf.fit(features[train], y[train])
                y_predict = clf.predict(features[test])
                acc = accuracy_score(y[test], y_predict)
                correct[n] += acc
        correct = correct.astype(float)
        correct /= 10
        for i in range(len(num_fea)):
            print num_fea[i], correct[i]
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    # gamma=0.0 was old scikit-learn shorthand for 1/n_features; recent
    # versions require gamma='auto' for the same behaviour
    svm = SVC(C=1000000.0, gamma=0.0, kernel='rbf')
    svm.fit(X, y)
    return svm
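# Hedged usage sketch for train_svm above, on stand-in data (not from the
# original source). Recent scikit-learn rejects gamma=0.0, so this variant
# passes gamma='auto', which reproduces the old 1/n_features behaviour.
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)               # hypothetical features
y_demo = (X_demo[:, 0] > 0).astype(int)  # hypothetical labels
svm_demo = SVC(C=1000000.0, gamma='auto', kernel='rbf')
svm_demo.fit(X_demo, y_demo)
print(svm_demo.score(X_demo, y_demo))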
def cvalidate():
    from sklearn import cross_validation
    trainset = np.genfromtxt(open('train', 'r'), delimiter=' ')
    targetset = np.genfromtxt(open('target', 'r'), delimiter=' ')
    X = np.array([x[0:64] for x in trainset])
    y = np.array([x for x in targetset])
    # print X, y
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=0)
    X_train, X_test = decomposition_pca(X_train, X_test)
    c_range = 10.0 ** np.arange(6.5, 7.5, 1)
    gamma_range = 10.0 ** np.arange(-2.5, 0.5, 1)
    # parameters = {'kernel': ['rbf'], 'C': c_range}
    parameters = {'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}
    svr = SVC(kernel='rbf', C=0.72, gamma=0.299)
    # clf = grid_search.GridSearchCV(svr, parameters)
    # print clf.estimator
    # clf = Pipeline([('scale', Scaler()), ('svm', SVC())])
    svr.fit(X_train, y_train)
    print svr.score(X_test, y_test)
def get_classes_accury(data, target, test_times=10, test_size=0.1):
    target_list = list(set(target))
    target_list.sort()
    scores = np.zeros((test_times, len(target_list)))
    for t in range(test_times):
        clf = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                  decision_function_shape=None, degree=3, gamma='auto',
                  kernel='linear', max_iter=-1, probability=False,
                  random_state=None, shrinking=True, tol=0.001, verbose=False)
        Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size=test_size, random_state=t)
        clf.fit(Xtrain, ytrain)
        print(t, clf.score(Xtest, ytest))
        pre = clf.predict(Xtest)
        for i, c in enumerate(target_list):
            s = np.logical_and(pre == c, ytest == c).sum() / (ytest == c).sum()
            scores[t, i] = s
    ##### generate a Markdown table of the per-class scores
    print('|' + 'class' + '|' + '|'.join([str(i) for i in target_list]) + '|')
    print('|' + '-' + '|')
    for i, score in enumerate(scores):
        print('|' + str(i) + '|' + '|'.join(['{:.4f}'.format(_) for _ in score]) + '|')
    print('|' + 'max' + '|' + '|'.join(['{:.4f}'.format(_) for _ in scores.max(axis=0)]) + '|')
    print('|' + 'min' + '|' + '|'.join(['{:.4f}'.format(_) for _ in scores.min(axis=0)]) + '|')
    print('|' + 'mean' + '|' + '|'.join(['{:.4f}'.format(_) for _ in scores.mean(axis=0)]) + '|')
    return scores
class PcaGmm(BaseEstimator):
    def __init__(self, X_all, pca_components=12, gmm_components=4,
                 covariance_type="full", min_covar=0.1, gamma=0, C=1.0):
        self.pca_components = pca_components
        self.gmm_components = gmm_components
        self.covariance_type = covariance_type
        self.min_covar = min_covar
        self.gamma = gamma
        self.C = C
        self.X_all = X_all
        X_all = X_all[:, :pca_components]
        self.gmm = GMM(n_components=gmm_components,
                       covariance_type=covariance_type,
                       min_covar=min_covar)
        self.gmm.fit(X_all)

    def fit(self, X, y):
        X = X[:, :self.pca_components]
        X = self.gmm.predict_proba(X)
        self.svm = SVC(C=self.C, gamma=self.gamma)
        self.svm.fit(X, y)

    def predict(self, X):
        X = X[:, :self.pca_components]
        return self.svm.predict(self.gmm.predict_proba(X))

    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def transform(self, X, y=None):
        X = X[:, :self.pca_components]
        return self.gmm.predict_proba(X)

    def __str__(self):
        return "PCA(%d)-GMM(%d, %s, %f)-SVM(C=%f, gamma=%f)" % (
            self.pca_components, self.gmm_components, self.covariance_type,
            self.min_covar, self.C, self.gamma)
def predict(self):
    if self.stage != Stage.SIMILARITY:
        print 'similarity has not been computed yet'
    import numpy as np
    utiles.clearList(self.predictPC)
    utiles.clearList(self.evaluation)
    for i in range(len(self.forest)):
        similarityPCList = []
        for sim in self.similarity[i]:
            similarityPCList.append(int(sim.powerConsume * 100000))
        similarity_X = np.array(self.similarityList[i])
        similarity_y = np.array(similarityPCList)
        # print similarity_y
        predictModel = SVC()
        predictModel.fit(similarity_X, similarity_y)
        forest_X = np.array(self.forestList[i])
        forest_X = forest_X.reshape(1, -1)
        predicted = predictModel.predict(forest_X)
        self.forest[i].powerConsume = float(predicted[0] / 100000.0)
        self.predictPC.append(float(predicted[0] / 100000.0))
    for i in range(len(self.expect)):
        self.evaluation.append((self.expect[i] - self.predictPC[i]) / self.expect[i])
def get_optimize_result(training_data, validation_data, important_cols_result):
    """ Get the number of cols that gets the best score """
    last_score = 0.0
    new_score = 0.0
    number_of_cols = 1
    decreases = 0
    optimal_result = {'score': 0.0, 'number_of_cols': 1}
    # Extract labels from data frames
    training_data_label, training_data = separate_labels(training_data)
    validation_data_label, validation_data = separate_labels(validation_data)
    while True:
        cols = important_cols_result.index[0:number_of_cols]
        # Fit models and test
        clf = SVC()
        clf.fit(training_data.iloc[:, cols], training_data_label)
        predictions = clf.predict(validation_data.iloc[:, cols])
        new_score = accuracy_score(validation_data_label, predictions)
        print(new_score)
        # keep the best (highest) score seen so far
        if new_score > optimal_result['score']:
            optimal_result['score'] = new_score
            optimal_result['number_of_cols'] = number_of_cols
            print(optimal_result)
        if last_score > new_score:
            decreases += 1
            if decreases > 5:
                break
        last_score = new_score
        number_of_cols += 5
    cols = important_cols_result.index[0:number_of_cols]
    print(optimal_result)
    export_optimal_result(training_data.iloc[:, cols].columns)
def test_cv(self):
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_folds = 2
    # = With EPAC
    wf = CV(SVC(kernel="linear"), n_folds=n_folds,
            reducer=ClassificationReport(keep=True))
    r_epac = wf.top_down(X=X, y=y)
    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
        # idx_train, idx_test = cv.__iter__().next()
        X_train = X[idx_train, :]
        X_test = X[idx_test, :]
        y_train = y[idx_train]  # y is 1-d, so no second axis to index
        clf.fit(X_train, y_train)
        r_sklearn.append(clf.predict(X_test))
    # = Comparison
    key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
    for icv in range(n_folds):
        comp = np.all(np.asarray(r_epac[0][key2cmp]) ==
                      np.asarray(r_sklearn[0]))
        self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")
    # test reduce
    r_epac_reduce = wf.reduce().values()[0][key2cmp]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u"Diff CV: EPAC reduce")
def test_perm(self):
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_perms = 2
    rnd = 0
    # = With EPAC
    wf = Perms(SVC(kernel="linear"), n_perms=n_perms, permute="y",
               random_state=rnd, reducer=None)
    r_epac = wf.top_down(X=X, y=y)
    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for perm in Permutations(n=y.shape[0], n_perms=n_perms, random_state=rnd):
        y_p = y[perm]  # y is 1-d, so no second axis to index
        clf.fit(X, y_p)
        r_sklearn.append(clf.predict(X))
    key2cmp = "y" + conf.SEP + conf.PREDICTION
    # = Comparison
    for iperm in range(n_perms):
        comp = np.all(np.asarray(r_epac[iperm][key2cmp]) ==
                      np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC vs sklearn")
    # test reduce
    for iperm in range(n_perms):
        r_epac_reduce = wf.reduce().values()[iperm][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC reduce")
def main():
    images, labels = load_labeled_training(flatten=True)
    images = standardize(images)
    unl = load_unlabeled_training(flatten=True)
    unl = standardize(unl)
    test = load_public_test(flatten=True)
    test = standardize(test)
    shuffle_in_unison(images, labels)
    # d = DictionaryLearning().fit(images)
    d = MiniBatchDictionaryLearning(n_components=500, n_iter=500, verbose=True).fit(images)
    s = SparseCoder(d.components_)
    proj_test = s.transform(images)
    pt = s.transform(test)
    # kpca = KernelPCA(kernel="rbf")
    # kpca.fit(unl)
    # test_proj = kpca.transform(images)
    # pt = kpca.transform(test)
    # spca = SparsePCA().fit(unl)
    # test_proj = spca.transform(images)
    # pt = spca.transform(test)
    svc = SVC()
    scores = cross_validation.cross_val_score(svc, proj_test, labels, cv=10)
    print scores
    print np.mean(scores)
    print np.var(scores)
    svc.fit(proj_test, labels)
    pred = svc.predict(pt)
    write_results(pred, '../svm_res.csv')
def svm_solver(train_data, train_label, validation, test, dimreduce, convertbinary):
    """ """
    logging.info('begin to train the svm classifier')
    # train_data = train_data[:100, :]
    # validation = validation[:100, :]
    # test = test[:100, :]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # print new_train_data.shape
    train_data, validation, test = convertbinary(train_data, validation, test)
    """
    svc = SVC()
    params_rbf = {"kernel": ['rbf'],
                  "class_weight": ['auto'],
                  "C": [0.1, 0.2, 0.3, 0.5, 1, 2, 3, 5, 10],
                  "gamma": [0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.5],
                  "tol": 10.0 ** -np.arange(1, 5),
                  "random_state": [1000000007]}
    logging.info("Hyperparameter optimization using RandomizedSearchCV...")
    rand_search_result = RandomizedSearchCV(svc, param_distributions=params_rbf,
                                            n_jobs=-1, cv=3, n_iter=30)
    # rand_search_result = GridSearchCV(svc, param_grid=params_rbf, n_jobs=8, cv=3)
    rand_search_result.fit(train_data, train_label)
    params = tools.report(rand_search_result.grid_scores_)
    """
    params = {'kernel': 'poly', 'C': 0.1, 'random_state': 1000000007,
              'tol': 0.001, 'gamma': 0.1, 'class_weight': 'auto'}
    svc = SVC(probability=True, **params)
    svc.fit(train_data, train_label)
    evaluate.get_auc(svc.predict_proba(validation)[:, 1])
    return svc.predict_proba(test)[:, 1]
def classify_chushikoku():
    u"""Apply the worry-detection algorithm to students in the Chugoku-Shikoku region."""
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    data = pd.read_csv("no_taigaku_train4.csv", sep=",", header=0)
    train = data.values
    x = train[:, 2:]
    y = train[:, 1]
    data_t = pd.read_csv("no_taigaku_chushikoku.csv", sep=",", header=0)
    test = data_t.values
    x_test = test[:, 1:]
    name_test = test[:, 0]
    print name_test
    print len(x_test)
    best_forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion="gini",
                                         max_depth=5, max_features="sqrt", max_leaf_nodes=None,
                                         min_samples_leaf=1, min_samples_split=3,
                                         min_weight_fraction_leaf=0.001, n_estimators=30, n_jobs=-1,
                                         oob_score=False, random_state=None, verbose=0, warm_start=False)
    best_svm = SVC(C=100, kernel="linear", gamma=0.001)
    clf = best_svm.fit(x, y)
    label_predict = clf.predict(x_test)
    print name_test[label_predict == "chutai"]
    print len(name_test[label_predict == "chutai"])
def train_model(data, seed=None):
    '''
    Train the model on the sample.
    Arguments:
        data - DataFrame with the training sample
        seed - seed for the pseudo-random number generator
    Returns:
        the trained model
    '''
    # Shuffle the sample:
    if seed is not None:
        np.random.seed(seed)
    data = data.iloc[np.random.permutation(len(data))]
    # Take all rows whose target value is 1,
    # and exactly as many rows whose target value is 0:
    pos_data = data[data["similar"] == 1.0]
    neg_data = data[data["similar"] == 0.0][:len(pos_data)]
    data = ps.concat([pos_data, neg_data])
    print("Training sample size: {}".format(len(data)))
    # Split out the features and the target values:
    X = data.drop(["id1", "id2", "similar"], axis=1)
    y = data["similar"]
    # Train the model:
    # model = LogisticRegression(penalty='l1', tol=0.28, random_state=seed)
    # model = RandomForestClassifier(60, 'entropy', 7, random_state=seed)
    # model = KNeighborsClassifier(28, "uniform")
    model = SVC(C=0.1, kernel='poly', random_state=seed)
    model.fit(X, y)
    # print(X.columns, '\n', model.feature_importances_)
    return model
def columnSvm(data, columns):
    loo = cross_validation.LeaveOneOut(len(data))
    totalAccurateNum = 0
    singleExpression = [0] * 7
    singleExpressionAccurate = [0] * 7
    eachAccuracy = [0.0] * 7
    for i in range(1, 8):
        singleExpression[i - 1] = sum(data[:, 43] == i)
    for trainIndex, testIndex in loo:
        xTrain, yTrain = data[trainIndex, 0:43], data[trainIndex, 43]
        xTest, yTest = data[testIndex, 0:43], data[testIndex, 43]
        xTrain = xTrain[:, columns]
        xTest = xTest[:, columns]
        svm = SVC(kernel="rbf", C=4.0, gamma=0.01)
        svm.fit(xTrain, yTrain)
        yPredict = svm.predict(xTest)
        if yTest == yPredict:
            totalAccurateNum += 1
            singleExpressionAccurate[yTest[0] - 1] += 1
        else:
            print yTest, yPredict
    print "total accuracy: ", float(totalAccurateNum) / len(data)
    for i in range(0, 7):
        eachAccuracy[i] = float(singleExpressionAccurate[i]) / singleExpression[i]
        print i + 1, eachAccuracy[i]
    return totalAccurateNum, eachAccuracy
def svm_grid_search():
    # get data
    training_input, training_target, validation_input, validation_target = prepare_input()
    # set up scorer for grid search. log-loss is error, not score, so set greater_is_better
    # to false, and log-loss requires a probability
    log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
    training_input = training_input[:100000]
    training_target = training_target[:100000]
    print training_input.shape[0]
    print training_target.shape[0]
    start = time.time()
    svm = SVC(random_state=31, probability=True)
    svm_parameters = {'C': [.001, .01, .1, 1, 10, 100], 'kernel': ["rbf", "sigmoid"]}
    svm_grid_obj = GridSearchCV(svm, svm_parameters, log_loss_scorer, verbose=2, n_jobs=-1)
    svm_grid_obj = svm_grid_obj.fit(training_input, training_target)
    svm = svm_grid_obj.best_estimator_
    print "Best params: " + str(svm_grid_obj.best_params_)
    svm_train_error = log_loss(training_target, svm.predict_proba(training_input))
    svm_validation_error = log_loss(validation_target, svm.predict_proba(validation_input))
    print "Best SVM training error: {:02.4f}".format(svm_train_error)
    print "Best SVM validation error: {:02.4f}".format(svm_validation_error)
    end = time.time()
    print "SVM grid search took {:02.4f} seconds".format(end - start)
    return svm
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    testing_file = file('test.p', 'r')
    training_file = file('train.p', 'r')
    train = pickle.load(training_file)
    test = pickle.load(testing_file)
    testing_file.close()
    training_file.close()
    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]
    print '[INFO, time: %s] Downsampling ...' % (time.strftime('%H:%M:%S'))
    trainX = downsample_features(trainX)
    testX = downsample_features(testX)
    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'SVM - rbf kernel (i.e. gaussian) with default parameters')
    clf = SVC()
    clf.fit(trainX, trainy)
    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction))
def main():
    np.random.seed(1234)
    # read the tweets and their labels
    dictionary = extract_dictionary('../data/tweets.txt')
    X = extract_feature_vectors('../data/tweets.txt', dictionary)
    y = read_vector_file('../data/labels.txt')
    metric_list = ["accuracy", "f1_score", "auroc"]
    ### ========== TODO : START ========== ###
    # part 1: split data into training (training + cross-validation) and testing set
    s1 = slice(0, 560, 1)
    s2 = slice(560, 630, 1)
    training_data = X[s1]
    test_data = X[s2]
    training_label = y[s1]
    test_label = y[s2]
    # part 2: create stratified folds (5-fold CV)
    kf = StratifiedKFold(training_label, n_folds=5)
    # part 2: for each metric, select optimal hyperparameter for linear-kernel SVM using CV
    for metric in metric_list:
        val = select_param_linear(training_data, training_label, kf, metric)
        print("max c: " + str(val))
    # part 3: train linear-kernel SVMs with selected hyperparameters
    Model = SVC(kernel='linear', C=10)
    Model.fit(training_data, training_label)
    # part 3: report performance on test data
    for metric in metric_list:
        perf = performance_test(Model, test_data, test_label, metric)
        print 'Performance of ' + str(metric) + ': ' + str(perf)
def classify_rbf_tf(train_vector, train_label):
    classifier_RBFSvc_tf = SVC(kernel='rbf', gamma=2)
    rbf_clf_tf = classifier_RBFSvc_tf.fit(train_vector, train_label)
    save_clf2 = open("RBFSvcTf.pickle", "wb")
    pickle.dump(rbf_clf_tf, save_clf2)
    save_clf2.close()
    return rbf_clf_tf
def classify_poly_tf(train_vector, train_label):
    classifier_polySvc_tf = SVC(kernel='poly', degree=2, gamma=2)
    poly_clf_tf = classifier_polySvc_tf.fit(train_vector, train_label)
    save_clf3 = open("polySvcTf.pickle", "wb")
    pickle.dump(poly_clf_tf, save_clf3)
    save_clf3.close()
    return poly_clf_tf
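# Hedged counterpart to the pickling pattern above (not from the original
# source): restoring one of the saved classifiers before prediction. The
# filename matches the one written by classify_poly_tf; binary mode mirrors
# the "wb" used when dumping.
import pickle

def load_classifier(path="polySvcTf.pickle"):
    with open(path, "rb") as f:
        return pickle.load(f)

# clf = load_classifier()
# predictions = clf.predict(test_vector)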
def experiment_three_feature_selection(svm, train_set, test_set, c_star):
    # Find the weight vectors
    weight_vectors = svm.coef_
    accuracies = np.zeros(58)
    # Sort the indexes of the weight vectors, then flip to largest to smallest.
    sorted_index = np.argsort(weight_vectors)
    reverse_index = np.fliplr(sorted_index)[0]
    # Randomizing the indices for the experiment
    np.random.shuffle(reverse_index)
    for i in range(2, len(reverse_index) + 1):
        selector = reverse_index[:i]
        split_train = train_set[:, selector]
        split_test = test_set[:, selector]
        # Create the new SVM
        SVM_one = SVC(kernel='linear', C=c_star, probability=True)
        SVM_one.fit(split_train, train_set[:, -1])
        new_predictions = SVM_one.predict(split_test)
        # Calculate the accuracy, precision, and recall of the SVM
        accuracy = metrics.accuracy_score(test_set[:, -1], new_predictions)
        accuracies[i] = accuracy
        # Was not entirely sure if this part was needed.
        # Reshuffled this way every time it was a randomized set of m features.
        # np.random.shuffle(reverse_index)
    return accuracies
def classify_linear_tf(train_vector, train_label):
    classifier_linearSvc_tf = SVC(kernel='linear')
    linear_clf_tf = classifier_linearSvc_tf.fit(train_vector, train_label)
    save_clf1 = open("linearSvcTf.pickle", "wb")
    pickle.dump(linear_clf_tf, save_clf1)
    save_clf1.close()
    return linear_clf_tf
def fitall(X, y):
    m1 = RandomForestClassifier(n_estimators=500)
    m2 = LogisticRegression()
    m3 = SVC(probability=True)
    lib = [m1, m2, m3]
    m4 = sl.SuperLearner(lib, loss="nloglik")
    return m1.fit(X, y), m2.fit(X, y), m3.fit(X, y), m4.fit(X, y)
def experiment_two_feature_selection(svm, train_set, test_set, c_star):
    # Find the weight vectors
    weight_vectors = svm.coef_
    # Absolute these for argsort to accurately find the max to min vector
    weight_vectors = np.absolute(weight_vectors)
    accuracies = np.zeros(58)
    # Sort the indexes of the weight vectors, then flip to largest to smallest.
    sorted_index = np.argsort(weight_vectors)
    reverse_index = np.fliplr(sorted_index)[0]
    # Print for the Experiment 2 to find best 5 features.
    print("Top 5 weight vectors for Experiment 2: ", reverse_index[:5])
    for i in range(2, len(reverse_index) + 1):
        selector = reverse_index[:i]
        split_train = train_set[:, selector]
        split_test = test_set[:, selector]
        # Create the new SVM
        SVM_one = SVC(kernel='linear', C=c_star, probability=True)
        SVM_one.fit(split_train, train_set[:, -1])
        new_predictions = SVM_one.predict(split_test)
        # Calculate the accuracy, precision, and recall of the SVM
        accuracy = metrics.accuracy_score(test_set[:, -1], new_predictions)
        accuracies[i] = accuracy
    return accuracies
def svc_classifier(X_train, categories, X_test, test_categories):
    from sklearn.svm import SVC
    svm_classifier = SVC(C=100, gamma=0.1).fit(X_train, categories)
    y_svm_predicted = svm_classifier.predict(X_test)
    print '\n Here is the classification report for support vector machine classifier:'
    print metrics.classification_report(test_categories, y_svm_predicted)
    to_latex(test_categories, y_svm_predicted)
def train(args):
    print("train call")
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = map(itemgetter(1),
                 map(os.path.split,
                     map(os.path.dirname, labels)))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [
            {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
            {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
        ]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)
    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()
    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN([embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # i/p nodes, hidden nodes, o/p nodes
                  learn_rates=0.3,
                  # Smaller steps mean a possibly more accurate result, but the
                  # training will take longer
                  learn_rate_decays=0.9,
                  # a factor the initial learning rate will be multiplied by
                  # after each iteration of the training
                  epochs=300,  # no. of iterations
                  # dropouts=0.25,  # Express the percentage of nodes that
                  # will be randomly dropped as a decimal.
                  verbose=1)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])
    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'w') as f:
        pickle.dump((le, clf), f)
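# Hedged inference sketch matching the (le, clf) tuple pickled by train()
# above (not from the original source); `reps` stands in for a real batch of
# face embeddings, and the 128-dim size is an assumption.
import pickle
import numpy as np

with open("classifier.pkl", "rb") as f:
    le, clf = pickle.load(f)

reps = np.zeros((1, 128))  # placeholder embedding row
probs = clf.predict_proba(reps)[0]
print(le.inverse_transform([np.argmax(probs)])[0], probs.max())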
# summary of the model prediction
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :', accuracy_score(y_pred, y_test))

"""### **Support Vector Machine(SVM)**"""

# Support Vector Machine (SVM)
# importing the library
from sklearn.svm import SVC

# creating local variable classifier
classifier = SVC(kernel='linear', random_state=0)

# Training the model
classifier.fit(X_train, y_train)

# predicting the value of y
y_pred = classifier.predict(X_test)

# importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# summary of the model prediction
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# accuracy score of the model
t0 = time()
x_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
          decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
          tol=0.001, verbose=False)
clf = clf.fit(x_train_pca, y_train)
# clf = cv2.createFisherFaceRecognizer()
# clf.train(x_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
# print(clf.best_estimator_)

# Save the classifier
joblib.dump(clf, "recognition_clf.pkl", compress=3)

###############################################################################
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
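# Self-contained sketch of the decision-region plot the truncated snippet
# above is building, on synthetic stand-in data (not the original dataset).
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=200, n_features=2, n_redundant=0,
                                     n_clusters_per_class=1, random_state=0)
clf_demo = SVC(kernel='rbf', random_state=0).fit(X_demo, y_demo)
X1, X2 = np.meshgrid(np.arange(X_demo[:, 0].min() - 1, X_demo[:, 0].max() + 1, 0.01),
                     np.arange(X_demo[:, 1].min() - 1, X_demo[:, 1].max() + 1, 0.01))
Z = clf_demo.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, Z, alpha=0.75, cmap=ListedColormap(('red', 'green')))
for label, color in zip(np.unique(y_demo), ('red', 'green')):
    plt.scatter(X_demo[y_demo == label, 0], X_demo[y_demo == label, 1],
                c=color, edgecolors='k', label=label)
plt.legend()
plt.show()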
y, test_size=0.2, random_state=0)

## Data without information about depth
X_train_ND, X_test_ND = np.delete(arr=X_train, obj=[0, 4, 6], axis=1), \
                        np.delete(arr=X_test, obj=[0, 4, 6], axis=1)

####### II: Classification #######

# Define Classifiers
nb = GaussianNB()
knn = KNeighborsClassifier()
svc = SVC(probability=True)

## Fit Classifiers without depth:
fit_nb_ND = nb.fit(X_train_ND, y_train)
fit_knn_ND = knn.fit(X_train_ND, y_train)
fit_svc_ND = svc.fit(X_train_ND, y_train)

# Predict with Classifiers
## Save methods in dict to iterate over them.
methods = {"Naive Bayes": nb, "KNN": knn, "SVM": svc}

## With Depth:
accuracies = []
precisions = []
for method_name, method in methods.items():
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=6, max_features=7)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc))
print(classification_report(y_test, pred_rfc))
print(accuracy_score(y_test, pred_rfc))

rfc.fit(X_train_all, y_train_all)
pred_all_rfc = rfc.predict(X_test_all)
sub_rfc = pd.DataFrame()
sub_rfc['PassengerId'] = df_test['PassengerId']
sub_rfc['Survived'] = pred_all_rfc
# sub_rfc.to_csv('randforest.csv', index=False)

from sklearn.svm import SVC
svc = SVC(gamma=0.01, C=100)  # , probability=True)
svc.fit(X_train_sc, y_train_sc)
pred_svc = svc.predict(X_test_sc)
print(confusion_matrix(y_test_sc, pred_svc))
print(classification_report(y_test_sc, pred_svc))
print(accuracy_score(y_test_sc, pred_svc))

svc.fit(X_train_all_sc, y_train_all_sc)
pred_all_svc = svc.predict(X_test_all_sc)
sub_svc = pd.DataFrame()
sub_svc['PassengerId'] = df_test['PassengerId']
sub_svc['Survived'] = pred_all_svc
sub_svc.to_csv('svc.csv', index=False)
##### splitting data into train and test set
x_train, x_test, y_train, y_test = train_test_split(data['cleaned_text'],
                                                    data['labels'],
                                                    test_size=0.2,
                                                    random_state=10)

############### fit frequency based word embeddings to our data set to turn text into word vectors
vectorizer = TfidfVectorizer(lowercase=True, stop_words=STOPWORDS)
vectorizer.fit(x_train)
x_train_vect = vectorizer.transform(x_train)
x_test_vect = vectorizer.transform(x_test)

############# Build our classifier with Linear Support vector machine
model = SVC(C=1, kernel='linear', class_weight='balanced')
model.fit(x_train_vect, y_train)
y_pred = model.predict(x_test_vect)
cm = confusion_matrix(y_test, y_pred)  ########## confusion matrix for test set

pipeline = make_pipeline(vectorizer, model)  #### save our model with pipeline function for future analysis

def predict(text):
    score = pipeline.predict([clean_text(text)])
    return score
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the SVM to the Training Set
from sklearn.svm import SVC
cl = SVC(kernel='linear', random_state=0)
cl.fit(X_train, Y_train)

# Predicting the test set results
y_pred = cl.predict(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, Y_set = X_train, Y_train
X1, X2 = np.meshgrid(
feats = []
humor = []
for key in dict.keys():
    value = dict[key]
    feats.append(value[0].tolist())
    humor.append(value[1].tolist())
feats = np.array(feats)
humor = np.array(humor)

if options.clf == 'GaussianProc':
    clf = GaussianProcessClassifier()
elif options.clf == "SVC":
    clf = SVC()
elif options.clf == "LinearSVC":
    clf = LinearSVC(max_iter=10000, dual=False)
elif options.clf == "DecisionTree":
    clf = DecisionTreeClassifier()
elif options.clf == "RandomForest":
    clf = RandomForestClassifier()
elif options.clf == "AdaBoost":
    clf = AdaBoostClassifier(n_estimators=100)
elif options.clf == "XGBoost":
    clf = XGBClassifier()
elif options.clf == "KNN":
    clf = KNeighborsClassifier(n_neighbors=5)
elif options.clf == "GaussianNB":
    clf = GaussianNB()
elif options.clf == "RBF":
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

print ' '
print '============================='
print 'Bernoulli SVC Classifier:'
classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
classifierBi.classify_many(test)
for pdist in classifierBi.prob_classify_many(test):
    print pdist.prob('human'), pdist.prob('auto')
for i in range(len(classifierBi.classify_many(test))):
    print classifierBi.classify_many(test)[i]
classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
classifierSVC.classify_many(test)
# svc = nltk.classify.accuracy(classifierSVC, test_set)
# print 'accuracy is %.2f' % round(svc * 100, 4), '%'

def SVC():  # note: this local function shadows the imported sklearn SVC
    classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
    return classifierSVC.classify_many(test)

print "Performance of running Bernoulli SVC Classifier on test set: ", timeit.timeit(
    "SVC", setup="from __main__ import SVC", number=1)

print ' '
print '============================='
print 'Linear SVC Classifier:'
classifierLinSVC = SklearnClassifier(LinearSVC(),
# Provided to give you a starting point. Try a variety of classifiers.

# Stratified ShuffleSplit cross-validator.
# Provides train/test indices to split data in train/test sets.
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
# which returns stratified randomized folds. The folds are made by preserving
# the percentage of samples for each class.

# NaiveBayes
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()

# SVM
from sklearn.svm import SVC
svm_clf = SVC()

# DecisionTree
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()

# RandomForest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=25)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ab_clf = AdaBoostClassifier()
def get_res(x_train, y_train, x_test, y_test):
    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    lg = LogisticRegression(penalty='l2')
    lg.fit(x_train, y_train)
    dtc = DecisionTreeClassifier()
    dtc.fit(x_train, y_train)
    gb = GradientBoostingClassifier(n_estimators=200)
    gb.fit(x_train, y_train)
    ab = AdaBoostClassifier()
    ab.fit(x_train, y_train)
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    svm = SVC()
    svm.fit(x_train, y_train)
    mnb = MultinomialNB(alpha=0.01)
    mnb.fit(x_train, y_train)
    bnb = BernoulliNB(alpha=1.0, binarize=0.31, fit_prior=True, class_prior=None)
    bnb.fit(x_train, y_train)
    rtc = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=47)
    rtc.fit(x_train, y_train)
    num_list = [
        knn.score(x_test, y_test), lg.score(x_test, y_test),
        dtc.score(x_test, y_test), gb.score(x_test, y_test),
        ab.score(x_test, y_test), gnb.score(x_test, y_test),
        svm.score(x_test, y_test), mnb.score(x_test, y_test),
        bnb.score(x_test, y_test), rtc.score(x_test, y_test)
    ]
    name_list = [
        'KNN', 'Logistic', 'DecisionTree', 'GradientBoosting', 'AdaBoost',
        'GaussianNB', 'SVC', 'MultinomialNB', 'BernoulliNB', 'RandomForest'
    ]
    plt.title('title')
    num_list = np.around(num_list, decimals=3)
    autolabel(
        plt.bar(range(len(num_list)), num_list, color='rb',
                tick_label=name_list, width=0.4))
    plt.show()
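# Hedged helper sketch: get_res above calls autolabel(), which is not shown in
# this excerpt. A common implementation annotates each bar with its height;
# this version is an assumption, not the original helper.
def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0, height,
                 '{:.3f}'.format(height), ha='center', va='bottom')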
nifti_masker = NiftiMasker(mask_img=mask_filename, sessions=session,
                           smoothing_fwhm=4, standardize=True,
                           memory="nilearn_cache", memory_level=1)
func_filename = haxby_dataset.func[0]
X = nifti_masker.fit_transform(func_filename)
# Restrict to non rest data
X = X[condition_mask]
session = session[condition_mask]

###########################################################################
# Build the decoder that we will use

# Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel
from sklearn.svm import SVC
svc = SVC(kernel='linear')

# Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on F-test,
# namely Anova. We set the number of features to be selected to 500
from sklearn.feature_selection import SelectKBest, f_classif
feature_selection = SelectKBest(f_classif, k=500)

# We have our classifier (SVC), our feature selection (SelectKBest), and now,
# we can plug them together in a *pipeline* that performs the two operations
# successively:
from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

###########################################################################
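# A minimal, self-contained sketch of the same SelectKBest + SVC pipeline on
# synthetic stand-in data (the Haxby fMRI data above requires nilearn
# downloads); it shows that the combined object fits and scores like any
# single estimator.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=100, n_features=1000,
                                     n_informative=20, random_state=0)
pipe_demo = Pipeline([('anova', SelectKBest(f_classif, k=500)),
                      ('svc', SVC(kernel='linear'))])
pipe_demo.fit(X_demo, y_demo)
print(pipe_demo.score(X_demo, y_demo))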
# In[ ]:

from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=100)
logmodel.fit(X_train, y_train)
ypred = logmodel.predict(X_test)
print(logmodel.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *4. SVM*

# In[ ]:

from sklearn.svm import SVC
modelsvc = SVC(probability=True, gamma='auto')
modelsvc.fit(X_train, y_train)
ypred = modelsvc.predict(X_test)
print(modelsvc.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *6. Decision Tree*

# In[ ]:

from sklearn.tree import DecisionTreeClassifier
dmodel = DecisionTreeClassifier()
dmodel.fit(X_train, y_train)
ypred = dmodel.predict(X_test)
print(dmodel.score(X_train, y_train))
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# bankdata = bankdata.apply(le.fit_transform)
droplist = ['class']
X = bankdata.drop(droplist, axis=1)
y = bankdata['class']

# the algorithm proper starts here; everything above just prepares the input CSV
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
# labels = np.unique(X); print(labels)

from sklearn.svm import SVC
clf = SVC()  # kernel='rbf'
# clf = SVC(kernel='poly', degree=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# y_pred = svclassifier.predict(X_test)
#
# from sklearn.metrics import classification_report, confusion_matrix
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))
def main():
    st.title("Binary Classification Web App")
    st.sidebar.title("Binary Classification Web App")
    st.markdown("Are your mushrooms edible or poisonous? 🍄")
    st.sidebar.markdown("Are your mushrooms edible or poisonous? 🍄")

    # st.cache:
    # until and unless the function name and arguments are changed, the data is
    # cached, and the cached data is reused on rerun.
    # Label Encoding:
    # refers to converting the labels into numeric form so as to convert them
    # into machine-readable form. Machine learning algorithms can then decide
    # in a better way how those labels must be operated on. It is an important
    # pre-processing step for a structured dataset in supervised learning.
    @st.cache(persist=True)
    def load_data():
        data = pd.read_csv("mushrooms.csv")
        labelencoder = LabelEncoder()
        for col in data.columns:
            data[col] = labelencoder.fit_transform(data[col])
        # st.write(data)  # to check the dataset after label encoding
        return data

    @st.cache(persist=True)
    def split(df):
        y = df.type
        x = df.drop(columns=['type'])
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
        return x_train, x_test, y_train, y_test

    def plot_metrics(metrics_list):
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model, x_test, y_test, display_labels=class_names)
            st.pyplot()
        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test)
            st.pyplot()
        if 'Precision-Recall Curve' in metrics_list:
            st.subheader('Precision-Recall Curve')
            plot_precision_recall_curve(model, x_test, y_test)
            st.pyplot()

    df = load_data()
    class_names = ['edible', 'poisonous']  # for confusion matrix
    x_train, x_test, y_train, y_test = split(df)

    # take user input of hyperparameters
    st.sidebar.subheader("Choose Classifier")
    classifier = st.sidebar.selectbox("Classifier", ("Support Vector Machine (SVM)", "Logistic Regression", "Random Forest"))

    if classifier == 'Support Vector Machine (SVM)':
        st.sidebar.subheader("Model Hyperparameters")
        # choose parameters
        C = st.sidebar.number_input("C (Regularization parameter)", 0.01, 10.0, step=0.01, key='C_SVM')
        kernel = st.sidebar.radio("Kernel", ("rbf", "linear"), key='kernel')
        gamma = st.sidebar.radio("Gamma (Kernel Coefficient)", ("scale", "auto"), key='gamma')
        metrics = st.sidebar.multiselect("What metrics to plot?", ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))
        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Support Vector Machine (SVM) Results")
            model = SVC(C=C, kernel=kernel, gamma=gamma)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if classifier == 'Logistic Regression':
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input("C (Regularization parameter)", 0.01, 10.0, step=0.01, key='C_LR')
        max_iter = st.sidebar.slider("Maximum number of iterations", 100, 500, key='max_iter')
        metrics = st.sidebar.multiselect("What metrics to plot?", ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))
        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Logistic Regression Results")
            model = LogisticRegression(C=C, penalty='l2', max_iter=max_iter)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write(
"Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2)) st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2)) plot_metrics(metrics) if classifier == 'Random Forest': st.sidebar.subheader("Model Hyperparameters") n_estimators = st.sidebar.number_input( "The number of trees in the forest", 100, 5000, step=10, key='n_estimators') max_depth = st.sidebar.number_input("The maximum depth of the tree", 1, 20, step=1, key='n_estimators') bootstrap = st.sidebar.radio("Bootstrap samples when building trees", ('True', 'False'), key='bootstrap') metrics = st.sidebar.multiselect( "What metrics to plot?", ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve')) if st.sidebar.button("Classify", key='classify'): st.subheader("Random Forest Results") model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, bootstrap=bootstrap, n_jobs=-1) model.fit(x_train, y_train) accuracy = model.score(x_test, y_test) y_pred = model.predict(x_test) st.write("Accuracy: ", accuracy.round(2)) st.write( "Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2)) st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2)) plot_metrics(metrics) if st.sidebar.checkbox("Show raw data", False): st.subheader("Mushroom Data Set (Classification)") st.write(df) st.markdown( "This [data set](https://archive.ics.uci.edu/ml/datasets/Mushroom) includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms " "in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, " "or of unknown edibility and not recommended. This latter class was combined with the poisonous one." )
    # Hyperparameter search over all possible dimensions for PCA reduction
    # 'pca__n_components': np.arange(1, 17),
    # 'svm__gamma': np.arange(0.001, 0.1, 0.001)
}

svm_classification_pipeline = Pipeline(
    [
        # Apply PCA to SVM Classification
        # ('pca', PCA()),
        # Apply scaling to SVM Classification
        # ('scale', StandardScaler()),
        ('svm', SVC())
    ]
)

_accuracy_grid_search(values_train, hdi_class_train,
                      svm_classification_pipeline,
                      classification_svm_parameters)

# ## u)

# In[17]:

classification_svm_parameters = {
    # Use linear kernel for SVM Classification
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling (Z-score, it standardizes the data)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the classification model to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualizing the Training set results (use this to see test set results by changing the variable)
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
second_pc = pca.components_[1]
# print var, sum(var), eigenfaces.shape, ei_mean.shape, X_train_pca.shape

###############################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
# grid search finds the best C and gamma parameters to use with the rbf kernel
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced', probability=True), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator_

###############################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
y_proba = clf.predict_proba(X_test_pca)
print "done in %0.3fs" % (time() - t0)

# save the variables of the already-trained model
with open('Clasificador.pkl', 'w') as f:  # Python 3: open(..., 'wb')
X = data[:, 0:4]
Y = data[:, 4]
val_size = 0.2
scoring = "accuracy"
(X_train, X_val, Y_train, Y_val) = model_selection.train_test_split(X, Y, test_size=val_size)

models = {
    "LR": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "LDA": LinearDiscriminantAnalysis(solver='lsqr'),
    "KNN": KNeighborsClassifier(),
    "DTC": DecisionTreeClassifier(),
    "NB": GaussianNB(),
    "SVC": SVC(),
    "MLP": MLPClassifier(),
}

results = []
for name, model in models.items():
    kfold = model_selection.KFold(n_splits=10)
    cross_res = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append((name, cross_res))

for name, res in results:
    # .format must be called on the string, not on print's return value
    print("{:6} {:2.4} {:2.4}".format(name, res.mean(), res.std()))
for alp in np.linspace(0.0001, 1, alp_count):
    ridge = lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=maxIter,
                             tol=tolerance, alpha=alp)
    ridge.fit(trainXCon, trainY)
    print('ALPHA: ', alp, 'ACC: ', ridge.score(testXCon, testY))
'''

#%% 4. Construct Classifiers
# Choose model
# from sklearn import gaussian_process
# Gaussian = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
# GaussianProcessRegressor
# from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
svm = SVC(kernel='linear')
from sklearn.svm import LinearSVC
svmLinear = LinearSVC()
from sklearn import tree
cartTree = tree.DecisionTreeClassifier()
linear_square = lm.SGDClassifier(loss='squared_loss', penalty='none', max_iter=maxIter, tol=tolerance)
ridge = lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=maxIter, tol=tolerance, alpha=0.5)
# ridgel1 = lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=maxIter, tol=tolerance)
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

print "--------different model accuracy evaluation--------"
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    msg = "%s: %f (%f), accuracy score: %f" % (name, cv_results.mean(), cv_results.std(), accuracy_score(Y_validation, predictions))
    print(msg)
    ngram_range=(1, 5), use_idf=1, smooth_idf=1, sublinear_tf=1,
    stop_words='english')

# Fit TFIDF
tfv.fit(traindata)
X = tfv.transform(traindata)
X_test = tfv.transform(testdata)

# Initialize SVD
svd = TruncatedSVD()

# Initialize the standard scaler
scl = StandardScaler()

# We will use SVM here..
svm_model = SVC()

# Create the pipeline
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('svm', svm_model)])

# Create a parameter grid to search for best parameters for everything in the pipeline
param_grid = {'svd__n_components': [200, 400],
              'svm__C': [10, 12]}

# Kappa Scorer
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better=True)

# Initialize Grid Search Model
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, scoring=kappa_scorer,
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()
# features_train = features_train[:len(features_train)/100]
# labels_train = labels_train[:len(labels_train)/100]

#########################################################
### your code goes here ###

print()
print("Classifier: SVC")

# import the sklearn module for SVC
from sklearn.svm import SVC

# create classifier
clf = SVC(gamma='auto', C=10000.0, kernel='rbf')  # TODO

# fit the classifier on the training features and labels
t0 = time()
clf.fit(features_train, labels_train)  # TODO
print "training time:", round(time() - t0, 3), "s"

# predict labels for the test features
t1 = time()
pred = clf.predict(features_test)  # TODO
print "prediction time:", round(time() - t1, 3), "s"

import collections
counter = collections.Counter(pred)
print "no. of emails predicted to be Chris': " + str(counter[1])
print "no. of emails predicted to be Sara's: " + str(counter[0])
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

# Loading some example data
iris = datasets.load_iris()
X = iris.data[:, [0, 2]]
y = iris.target
print y

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=1)
clf2 = DecisionTreeClassifier(max_depth=4)
clf3 = SVC(kernel='rbf', random_state=0, gamma=1.0, C=1.0)
eclf = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)

"""
# Training classifiers
clf1 = RandomForestClassifier(max_depth=1)
clf2 = DecisionTreeClassifier(max_depth=2)
clf3 = DecisionTreeClassifier(max_depth=5)
eclf = DecisionTreeClassifier(max_depth=10)

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
X = stdata.drop('eligible', axis=1)
y = stdata['eligible']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score
print(classification_report(y_test, prediction))  # the report string is discarded unless printed
f = f1_score(y_test, prediction, average=None)
print("F1 Score ::", f[0])
import pandas as pd
dataset = pd.read_csv('tenureTime.csv').values
data = dataset[:, 0:11]
target = dataset[:, 11]

from sklearn.model_selection import train_test_split  # dataset splitting function
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.3)

# from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
algorithm = SVC(kernel='poly', degree=2)  # loading the SVM algorithm into "algorithm"
algorithm.fit(train_data, train_target)  # training
result = algorithm.predict(test_data)  # testing
# print('Actual Target:', test_target)
# print('Predicted Target:', result)

from sklearn.metrics import accuracy_score
acc = accuracy_score(test_target, result)
print('Accuracy:', acc)
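# Optional follow-up sketch (not in the original): a confusion matrix often
# says more than raw accuracy; this continues with the variables defined above.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_target, result))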
# Added
emotion_dataset = load_emotion_dataset()
emotion_dataset.features = scale_clf.transform(emotion_dataset.features)
emotion_dataset.features = emotion_dataset.features[:, emotion_dataset.features_label_list.isin(selected_features)]

#-------------------------------
# Prediction
#-------------------------------
# Model setup
# clf = load(model)
clf = SVC(decision_function_shape='ovo',
          kernel="linear",
          C=0.9545484566618342,
          probability=True,
          max_iter=-1,
          random_state=1)
# fit returns the estimator itself, so this chains into predict;
# the result is an array of predicted labels, not an accuracy value
accuracy = clf.fit(emotion_dataset.features,
                   emotion_dataset.targets).predict(selected_test)
print("accuracy: {}".format(accuracy))

digit_score = clf.predict_proba(selected_test)
print(digit_score)
np.savetxt(r"C:\Users\akito\Desktop\shibata_2.csv", digit_score, delimiter=",")

# Plotting
time_record = pd.read_excel(time_record_path, header=0, index_col=0)
end_time = (time_record.loc["Amusement", "FinishDatetime"] -
            time_record.loc["Neutral", "StartDatetime"]).total_seconds()
stress_start = (time_record.loc["Stress archimetic", "FinishDatetime"] -
(train, test) = train_test(men)
X = train.loc[:, best_columns[:-1]]
Y = train['diagnosis']
x = test.loc[:, best_columns[:-1]]
y = test['diagnosis']
knn = train_knn(X, Y, x, y)
del X, Y, x, y

X = men.loc[:, best_columns[:-1]]
Y = men['diagnosis']
skf = StratifiedKFold(Y, random_state=1, n_folds=10)
lm = LogisticRegression(random_state=1)
gnb = GaussianNB()
svc = SVC()
rfc = RandomForestClassifier(random_state=1)
classifiers = [lm, knn, gnb, svc, rfc]
plot_crossvalidated_roc(skf, X, Y, classifiers)
# lm.fit(X, Y)
# gnb.fit(X, Y)
# svc.fit(X, Y)
# rfc.fit(X, Y)
# class_report(x, y, lm)
# class_report(x, y, knn)
# class_report(x, y, gnb)
# class_report(x, y, svc)
# class_report(x, y, rfc)
# SVM
from sklearn.svm import SVC

X = fruits_df[['width', 'height']]
y = fruits_df['fruit_label'].copy()

# set every label that is not apple to 0
y[y != 1] = 0

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/5, random_state=0)

c_values = [0.001, 1, 100]  # renamed so the list is not shadowed by the loop variable
for c_value in c_values:
    # build the model
    svm_model = SVC(C=c_value)
    # train the model
    svm_model.fit(X_train, y_train)
    # evaluate the model
    y_pred = svm_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print('C={}, model accuracy: {}'.format(c_value, acc))

print('*******************************Decision Tree*******************************')
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
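# Hedged continuation sketch (the excerpt above stops right after the split;
# this is not the original code): fit the imported DecisionTreeClassifier on
# the iris split and score it. max_depth=3 is an illustrative choice.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=0)
tree_clf.fit(X_train, y_train)
print('train acc: {:.3f}, test acc: {:.3f}'.format(
    tree_clf.score(X_train, y_train), tree_clf.score(X_test, y_test)))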