def __init__(self, symb, predlen, cat='RF', kwargs=None):
    # NOTE: the original default was cat='RL', which matches no branch below
    # and would leave self.learner unset; 'RF' appears to be the intent.
    self.symb = symb
    self.predlen = predlen
    self.kwargs = kwargs
    self.cat = cat
    # Map the category string to the learner class, forwarding kwargs if given.
    learners = {'RF': RF.RandomForest, 'KNN': KNN.KNN,
                'SVM': SVM.SVM, 'NN': NN.NN}
    if cat in learners:
        if kwargs is not None:
            self.learner = learners[cat](**kwargs)
        else:
            self.learner = learners[cat]()
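# A minimal usage sketch for the constructor above; the enclosing class name
# ("Learner") and the KNN keyword {'k': 3} are hypothetical, not from the
# original source.
learner = Learner('AAPL', predlen=5, cat='KNN', kwargs={'k': 3})
print(learner.learner)  # the underlying KNN.KNN instance, ready to train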
def test(path):
    li = []
    X, y = load_data_set(path)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0)
    kernel_ = kernel.Kernel.gaussian(0.5)
    svm = SVM.SVM(kernel=kernel_, c=0.5)
    svm = svm.training(X_train, y_train)
    accuracy1 = svm.calc_accuracy(X_train, y_train)
    accuracy = svm.calc_accuracy(X_test, y_test)
    li.append((accuracy1, accuracy))
    # print("(%.3f%%, %.3f%%)" % (accuracy1 * 100, accuracy * 100))
    for i in range(1, 7):
        try:
            X_train_decom, X_test_decom = svm_decom(
                X_train, y_train, X_test, kernel_, i)
            kernel_ = kernel.Kernel.linear()
            svm = SVM.SVM(kernel=kernel_, c=0.5)
            svm = svm.training(X_train_decom, y_train)
            accuracy1 = svm.calc_accuracy(X_train_decom, y_train)
            accuracy = svm.calc_accuracy(X_test_decom, y_test)
            li.append((accuracy1, accuracy))
            # print("(%.3f%%, %.3f%%)" % (accuracy1 * 100, accuracy * 100))
        except Exception:
            # Fall back to a sentinel pair if decomposition or training fails.
            li.append((0, 0))
    return li
def mix_up():
    """Create a hybrid model that mixes the ML algorithms with a neural net.
    Feeding the raw predictions in accumulates the errors of the individual
    algorithms, which inflates the neural net's error so much that the model
    is not flexible enough to pick up the market trend or find patterns in
    the data; model2 removes that redundant error."""
    ind = 0
    for i in bar(xrange(len(x))):
        b_pred, b_y = bayes.naive_bayes_model(x[i], net=True)
        s_pred, s_y = SVM.svm_model(x[i], net=True)
        k_pred, k_y = KNN.knn_algo_model(x[i], net=True)
        print b_pred, b_y
        mix.new_net(s_pred, b_pred, k_pred, s_y, x[i])
    ind = 0
    report = pd.DataFrame(index=range(0), columns=[
        'Stock Name', 'accuracy', 'profit count', 'loss count',
        'total no of rise', 'total number of loss'
    ])
    for i in bar(xrange(len(x))):
        b_pred, b_y = bayes.naive_bayes_model(x[i], net=True, actual=True)
        s_pred, s_y = SVM.svm_model(x[i], net=True, actual=True)
        k_pred, k_y = KNN.knn_algo_model(x[i], net=True, actual=True)
        p_count, total_count_p, l_count, total_count_l, accuracy = mix.new_net(
            s_pred, b_pred, k_pred, s_y, x[i], create=False)
        report.loc[ind] = [
            x[i], accuracy, p_count, l_count, total_count_p, total_count_l
        ]
        ind = ind + 1
    print "Mean accuracy----------", report['accuracy'].mean()
    report.to_csv("./report/mix_result.csv")
def plot_convergence_synthetic(svm, lr):
    for i in range(4):
        a = data[i]
        b = Label[i]
        n, m = np.shape(a)
        a = np.vstack((a, np.ones((1, m))))  # append a bias row
        b = b.reshape(-1, 1)
        x = np.zeros((3, 1))
        a1 = a
        b1 = b
        if svm:
            x_bt, norm_bt, _, acc_bt, duration_bt, diteration_bt = SVM.BackTracking(
                a, b, x, args.l, args.d, False, a1, b1)
            with open('svm_acc_bt_' + str(i) + '.txt', mode='a+') as f_bt:
                f_bt.write(str(acc_bt) + ' ' + str(duration_bt) + ' ' +
                           str(diteration_bt))
            x_AGM, norm_AGM, _, acc_AGM, duration_AGM, diteration_AGM = SVM.AGM(
                a, b, x, args.l, args.d, False, a1, b1)
            with open('svm_acc_AGM_' + str(i) + '.txt', mode='a+') as f_AGM:
                f_AGM.write(str(acc_AGM) + ' ' + str(duration_AGM) + ' ' +
                            str(diteration_AGM))
            x_BFGS, norm_BFGS, _, acc_BFGS, duration_BFGS, diteration_BFGS = SVM.G_BFGS(
                a, b, x, args.l, args.d, False, a1, b1)
            with open('svm_acc_BFGS_' + str(i) + '.txt', mode='a+') as f_BFGS:
                f_BFGS.write(str(acc_BFGS) + ' ' + str(duration_BFGS) + ' ' +
                             str(diteration_BFGS))
            Draw.gradient_plot(len(norm_bt), norm_bt, 'backtracking')
            Draw.gradient_plot(len(norm_AGM), norm_AGM, 'AGM')
            Draw.gradient_plot(len(norm_BFGS), norm_BFGS, 'BFGS')
            plt.savefig('SVM_convergence_' + str(i))
            # plt.show()
            plt.close()
        if lr:
            x_AGM, norm_AGM, _, acc_AGM, duration_AGM, diteration_AGM = LR.AGM(
                a, b, x, args.l, False, a1, b1)
            with open('lr_acc_AGM_' + str(i) + '.txt', mode='a+') as f_AGM:
                f_AGM.write(str(acc_AGM) + ' ' + str(duration_AGM) + ' ' +
                            str(diteration_AGM))
            x_L_BFGS, norm_L_BFGS, _, acc_L_BFGS, duration_L_BFGS, diteration_L_BFGS = LR.L_BFGS(
                a, b, x, args.m, args.l, False, a1, b1)
            with open('lr_acc_LBFGS_' + str(i) + '.txt', mode='a+') as f_L_BFGS:
                f_L_BFGS.write(str(acc_L_BFGS) + ' ' + str(duration_L_BFGS) + ' ' +
                               str(diteration_L_BFGS))
            Draw.gradient_plot(len(norm_AGM), norm_AGM, 'AGM')
            Draw.gradient_plot(len(norm_L_BFGS), norm_L_BFGS, 'L_BFGS')
            plt.savefig('lr_convergence_' + str(i))
            # plt.show()
            plt.close()
def start_training(X, y):
    '''
    Alternative classifiers, kept for reference:
    LogisticRegression.logistic_regression_classifier(X, y)
    DecisionTree.decision_tree_classifier(X, y)
    KNN.knn_classifier(X, y)
    '''
    SVM.svm_classifier(X, y)
def main():
    init()
    for i in range(10):
        data_quantifier.split_categorical_data(0.4)
        transformer.transform_secondary_structure()
        Verifier.verify_validity("conversion")
        data_quantifier.quantify_data()
        SVM.create_and_store_svm()
        Prediction.predict_and_test()
def test(data):
    import numpy as np
    import feature_extraction
    import SVM

    data = np.ravel(data)
    features = feature_extraction.mfcc_convert(np, data)
    # print(features[:5])
    svc = SVM.control_information()
    print("Predicted as :", SVM.test(svc, features))
def test():
    with closing(connect_db()) as db:
        svm = SVM(C=1)
        fill_negative_votes()
        X, Y = get_feature_vecs(db)
        # Divide up X into S chunks; S == N makes this leave-one-out.
        N = len(X)
        S = N
        # Cross validation, to get the average number misclassified
        count = 0
        total_incorrect = 0
        for s in range(S):
            print "iter:", count
            size_of_fold = int(math.ceil(1.0 * N / S))  # cast so slicing gets ints
            start = s * size_of_fold
            end = start + size_of_fold
            if end > N:
                print "end > N"
                end = N
            print "range:", start, end
            holdoutX = X[start:end, :]
            trainingX = np.concatenate((X[0:start], X[end:N]))
            holdoutY = Y[start:end]
            trainingY = np.concatenate((Y[0:start], Y[end:N]))
            print "len holdoutX:", len(holdoutX)
            print "len trainingX:", len(trainingX)
            print "len holdoutY:", len(holdoutY)
            print "len trainingY:", len(trainingY)
            svm.train_dual(trainingX, trainingY)
            num_misclass = svm.num_incorrect(holdoutX, holdoutY)
            total_incorrect += num_misclass
            print "Num incorrect:", num_misclass
            count += 1
        print "Total misclassified with SVM:", 1.0 * total_incorrect

        # Now use keyword classification
        votes = get_all_votes()
        num_incorrect = 0
        for vote, event, doc in votes:
            classify = keyword_classify(event, doc)
            if classify != vote:
                num_incorrect += 1
            print "Classified as:", classify, "actual:", vote
        print "Total misclassified with keyword approach:", num_incorrect
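# Since S == N above, each fold holds out exactly one sample, i.e. the loop is
# leave-one-out cross-validation. A minimal sketch of the same split logic
# using scikit-learn's LeaveOneOut (an assumption -- the original rolls its
# own index arithmetic):
import numpy as np
from sklearn.model_selection import LeaveOneOut

X = np.arange(10).reshape(5, 2)
Y = np.array([0, 1, 0, 1, 0])
for train_idx, test_idx in LeaveOneOut().split(X):
    trainingX, holdoutX = X[train_idx], X[test_idx]
    trainingY, holdoutY = Y[train_idx], Y[test_idx]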
def task_2_results():
    kf = KFold(n_folds=k)
    accuracies = []
    for train_index, test_index in kf.split(y):
        X_train = X_manual[train_index]
        X_test = X_manual[test_index]
        y_train = y[train_index]
        y_test = y[test_index]
        SVM.learn_svm(X_train, y_train, prefix + "task_2_model")
        accuracies.append(SVM.test_svm_accuracy(X_test, y_test,
                                                prefix + "task_2_model"))
    print "Accuracy for task_2:", np.mean(accuracies), "+-", np.std(accuracies)
    return accuracies
def test_model():
    start_time = time.time()

    # Read all data
    (emails, y) = read_email_data('txt_lists/E-MAILS.txt',
                                  'txt_lists/LABELS.txt')
    (test_emails, test_labels) = read_email_data('txt_lists/TESTS.txt',
                                                 'txt_lists/TEST_LABELS.txt')
    websites = read_website_list('txt_lists/websites.txt')

    # get the website about which each email is intended
    intended_websites = find_intended_websites(websites, emails)

    # a dictionary with the actual class names
    class_dict = create_class_dict()

    # check for easy typos
    spell_chck = SpellChecker('en')
    [spell_chck.add(word) for word in open('websites.txt').read().split('\n')]
    emails = [detect_and_correct_typos(spell_chck, mail) for mail in emails]
    test_emails = [
        detect_and_correct_typos(spell_chck, mail) for mail in test_emails
    ]

    # extract the needed features from the datasets
    feature_extractor = extract_features.FeatureExtractor()
    # if we loaded it from disk, don't retrain it; read the trained model data
    if not feature_extractor.has_vocab:
        (train_features, ) = feature_extractor.extract_email_train_features(emails)
    else:
        # feature_names = feature_extractor.vect.get_feature_names()
        train_features = feature_extractor.vect.training_data_features

    # extract the test data
    test_features = feature_extractor.extract_email_test_features(test_emails)

    # classify
    (clf, already_trained) = SVM.get_classifier()
    if not already_trained:
        SVM.train_clf(clf, train_features, y)
    train_guesses = SVM.classify(clf, train_features)
    test_guesses = SVM.classify(clf, test_features)
    print('training performance: ',
          SVM.eval_performance(train_guesses, y, class_dict))
    print('test performance: ',
          SVM.eval_performance(test_guesses, test_labels, class_dict))
    feature_extractor.save_vectorizer()
    SVM.save_classifier_to_disk(clf)
    print("--- %s seconds ---" % (time.time() - start_time))
def predicting_using_SVM(train_mat_with, train_mat_without, train_por_with,
                         train_por_without, test_mat_with, test_mat_without,
                         test_por_with, test_por_without, kernel='Linear',
                         c=1.0):
    # Divide into labels and sets
    label_mat_with, train_mat_with_t = extract_label(train_mat_with)
    label_mat_without, train_mat_without_t = extract_label(train_mat_without)
    label_por_with, train_por_with_t = extract_label(train_por_with)
    label_por_without, train_por_without_t = extract_label(train_por_without)

    # To change the kernel function, pass 'Quadratic' or 'Gaussian' instead
    predictlabel_mat_with = SVM.svm_solver(train_mat_with_t, label_mat_with,
                                           test_mat_with, c, kernel=kernel)
    predictlabel_mat_without = SVM.svm_solver(train_mat_without_t,
                                              label_mat_without,
                                              test_mat_without, c, kernel=kernel)
    predictlabel_por_with = SVM.svm_solver(train_por_with_t, label_por_with,
                                           test_por_with, c, kernel=kernel)
    predictlabel_por_without = SVM.svm_solver(train_por_without_t,
                                              label_por_without,
                                              test_por_without, c, kernel=kernel)

    # get measurements
    f_score1, accuracy1 = SVM.calculate_measurements(predictlabel_mat_without,
                                                     test_mat_without)
    f_score2, accuracy2 = SVM.calculate_measurements(predictlabel_mat_with,
                                                     test_mat_with)
    f_score3, accuracy3 = SVM.calculate_measurements(predictlabel_por_without,
                                                     test_por_without)
    f_score4, accuracy4 = SVM.calculate_measurements(predictlabel_por_with,
                                                     test_por_with)

    # show results
    print('Kernel function type: ' + kernel)
    print('SVM Mat Without G1, G2: Accuracy: ' + str(accuracy1) +
          ' f_score: ' + str(f_score1))
    print('SVM Mat With G1, G2: Accuracy: ' + str(accuracy2) +
          ' f_score: ' + str(f_score2))
    print('SVM Por Without G1, G2: Accuracy: ' + str(accuracy3) +
          ' f_score: ' + str(f_score3))
    print('SVM Por With G1, G2: Accuracy: ' + str(accuracy4) +
          ' f_score: ' + str(f_score4))
    return 0
def fit_svm(self):
    from numpy import linalg

    def linear_kernel(x1, x2):
        return np.dot(x1, x2)

    def polynomial_kernel(x, y, p=3):
        return (1 + np.dot(x, y)) ** p

    def gaussian_kernel(x, y, sigma=5.0):
        return np.exp(-linalg.norm(x - y) ** 2 / (2 * (sigma ** 2)))

    pos_samples = self._posdata.drop("class", axis=1)
    pos_labels = self._posdata.loc[:, "class"]
    nag_samples = self._nagdata.drop("class", axis=1)
    nag_labels = self._nagdata.loc[:, "class"]
    pos_array = pos_samples.values
    pos_arraylabels = np.ones((len(pos_samples), 1))
    nag_array = nag_samples.values
    nag_arraylabels = np.ones((len(nag_samples), 1)) * -1
    # clf = SVM.SVM(gaussian_kernel)
    # clf = SVM.SVM(C=1)
    clf = SVM.SVM()
    x_train = np.vstack((pos_array, nag_array))
    y_train = np.vstack((pos_arraylabels, nag_arraylabels))
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_train)
    correct = np.sum(y_predict == y_train)
    acc = correct / len(self._frequency)
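# The three nested kernels above are the standard choices: linear
# k(x, y) = x.y, polynomial k(x, y) = (1 + x.y)^p, and Gaussian/RBF
# k(x, y) = exp(-||x - y||^2 / (2 sigma^2)). A minimal, self-contained sketch
# of building the Gram (kernel) matrix with the RBF kernel, which is what a
# kernel SVM consumes internally:
import numpy as np

def rbf(x, y, sigma=5.0):
    return np.exp(-np.linalg.norm(x - y) ** 2 / (2 * sigma ** 2))

pts = np.random.rand(4, 2)
gram = np.array([[rbf(a, b) for b in pts] for a in pts])
# gram is 4x4, symmetric, with ones on the diagonal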
def runAll(rawX, rawY, rawXTesting, rawYTesting):
    print "\n\nMultinomial NB\n\n"
    nb = multinomialNB(rawX, rawY, rawXTesting, rawYTesting)
    print "\n\nSVM\n\n"
    svm = support.supportFunction(rawX, rawY, rawXTesting, rawYTesting)
    km = kmeans.kmeansFunction(rawX, rawY, rawXTesting, rawYTesting)
    return [nb, svm, km]
def showVocabDistribution(filename, n_stopword):
    counter = 0
    f = open('data/stopDict_' + filename + '.json', 'r')
    stopDict = json.load(f)
    f.close()
    n_vocab = SVM.getDictSize("VocabDict", filename)
    sum_stopword = 0
    sum_vocab = 0
    sum_other = 0
    for key, value in sorted(stopDict.items(),
                             key=lambda x: int(x[1]), reverse=True):
        if counter < n_stopword:
            sum_stopword += int(value)
        elif counter < n_vocab + n_stopword:
            sum_vocab += int(value)
        else:
            sum_other += int(value)
        counter += 1
    print sum_stopword
    print sum_vocab
    print sum_other
    total = float(sum_stopword + sum_vocab + sum_other)
    print "stopword :", float(sum_stopword) / total * 100, "%"
    print "vocab :", float(sum_vocab) / total * 100, "%"
    print "other :", float(sum_other) / total * 100, "%"
def s(x):
    log1, log2 = logistic_regression.predict(x)
    svm1, svm2 = SVM.predict(x)
    nb1, nb2 = NaiveBayes.predict(x)
    # Stack the per-model outputs column-wise into one meta-feature matrix.
    X = np.concatenate((log1.reshape(len(log1), 1), log2.reshape(len(log2), 1),
                        svm1.reshape(len(svm1), 1), svm2.reshape(len(svm2), 1),
                        nb1.reshape(len(nb1), 1), nb2.reshape(len(nb2), 1)),
                       axis=1)
    prediction = model.predict(X)
    return prediction
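# s(x) is a stacked ensemble: the logistic-regression, SVM, and naive-Bayes
# outputs become six meta-features for a second-level model. A minimal sketch
# of fitting such a meta-model; the use of sklearn's LogisticRegression and
# the stand-in arrays are assumptions (the original's `model` is defined
# elsewhere):
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
base_outputs = [rng.random(100) for _ in range(6)]  # stand-ins for log1..nb2
meta_X = np.column_stack(base_outputs)
y_true = rng.integers(0, 2, 100)                    # stand-in labels
model = LogisticRegression().fit(meta_X, y_true)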
def question2():
    sums = [0, 0, 0, 0]  # accumulated accuracy per classifier
    for i in range(5):
        for j in range(i + 1, 5):
            print("i: ", i, "j: ", j)
            # Keep only the two ethnicity classes i and j.
            S = sp
            for k in range(5):
                if k != i and k != j:
                    S = S[S['ethnicity'] != k]
            # print(S)
            X = S[['gender', 'education', 'lunch', 'test_preparation_course',
                   'math', 'reading', 'writing']]
            X = MinMaxScaler().fit_transform(X)
            y = S['ethnicity']
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.5, random_state=0)
            kernel_mode = 'linear'
            log_pred = logistic_regression.run_log(X_train, X_test, y_train)
            ad_pred = adaboost_test.run_adaboost(X_train, X_test, y_train)
            svm_pred = SVM.run_svm(X_train, X_test, y_train, kernel_mode)
            knn_pred = KNN.run_KNN(X_train, X_test, y_train)
            sums[0] += metrics.accuracy_score(y_test, log_pred)
            sums[1] += metrics.accuracy_score(y_test, ad_pred)
            sums[2] += metrics.accuracy_score(y_test, svm_pred)
            sums[3] += metrics.accuracy_score(y_test, knn_pred)
    # Average over the 10 class pairs.
    for i in range(4):
        sums[i] = sums[i] / 10
    print(sums)
def get_predict(svm, X):
    m = X.shape[0]
    predict = []
    for i in range(m):
        pre_y = SVM.svm_predict(svm, X[i, :])
        predict.append(sign(pre_y[0, 0]))
    return mat(predict).T
def ClassificationModels():
    classificationPreprocessingVar = CPre.Preprocessing(
        'tmdb_5000_movies_classification.csv', 'tmdb_5000_credits.csv')
    classificationPreprocessingVar.reformat()
    # classificationPreprocessingVar.deleteMissigData()
    # classificationPreprocessingVar.meanNormalization()
    X_train, y_train, X_test, y_test = classificationPreprocessingVar.GetData()

    obj1 = LR.Logistic_Regression(X_train, y_train, X_test, y_test)
    obj1.FitModel()
    obj1.TrainAndTestModel()

    obj2 = svm.SVM(X_train, y_train)
    obj2.FitModel()
    obj2.TestModel(X_test, y_test)

    obj3 = knn.Knn_Classifier(X_train, y_train, 5)
    obj3.FitModel()
    obj3.TestModel(X_test, y_test)

    obj5 = dt.DecisionTreeClassifier(X_train, y_train)
    obj5.FitModel()
    obj5.TestModel(X_test, y_test)

    obj6 = rf.RandomForestClassifier(X_train, y_train)
    obj6.FitModel()
    obj6.TestModel(X_test, y_test)

    obj7 = ad.AdaBoostClassifier(X_train, y_train)
    obj7.FitModel()
    obj7.TestModel(X_test, y_test)
def DeveloperTools():
    global dev
    dev = Tk()
    dev.title("Developer Tools")
    root.withdraw()
    dev.geometry("500x200")  # set window size
    tk.Button(dev, text='Train Binomial Model',
              command=VerifyBinomial).pack(fill=tk.X)
    tk.Button(dev, text='Train Multinomial Model',
              command=VerifyMultinomial).pack(fill=tk.X)
    tk.Button(dev, text='Score Predictions Using Pipeline',
              command=lambda: SVM.MakePredictions(True)).pack(fill=tk.X)
    tk.Button(dev, text='Create Vectorizer Model',
              command=VerifyVectorizer).pack(fill=tk.X)
    tk.Button(dev, text='Create K-Means Clusters',
              command=KMeansCluster).pack(fill=tk.X)
    # go back to the main menu and close developer tools
    tk.Button(dev, text='Return to Previous Menu',
              command=lambda: [dev.withdraw(), SelectAction()]).pack(fill=tk.X)
    dev.mainloop()
def gridSearch_Gaussian(trainData, trainLabel, testData, testLabel):
    '''
    Grid search over the Gaussian-kernel hyperparameters.
    ==============
    '''
    C = [np.power(2.0, i) for i in range(-5, 16, 2)]
    Sigma = [np.power(2.0, i) for i in range(-3, 8)]
    Epsilon = [np.power(10.0, i) for i in range(-6, 0)]
    subRange = 100  # train on a subset to keep the search tractable
    maximumArguments = (0, 0, 0)
    maximumF1Score = 0
    for c, sigma, epsilon in [(c, sigma, epsilon) for c in C
                              for sigma in Sigma for epsilon in Epsilon]:
        print(c, sigma, epsilon)
        predictLabel = SVM.predict(trainData[:subRange], trainLabel[:subRange],
                                   testData, C=c, sigma=sigma, epsilon=epsilon)
        f1Score = modelTest(testLabel, predictLabel)
        if f1Score > maximumF1Score:
            maximumF1Score = f1Score
            maximumArguments = (c, sigma, epsilon)
    print(maximumF1Score, maximumArguments)
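# The triple comprehension above enumerates the full Cartesian product of the
# three grids; itertools.product expresses the same loop more directly. A
# minimal equivalent sketch:
from itertools import product

C = [2.0 ** i for i in range(-5, 16, 2)]
Sigma = [2.0 ** i for i in range(-3, 8)]
Epsilon = [10.0 ** i for i in range(-6, 0)]
for c, sigma, epsilon in product(C, Sigma, Epsilon):
    pass  # evaluate one (c, sigma, epsilon) setting here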
def run(self):
    if self.algorithm == 'logistic':
        import logistic
        self.model = logistic.run()
        self.name = 'LogisticRegression'
    elif self.algorithm == 'randomforest':
        import randomforest
        self.model = randomforest.run()
        self.name = 'RandomForest'
    elif self.algorithm == 'xgboost':
        import XGboost
        self.model = XGboost.run()
        self.name = 'xgboost'
    elif self.algorithm == 'SVM':
        import SVM
        self.model = SVM.run()
        self.x_test_scaled = np.load('x_test_scaled.npy')
        self.name = 'SVM'
    else:
        print('Invalid algorithm argument')
    if self.algorithm == 'SVM':
        # The SVM exposes a decision function rather than calibrated
        # probabilities, and uses the scaled test set.
        self.y_pred_prob = self.model.decision_function(self.x_test_scaled)
        self.y_pred = self.model.predict(self.x_test_scaled)
    else:
        self.y_pred_prob = self.model.predict_proba(self.x_test)[:, 1]
        self.y_pred = self.model.predict(self.x_test)
def test_model2():
    """A sort of noise is introduced to remove the redundant error from all
    the ML algorithms; this makes the model more flexible and in turn
    increases the accuracy."""
    ind = 0
    pred = []
    report = pd.DataFrame(index=range(0), columns=[
        'Stock Name', 'accuracy', 'profit count', 'loss count',
        'total no of rise', 'total number of loss'
    ])
    for i in bar(xrange(len(x))):
        try:
            k_pred, k_y = KNN.knn_algo_model(x[i], net=True)
            b_pred, b_y = bayes.naive_bayes_model(x[i], net=True)
            s_pred, s_y = SVM.svm_model(x[i], net=True)
            p_count, total_count_p, l_count, total_count_l, accuracy = mix.model1(
                s_pred, b_pred, k_pred, s_y, x[i])
            report.loc[ind] = [
                x[i], accuracy, p_count, l_count, total_count_p, total_count_l
            ]
            ind = ind + 1
        except Exception:
            print "!!!!!############ error"
    print "Mean accuracy----------", report['accuracy'].mean()
    report.to_csv("./report/mix_model2_result.csv")
def __init__(self, norm_type="Normalization", iterations=5,
             base_classifier="SVM"):
    self.iterations = iterations
    self.norm_type = norm_type
    self.base_classifier = SVM.SVMClassifier()
    self.prediction = None
    self.probability = None
    self.classifier_set = None
def plotSVM(X, Y, SV_X, SV_Y, SV_a, b_bound, sig):
    C1 = np.where(Y == -1)[0]
    C2 = np.where(Y == 1)[0]
    C = (C1, C2)
    colors = ("red", "green")
    for c, color in zip(C, colors):
        plt.scatter(X[c, 0], X[c, 1], alpha=1.0, c=color)
    step = 0.025
    x_axis = np.arange(0.0, 1.0 + step, step)
    y_axis = np.arange(0.0, 1.0 + step, step)
    X_mesh, Y_mesh = np.meshgrid(x_axis, y_axis)
    Z_mesh = np.zeros(X_mesh.shape)
    # Evaluate the SVM decision function on every grid point.
    for x in range(X_mesh.shape[0]):
        for y in range(Y_mesh.shape[0]):
            point = np.array([X_mesh[x, y], Y_mesh[x, y]], ndmin=2)
            Z_mesh[x, y] = SVM.evalSVM(point, SV_X, SV_Y, SV_a, b_bound, sig)
    # contour(X, Y, Z, V) draws contour lines at the values in sequence V,
    # which must be increasing; (0.0) is not a sequence, so pass [0.0] to
    # draw the decision boundary at level 0.
    plt.contour(x_axis, y_axis, Z_mesh, [0.0])
    plt.show()
def main(args):
    training_set = utilis.readExamples(args[1])
    training_labels_set = utilis.readLabelSet(args[2])
    init_weights = np.zeros((3, training_set.shape[1]))
    test_set = utilis.readExamples(args[3])
    norm_tra_set, norm_tes_set = norm.Min_Max_normalization(
        training_set, test_set, MAX_RANGE, MIN_RANGE)
    seed = utilis.generate_Seed(SEED_RANGE, BIAS)
    perc_obj = Perceptron.Perceptron(PERC_EPOCHS, PERC_RATE, norm_tra_set,
                                     training_labels_set, seed)
    perceptron_weights = perc_obj.training(None, None, init_weights)
    pa_obj = PA.PA(PA_EPOCHS, norm_tra_set, training_labels_set, seed)
    pa_weights = pa_obj.training(None, None, init_weights)
    norm_tra_set, norm_tes_set = norm.Zscore_normalization(
        training_set, test_set)
    svm_obj = SVM.SVM(SVM_EPOCHS, SVM_LAMBDA, SVM_RATE, norm_tra_set,
                      training_labels_set, seed)
    svm_weights = svm_obj.training(None, None, init_weights)
    utilis.printThePredictions(perceptron_weights, svm_weights, pa_weights,
                               test_set)
def softsvm(traindata, trainlabel, testdata, testlabel, sigma, C):
    if sigma == 0:
        kernel_type = 'linear'
    else:
        kernel_type = 'quadratic'
    model = SVM.SVM(kernel_type, C, sigma)
    model.fit(traindata, trainlabel)
    result = model.predict(testdata)
    # print(result)
    ypred = []
    np2p = 0  # true positives: positives predicted positive
    nn2p = 0  # false positives: negatives predicted positive
    for i in range(600):
        if result[i] == 1:
            ypred.append(1)
            if testlabel[i] == 1:
                np2p += 1
            else:
                nn2p += 1
        else:
            ypred.append(0)
    # print(nn2p, np2p)
    SP = np2p / (np2p + nn2p)    # precision
    SR = np2p / 100              # recall (100 positives in the test set)
    F = SP * SR * 2 / (SP + SR)  # F1 score
    print(ypred, SP, SR, F)
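# SP, SR and F above are precision, recall and the F1 score,
# F1 = 2 * P * R / (P + R). A minimal cross-check with scikit-learn
# (an assumption -- the original computes the counts by hand):
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [1, 0, 1, 1, 0, 0]
y_pred = [1, 1, 1, 0, 0, 0]
print(precision_score(y_true, y_pred),
      recall_score(y_true, y_pred),
      f1_score(y_true, y_pred))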
def GetAnswer(self, event):
    # The list used to collect the string-similarity ratios.
    ratio = []
    # Get the classifier.
    classifier = SVM.DoSVM()
    # Get the vectorizer.
    vectorizer = KeywordProcessor.DataTransformer()[1]
    # Get the PCA transformer.
    pcaTransform = PCADataProcessor.PCADataProcessor()[1]
    # Get the user's question.
    testData = self.text_user.GetValue()
    # Used to store the user's question.
    example = []
    # Segment the user's question.
    seg_list = jieba.cut_for_search(testData)
    example.append(" ".join(seg_list))
    # Transform the user's question into a matrix.
    transDataExample = vectorizer.transform(example)
    dataMatrixExample = transDataExample.toarray()
    # Compress the matrix.
    newExample = pcaTransform.transform(dataMatrixExample)
    # Get the predicted cluster.
    result = classifier.predict(newExample)
    # Get the questions and answers from the corresponding cluster.
    Questions = sheetForQuestion.col_values(result[0])
    Answers = sheetForAnswer.col_values(result[0])
    # Use string similarity against each stored question to pick the answer.
    for item in Questions:
        ratio.append(
            difflib.SequenceMatcher(None, testData, item).quick_ratio())
    # Output the answer.
    wx.MessageBox(Answers[ratio.index(max(ratio))])
def callTrainSelectionMethods(self):
    if (len(self.train_Algorithms.curselection()) != 0
            and len(self.trainSet.curselection()) != 0):
        alg = self.selectTrainAlg()
        trainSet = self.selectTrainer()
        trainPath = ""
        if trainSet == "IMDB Movie set(25000)":
            trainPath = "Datasets/aclImdb/train"
        elif trainSet == "IMDB Movie subset(750)":
            trainPath = "Datasets/debugSets/train"
        elif trainSet == "Custom Set 1":
            trainPath = "Datasets/custom1/train"
        elif trainSet == "Custom Set 2":
            trainPath = "Datasets/custom2/train"
        elif trainSet == "Custom Set 3":
            trainPath = "Datasets/custom3/train"
        if alg == "Naive Bayes":
            nb = naiveBayes.NaiveBayes()
            nb.train(trainPath)
            print("done training naive bayes")
        elif alg == "Support Vector Machine":
            svm = SVM.SVM()
            svm.trainSVM(trainPath)
            print("done training support vector machine")
    else:
        print("Please select an algorithm and a training set.")
def PolynomialSVMTest(pca_option):
    import SVM
    SVM.SVMSimulation(SVM.svm_poly, processing.linear_pca,
                      processing.overall_training_data, pca_option)
    processing.final_validation = np.array(processing.final_validation)
    FV_features, FV_labels = processing.createFeatures_Labels(
        processing.final_validation)
    FV_features_data, FV_labels_data = processing.convertToDataFrame(
        FV_features, FV_labels, processing.column_titles)
    global SVM_POLY_final_predictions
    # Apply the PCA transform only when requested; both branches of the
    # original were otherwise identical, so they are merged here.
    if pca_option == 'yes' or pca_option == 'both':
        FV_input = processing.linear_pca.transform(FV_features_data)
    else:
        FV_input = FV_features_data
    final_predictions = SVM.svm_poly.predict(FV_input)
    # The original assigned SVM_GAUS_final_predictions here, which never
    # matched the declared global; SVM_POLY_final_predictions is the intent.
    SVM_POLY_final_predictions = final_predictions
    accuracy = metrics.accuracy_score(final_predictions, FV_labels)
    precision = metrics.precision_score(FV_labels, final_predictions,
                                        average='micro')
    recall = metrics.recall_score(FV_labels, final_predictions,
                                  average='micro')
    print('POLYNOMIAL SVM MODEL FINAL TEST DATA ACCURACY: ', 100 * accuracy)
    print('POLYNOMIAL SVM MODEL FINAL TEST DATA PRECISION: ', 100 * precision)
    print('POLYNOMIAL SVM MODEL FINAL TEST DATA RECALL: ', 100 * recall)
    print()
    return accuracy, precision, recall
def grid_search(label, X, y, svm_params, methods, train_size=0.75,
                graph=False):
    """
    Grid search over kernel methods and SVM regularisation parameters,
    evaluated on a single train/validation split.

    Parameters:
    - label: int (0, 1 or 2), label of the set of data
    - X: array, observations
    - y: array, labels
    - svm_params: array, candidate values of the SVM parameter C
    - methods: list of str, kernel methods to try (resolved by
      kernels.select_method)
    - train_size: float (between 0 and 1), proportion of data for the train part
    - graph: bool, plot the evolution of the accuracy wrt log(svm_params) or not

    Returns the best SVM classifier.
    """
    Xtr, ytr, Xte, yte = train_test_split(X, y, train_size)
    best_score = 0
    best_clf = None
    for method in methods:
        kernel, kernel_param = kernels.select_method(method)
        print()
        scores = []
        for c in svm_params:
            print('Parameters : ' + str([method, c]))
            gram_file = ("../gram_matrix/gramMat_" + str(label) + "_" +
                         method + ".p")
            clf = SVM.SupportVectorMachine(kernel=kernel, C=c,
                                           kernel_params=kernel_param)
            clf.fit(Xtr, ytr, gram_file)
            score = accuracy(clf.predict(Xte), yte)
            if score > best_score:
                best_score = score
                best_clf = clf
            print("Accuracy score = " + str(score) + '\n')
            scores.append(score)
        if graph:
            plt.plot(np.log10(svm_params), scores,
                     label='kernel_param = ' + str(kernel_param))
    if graph:
        plt.title('Evolution of the accuracy wrt log(C)')
        plt.legend()
        plt.savefig('../res/cross_val' + str(label) + '.png')
        plt.show()
    return best_clf
def driver(classifier):
    print(getTitle(classifier))
    if classifier == 4:
        trainX, trainY, testX, testY = data_handler.splitData2TestTrain(
            'ATNTFaceImages400.txt', 10, '1:10')
        print("\nAverage Accuracy for 5 folds: %s" %
              SVM.cross_validate(trainX, trainY, testX, testY))
    else:
        data, indexes = data_handler.get_data("ATNTFaceImages400.txt")
        print("\nAverage Accuracy for 5 folds: %s" %
              cross_validator(5, data, indexes, classifier))
def svm_predict(company: str, verbose=False, train_size=0.80, scaled=False):
    X_train, X_test, y_train, y_test, prices, times = get_features(
        company, train_size=train_size, scaled=scaled)
    true_labels, SVM_predictions = SVM.predict(X_train, y_train,
                                               X_test, y_test)
    accuracy = accuracy_score(true_labels, SVM_predictions)
    if verbose:
        print("SVM Accuracy: " + str(accuracy * 100) + "%")
        prediction_distribution(SVM_predictions, true_labels)
    return prices, times, SVM_predictions, accuracy
def Part2(createData):
    # optical flow and motion direction histogram calculation
    v = 100
    # mdh_all = OM.createMotionDirectionHistograms('Oberstdorf16-shots.csv', 'videos/oberstdorf16.mp4', v, False, True)
    # FileIO.save_histograms_to_file('mdh_16_' + str(v) + '.csv', mdh_all)
    if createData:
        SVM.save_shot_images('videos/oberstdorf16.mp4', SVM.SSI_CENTER,
                             'Oberstdorf16-shots.csv', False)
    # svm training and predicting
    mdh_training = FileIO.read_histograms_from_file('mdh_8_' + str(v) + '.csv')
    mdh_test = FileIO.read_histograms_from_file('mdh_16_' + str(v) + '.csv')
    predicted_labels = SVM.svm_use(mdh_training, mdh_test)
    stitched_shots, all_shots, outstitched_shots = SVM.get_results(
        predicted_labels, 'Oberstdorf16-shots.csv', True)
    return stitched_shots, all_shots, outstitched_shots
def task_3_results():
    kf = KFold(n_folds=k)
    num_manual_features = len(X_manual[0])
    accuracies = []
    for i in range(num_manual_features):
        accuracies.append([])
    X_manual_np = np.asarray(X_manual)
    # Train one SVM per manual feature, predicting that feature from content.
    for train_index, test_index in kf.split(X_content):
        X_train = X_content[train_index]
        X_test = X_content[test_index]
        for i in range(num_manual_features):
            y_train = X_manual_np[train_index, i]
            y_test = X_manual_np[test_index, i]
            SVM.learn_svm(X_train, list(y_train), prefix + "task_3_model")
            accuracies[i].append(SVM.test_svm_accuracy(
                X_test, list(y_test), prefix + "task_3_model"))
    for i in range(num_manual_features):
        print "Accuracy for task_3 (" + str(names[i]) + "):", \
            np.mean(accuracies[i]), "+-", np.std(accuracies[i])
    return accuracies
def Part1():
    # values = [100, 200, 500, 1000]
    values = [100]
    for v in values:
        print "Points: " + str(v)
        # optical flow and motion direction histogram calculation
        # mdh_all = OM.createMotionDirectionHistograms('GroundTruth.csv', 'videos/oberstdorf08small.mp4', v, False, False)
        # FileIO.save_histograms_to_file('mdh_8_' + str(v) + '.csv', mdh_all)
        # print "Histograms created."
        # svm training and predicting
        mdh_compl = FileIO.read_histograms_from_file('mdh_8_' + str(v) + '.csv')
        accuracy, ITERATIONS, NF = SVM.svm_accuracy(mdh_compl)
        print "average accuracy: " + str(accuracy / ITERATIONS / NF)
def testSVM():
    ## loading data
    log = "Step 1: loading data..."
    writeLog(log)
    print log
    test_x, test_y = loadDigitTestData()
    # scale the features from -1 to 1
    test_x = test_x / 255.0 * 2 - 1
    # initialize the vote matrix for the testing data, Votes[m, 10]
    m, dump = shape(test_y)
    Votes = mat(zeros((m, 10)))

    ## testing data
    log = "Step 2: testing data..."
    for i in range(10):
        for j in range(i + 1, 10):
            log = "--working on model: " + str(i) + '&' + str(j)
            print log
            writeLog(log)
            # load the pairwise model for digits i and j
            d = shelve.open('./models/svm_' + str(i) + '_' + str(j))
            svmClassifier = d['svm']
            d.close()
            # test using the given model and collect the votes
            Votes_k, Votes_l = SVM.testDigitScores(svmClassifier, test_x, m)
            # write to the Votes matrix
            Votes[:, i] += Votes_k
            Votes[:, j] += Votes_l

    ## saving the Votes matrix
    log = "Step 3: saving votes..."
    print log
    writeLog(log)
    d = shelve.open('./models/Votes_Score_noscale')
    d['vote'] = Votes
    d.close()
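# testSVM() accumulates one-vs-one votes but stops at saving the matrix; the
# final digit prediction is the column with the most votes in each row. A
# minimal sketch of that last step (an assumption -- the original defers it
# to a later stage):
import numpy as np

Votes = np.random.rand(5, 10)           # stand-in for the saved vote matrix
predictions = np.argmax(Votes, axis=1)  # predicted digit for each test sample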
def run_with_svm(image_filename="../Wheat_Images/004.jpg", ser_filename=None):
    '''
    Estimates the number of grains in a given image using a Support Vector
    Machine.

    Args:
        image_filename: The path to the image from which a grain count is to
            be obtained.
        ser_filename: Path to a serialized list of sub-images already
            extracted from the image from which a grain count is to be
            obtained.

    Returns:
        count: An estimate of the number of grains in the provided image.
    '''
    global img_data
    # Chop the image up into sub-images and serialize them, or just load the
    # serialized data if it already exists.
    if ser_filename is None and image_filename == "../Wheat_Images/004.jpg":
        ser_filename = "../Wheat_Images/xxx_004.data"
    if Helper.unserialize(ser_filename) is None:
        img = img_as_ubyte(io.imread(image_filename))
        roi_img = spectral_roi.extract_roi(img, [1])
        Helper.block_proc(roi_img, (20, 20), blockfunc)
        # Helper.serialize(ser_filename, img_data)
    else:
        img_data = Helper.unserialize(ser_filename)

    # classify
    r = SVM.classify(img_data, featureRepresentation='glcm',
                     shouldSaveResult=True)

    # Count the number of '1's in the result and return it
    count = r.tolist().count(1)
    print("COUNT: {}".format(count))
    return count
def task_4_results():
    kf = KFold(n_folds=k)
    num_manual_features = len(X_manual[0])
    accuracies = []
    X_manual_np = np.asarray(X_manual)
    for train_index, test_index in kf.split(X_content):
        X_content_train = X_content[train_index]
        X_manual_test = X_manual[test_index]  # just for structure, populated below
        X_content_test = X_content[test_index]
        num_test = len(test_index)
        for i in range(num_manual_features):
            X_np_train = X_manual_np[train_index, i]
            SVM.learn_svm(X_content_train, list(X_np_train),
                          prefix + "task_4_model")
            for j in range(num_test):
                X_manual_test[j][i] = SVM.load_svm(
                    prefix + "task_4_model").predict([X_content_test[j]])[0]
        X_manual_train = X_manual[train_index]
        y_train = y[train_index]
        y_test = y[test_index]
        SVM.learn_svm(X_manual_train, y_train, prefix + "task_4_model_1")
        accuracies.append(SVM.test_svm_accuracy(X_manual_test, y_test,
                                                prefix + "task_4_model_1"))
    print "Accuracy for task_4:", np.mean(accuracies), "+-", np.std(accuracies)
    return accuracies
from SVM import *
from documento import *
from pattern.web import *
from model import *
import os

# reload(sys)  # Reload does the trick!
# sys.setdefaultencoding('UTF8')
# print sys.getdefaultencoding()

lecturaArchivo('data/documentos.csv', 'entrenamiento')
X = getDocumentosAtributos('entrenamiento')
Y = getDocumentosClase()
unSVM = SVM(1.0, 'poly', .7, .3, X, Y)
unSVM.training()
print "Precision : ", unSVM.testing()
lecturaArchivo('data/prediccion.csv', 'predecir')
X = getDocumentosAtributos('predecir')
print unSVM.predecir(X)
        # (tail of a loop over passenger classes, f in range(0, 3))
        median_fare[f] = test_df[
            test_df.Pclass == f + 1]['Fare'].dropna().median()

    # loop 0 to 2
    for f in range(0, 3):
        test_df.loc[(test_df.Fare.isnull()) &
                    (test_df.Pclass == f + 1), 'Fare'] = median_fare[f]

    # Collect the test data's PassengerIds before dropping the column
    ids = test_df['PassengerId'].values
    # Remove the Name, Cabin, Ticket, and Sex columns (Sex was copied and
    # filled into Gender)
    test_df = test_df.drop(
        ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
    test_data = test_df.values
    return ids, test_data


def clean_data(train_df, test_df):
    train_data, Ports_dict = clean_train_data(train_df)
    ids, test_data = clean_test_data(Ports_dict, test_df)
    return train_data, test_data, ids


if __name__ == '__main__':
    # Data cleanup
    # Load the test file into a dataframe
    test_df = pd.read_csv('data/test.csv', header=0)
    # Load the train file into a dataframe
    train_df = pd.read_csv('data/train.csv', header=0)
    train_data, test_data, ids = clean_data(train_df, test_df)
    SVM.run(train_data, test_data, ids)
for i in dataSet:
    newI = HN.HyperIntervalNumber(i)
    out = newI.GetAllPosPoint()
    for j in out:
        NewdataSet.append(j)
print "The test data is:"
print dataSet
dataSet = mat(NewdataSet)
print len(dataSet)
labels = mat(labels).T
train_x = dataSet[0:800, :]
train_y = labels[0:800, :]
test_x = dataSet[801:1599, :]
test_y = labels[801:1599, :]

## step 2: training...
print "step 2: training..."
C = 3
toler = 0.001
maxIter = 50
svmClassifier = SVM.trainSVM(train_x, train_y, C, toler, maxIter,
                             kernelOption=('rbf', 0))

## step 3: testing
print "step 3: testing..."
accuracy = SVM.testSVM(svmClassifier, test_x, test_y)

## step 4: show the result
print "step 4: show the result..."
print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
# SVM.showSVM(svmClassifier)
numFeaturesTrueP = getTotalCountFeature(type4, trueTable)
numFeaturesDecP = getTotalCountFeature(type4, decTable)
numFeaturesPosP = getTotalCountFeature(type4, posTable)
numFeaturesNegP = getTotalCountFeature(type4, negTable)
numFeaturesTrueGivenPP = getTotalCountFeature(type4, truePosTable)
numFeaturesDecGivenPP = getTotalCountFeature(type4, decPosTable)
numFeaturesTrueGivenNP = getTotalCountFeature(type4, trueNegTable)
numFeaturesDecGivenNP = getTotalCountFeature(type4, decNegTable)

for te in test:
    posOrNegC = SVM.getTrueOrDeceptive(type1, numFeaturesPosC, numFeaturesNegC,
                                       te[type1], posTable, negTable)
    posOrNegU = SVM.getTrueOrDeceptive(type2, numFeaturesPosU, numFeaturesNegU,
                                       te[type2], posTable, negTable)
    posOrNegB = SVM.getTrueOrDeceptive(type3, numFeaturesPosB, numFeaturesNegB,
                                       te[type3], posTable, negTable)
    posOrNegP = SVM.getTrueOrDeceptive(type4, numFeaturesPosP, numFeaturesNegP,
                                       te[type4], posTable, negTable)
    trueOrDecC = 0
    trueOrDecU = 0
    trueOrDecB = 0
    trueOrDecP = 0
    if posOrNegC == 1:
        trueOrDecC = SVM.getTrueOrDeceptive(type1, numFeaturesTrueGivenPC,
                                            numFeaturesDecGivenPC, te[type1],
                                            truePosTable, decPosTable)
    else:
        trueOrDecC = SVM.getTrueOrDeceptive(type1, numFeaturesTrueGivenNC,
                                            numFeaturesDecGivenNC, te[type1],
                                            trueNegTable, decNegTable)
    if posOrNegU == 1:
def CrossValidation(K, X, gamma, c):
    # crossValidation: m=1 uses the explicit fit, m=2 uses the gradient-descent fit
    print(X.shape[0])
    classes = []
    classNum = 0
    for y in X[:, -1]:
        if not y in classes:
            classes.append(y)
            classNum += 1
    precisions = np.zeros(shape=(classNum)).tolist()
    recalls = np.zeros(shape=(classNum)).tolist()
    accuracys = 0
    fMeasures = np.zeros(shape=(classNum)).tolist()
    for k in xrange(K):
        # for k in range(0, 1):
        training = np.ndarray(shape=(0, X.shape[1]))
        validation = np.ndarray(shape=(0, X.shape[1]))
        for i in range(0, X.shape[0]):
            if i % K != k:
                training = np.vstack([training, X[i]])
            else:
                validation = np.vstack([validation, X[i]])
        yExpected = validation[:, -1]
        xs = validation[:, 0:validation.shape[1] - 1]
        classifier = SVM.svm(training[:, 0:X.shape[1] - 1],
                             training[:, X.shape[1] - 1], gamma=gamma, c=c)
        classifier.train()
        confusionMatrix = np.zeros(shape=(classNum, classNum), dtype=float)
        count = 0
        for x in xs:
            # print(classifier.predict(x))
            j = classifier.predict(x)
            confusionMatrix[j, yExpected[count]] = confusionMatrix[j, yExpected[count]] + 1
            count += 1
            # confusionMatrix[classes.index(ys[count]), classes.index(y)] = confusionMatrix[classes.index(ys[count]), classes.index(y)] + 1
        print(confusionMatrix)
        precision = np.zeros(shape=(classNum), dtype=float)
        recall = np.zeros(shape=(classNum), dtype=float)
        accuracy = 0
        fMeasure = np.zeros(shape=(classNum), dtype=float)
        for i in range(classNum):
            if np.sum(confusionMatrix[i, :]) == 0:
                precision[i] = 0
            else:
                precision[i] = confusionMatrix[i, i] / np.sum(confusionMatrix[i, :])
            if np.sum(confusionMatrix[:, i]) == 0:
                recall[i] = 0
            else:
                recall[i] = confusionMatrix[i, i] / np.sum(confusionMatrix[:, i])
            accuracy += confusionMatrix[i, i]
            if precision[i] == 0 or recall[i] == 0:
                fMeasure[i] = 0
            else:
                fMeasure[i] = 2 * precision[i] * recall[i] / (precision[i] + recall[i])
        accuracy = accuracy / validation.shape[0]
        precisions = precisions + precision
        recalls = recalls + recall
        accuracys = accuracys + accuracy
        fMeasures = fMeasures + fMeasure
    p = np.array(precisions) / K
    r = np.array(recalls) / K
    a = accuracys / K
    f = np.array(fMeasures) / K
    print("precision:")
    print(p)
    print("recall:")
    print(r)
    print("accuracy:")
    print(a)
    print("F measure:")
    print(f)
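# The per-class precision/recall/F-measure bookkeeping above can be
# cross-checked with scikit-learn (an assumption -- the original computes
# everything from its own confusion matrix):
from sklearn.metrics import classification_report

y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 2, 0, 0, 2]
print(classification_report(y_true, y_pred))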
def main(argv):
    SVM.classify()
def main():
    # data_set = pd.read_csv('creditdata.csv', index_col=0)
    data_set = pd.read_csv('creditSmall.csv', index_col=0)
    data_set = fix_header(data_set)
    # Collapse the undocumented EDUCATION and MARRIAGE codes into "other".
    data_set.EDUCATION[data_set.EDUCATION == '0'] = '4'
    data_set.EDUCATION[data_set.EDUCATION == '5'] = '4'
    data_set.EDUCATION[data_set.EDUCATION == '6'] = '4'
    data_set.MARRIAGE[data_set.MARRIAGE == '0'] = '3'
    data_set = data_set.astype(float)
    print(data_set.DEFAULTER.mean() * 100)
    # Bill-to-payment ratios, scaled by the credit limit.
    data_set['BILL_PAY_RATIO1'] = (data_set['BILL_AMT1'] - data_set['PAY_AMT1']) / data_set['LIMIT_BAL']
    data_set['BILL_PAY_RATIO2'] = (data_set['BILL_AMT2'] - data_set['PAY_AMT2']) / data_set['LIMIT_BAL']
    data_set['BILL_PAY_RATIO3'] = (data_set['BILL_AMT3'] - data_set['PAY_AMT3']) / data_set['LIMIT_BAL']
    data_set['BILL_PAY_RATIO4'] = (data_set['BILL_AMT4'] - data_set['PAY_AMT4']) / data_set['LIMIT_BAL']
    data_set['BILL_PAY_RATIO5'] = (data_set['BILL_AMT5'] - data_set['PAY_AMT5']) / data_set['LIMIT_BAL']
    data_set['BILL_PAY_RATIO6'] = (data_set['BILL_AMT6'] - data_set['PAY_AMT6']) / data_set['LIMIT_BAL']
    x = data_set.drop(['DEFAULTER'], axis=1)
    y = data_set.DEFAULTER
    # rescale the metrics to the same mean and standard deviation
    scaler = preprocessing.StandardScaler()
    x = scaler.fit(x).transform(x)
    # divide the data into a 70% / 30% train/test split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, stratify=y, random_state=2)

    # creating classifier
    # classifier = neighbors.KNeighborsClassifier(n_neighbors=3)
    # classifier.fit(X_train, Y_train)
    # accuracy = classifier.score(X_test, Y_test)
    # print("Accuracy :", accuracy)
    # myaccuracy = KNN(x_train, y_train, y_test, x_test, 3)
    # print(myaccuracy)

    # Prediction using Naive Bayes
    # model = fit(X_train, Y_train)
    # predictions = getPredictions(model, X_test)
    # accuracies = getAccuracy(Y_test, predictions)
    # print('Accuracy: ', accuracies)
    #
    # gaussNb = GaussianNB()
    # gaussNb.fit(X_train, Y_train)
    # print(gaussNb)
    # y_expect = Y_test
    # y_pred = gaussNb.predict(X_test)
    # print(accuracy_score(y_expect, y_pred))
    #
    # classification = classification_report(Y_test, predictions)
    # print(classification)
    # label = [0, 1]
    # cmatrix = confusion_matrix(Y_test, predictions, label)
    # print(cmatrix)
    #
    # classification1 = classification_report(Y_test, y_pred)
    # print(classification1)
    # label1 = [0, 1]
    # cmatrix1 = confusion_matrix(Y_test, y_pred, label1)
    # print(cmatrix1)
    # plot_confusion_matrix(cmatrix, label)
    # plot_confusion_matrix(cmatrix1, label1, title="2")

    clf = SVM()
    # Map the {0, 1} labels to {-1, +1} for the SVM.
    y_svmTrain = np.where(y_train == 0, -1, 1)
    clf.fit(x_train, y_svmTrain)
    y_predict = clf.predict(x_test)
    # Compare against the test labels mapped the same way; the original
    # compared {-1, +1} predictions with the raw {0, 1} labels.
    y_svmTest = np.where(y_test == 0, -1, 1)
    correct = np.sum(y_predict == y_svmTest)
    print("%d out of %d predictions correct" % (correct, len(y_predict)))
    print()
    print(y_test)
    plot_margin(x_train[y_svmTrain == 1], x_train[y_svmTrain == -1], clf)
    print(x_test)
def buildSVM(k, l):
    '''
    Description: Build the SVM model for a pair of classes in the Digit data.
    @param:
        k: the first class of the SVM model, 0 <= k <= 9
        l: the second class of the SVM model, 0 <= l <= 9
    @procedure: saves the simpler SVM model between classes k and l.
    '''
    ## Step 1: load data
    log = "Step 1: loading data..."
    writeLog(log)
    print log
    train_x, train_y, test_x, test_y = loadDigitData()
    # set_printoptions(threshold='nan')

    # extract the k and l classes
    K_IndexTrain = nonzero(train_y.A == k)[0]
    L_IndexTrain = nonzero(train_y.A == l)[0]
    IndexTrain = concatenate((K_IndexTrain, L_IndexTrain))
    # randomly shuffle the array
    IndexTrain = random.permutation(IndexTrain)
    K_IndexTest = nonzero(test_y.A == k)[0]
    L_IndexTest = nonzero(test_y.A == l)[0]
    IndexTest = concatenate((K_IndexTest, L_IndexTest))
    # randomly shuffle the array
    IndexTest = random.permutation(IndexTest)
    train_x = train_x[IndexTrain]
    train_y = train_y[IndexTrain]
    test_x = test_x[IndexTest]
    test_y = test_y[IndexTest]
    # set the labels to -1 and +1
    train_y[train_y == k] = -1
    train_y[train_y == l] = 1
    test_y[test_y == k] = -1
    test_y[test_y == l] = 1
    # scale the feature values to [-1, 1]
    train_x = train_x / 255.0 * 2 - 1
    test_x = test_x / 255.0 * 2 - 1

    ## Step 2: training data
    log = "Step 2: training data..."
    writeLog(log)
    print log
    C = 16
    toler = 0.001
    maxIter = 50
    svmClassifier = SVM.train(train_x, train_y.T, C, toler, maxIter,
                              kernel=('rbf', 13))
    # save the model to disk for future prediction
    svmClassifier.save('./models/svm_' + str(k) + '_' + str(l))
    # simpleSVM = SVMSimpleStruct(svmClassifier)
    # simpleSVM.save('./models/simple_svm_' + str(k_class))
    # # load the model
    # print 'Step 2: loading model...'
    # d = shelve.open('./models/svm_' + str(k) + '_' + str(l))
    # svmClassifier = d['svm']
    # d.close()

    ## Step 3: testing data
    log = "Step 3: testing data..."
    writeLog(log)
    print log
    accuracy = SVM.test(svmClassifier, test_x, test_y)

    ## Step 4: show the results
    log = 'The classify accuracy is: %.3f%%' % (accuracy * 100)
    print log
    writeLog(log)
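# buildSVM trains one binary model per digit pair; the full one-vs-one
# ensemble needs all 45 pairs, which testSVM() above then loads for voting.
# A minimal driver sketch (not in the original source):
for k in range(10):
    for l in range(k + 1, 10):
        buildSVM(k, l)  # trains and saves ./models/svm_<k>_<l>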
df = Parser.load_parsed_data_from_file(filename + ".parsed_encoded_data")
training_set_fraction = 0.7
training_data = df.loc[:training_set_fraction * float(df.shape[0])]

###### ###### ###### ###### ###### ######
######        START TRAINING       ######
###### ###### ###### ###### ###### ######
import time
stime = time.time()

### One-Class Support Vector Machine ###
import SVM
from sklearn.externals import joblib

OCSVM = SVM.trainOCSVM(training_data, tol=0.001, cache_size=2000,
                       shrinking=False, nu=0.05, verbose=True)
joblib.dump(OCSVM, filename=filename + ".fitted_SVM_model")
clf = joblib.load(filename + ".fitted_SVM_model")
########################################

### Autoassociative NN ###
import Autoencoder
import tensorflow as tf

sess = tf.Session()
x = tf.placeholder("float", [None, df.shape[1]])
autoencoder = Autoencoder.create(x, [48, 24, 12])
EWMACost = 0
Autoencoder.train_AE(df=training_data, sess=sess, x=x, denoising=False,
                     verbose=False, autoencoder=autoencoder)
def classify_function(filename, method, k, kernal, n):
    if filename == '':
        raise data.ValidationError('please select a file')
    X, Y, subjectID = data.load_data("control_features_combinedSubject.txt",
                                     "dementia_features_combinedSubject.txt")
    X = data.get_useful_features_mat(X)
    alz_count = 0
    for y in Y:
        if y:
            alz_count = alz_count + 1
    # normalize features
    X_scaled, normalizer = data.normalize_features(X)
    # Hold out the subject that the given file belongs to.
    testList = []
    filenameSubj = filename[0:3]
    testList.append(filenameSubj)
    try:
        X_train, Y_train, X_test, Y_test, trainID, testID = data.split_train_test(
            X_scaled, Y, subjectID, testID=testList)
    except ValueError:
        print filenameSubj
        raise data.ValidationError("combined data missing!")
    # PCA
    if n < 1 and n != -1:
        raise data.ValidationError('# features has to be greater than 0')
    elif n > 16:
        raise data.ValidationError('# features has to be less than 16')
    elif n >= 1 and n <= 16:
        pca, explained_variance_ratio_ = data.reduce_dimension(X_train, n)
        X_train = pca.transform(X_train)
    # load the real testing data; Y_test should remain the same
    X_test = data.load_testing_visit("control_features_per_visit.txt",
                                     "dementia_features_per_visit.txt",
                                     filename[0:5])
    X_test = data.get_useful_features_mat(X_test)
    X_test = normalizer.transform(X_test)
    # if PCA is in use, project the test data too
    if n >= 1 and n <= 16:
        X_test = pca.transform(X_test)
    if method == 0:
        raise data.ValidationError('please select a classification method')
    # SVM
    elif method == 2:
        if kernal == 0:
            raise data.ValidationError('please select a kernel')
        clf = SVM.train(X_train, Y_train, kernal)
        result = SVM.test(X_test, clf)
    # KNN
    elif method == 1:
        if k == '':
            raise data.ValidationError('please select the number of neighbors (k)')
        neigh = KNN.train(X_train, Y_train, k)
        result = KNN.test(X_test, neigh)
    return result[0], Y_test[0]
print "step 1: load data..."
dataSet = []
labels = []
with open('../../data/testSet.txt', 'r') as file:
    for line in file.readlines():
        line = line.strip().split('\t')
        dataSet.append([float(line[0]), float(line[1])])
        labels.append(float(line[2]))

dataSet = mat(dataSet)
labels = mat(labels).T
train_x = dataSet[0:81, :]
train_y = labels[0:81, :]
test_x = dataSet[80:101, :]
test_y = labels[80:101, :]

## step 2: training...
print "step 2: training..."
C = 0.6
toler = 0.001
maxIter = 50
svmClassifier = SVM.train(train_x, train_y.T, C, toler, maxIter,
                          kernel=('linear', 0))

## step 3: testing
print "step 3: testing..."
accuracy = SVM.test(svmClassifier, test_x, test_y)

## step 4: show the result
print "step 4: show the result..."
print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
SVM.show(svmClassifier)
def plot_learning_curve(features_train, labels_train, features_test,
                        labels_test, outputClasses=None, K="linear", C=1,
                        G=0.01, method="error", classifier="SVM", a=0, b=1,
                        showFigure=True, saveFigure=False, savePath=None,
                        nb_epochs=10):
    # run for every 10% of the training set and compute training and testing error
    step = len(features_train) / 10
    train = []
    test = []
    maj_clas = []
    for i in range(0, 10):
        print 'iteration : ', i
        # train on the first (i+1)*10 percent of the training set
        f = features_train[0:((i + 1) * step)]
        l = labels_train[0:((i + 1) * step)]
        assert f.shape[0] == l.shape[0], 'Wrong number of input data! '
        if classifier == "SVM":
            # train the classifier on this subset of the training set
            model = SVM.train(f, l, k=K, c=C, g=G)
            # get training error
            predictionTrain = SVM.predict(f, model)
            # get testing error
            predictionTest = SVM.predict(features_test, model)
        elif classifier == "LR":
            model = LogisticRegression.train(f, l, c=C)
            predictionTrain = LogisticRegression.predict(f, model)
            predictionTest = LogisticRegression.predict(features_test, model)
        elif classifier == "CNN":
            model = Keras_test.CNN().train(features=f, labels=l,
                                           outputClasses=outputClasses,
                                           learning_curves_OR_Cross_Val=True,
                                           nb_epoch=nb_epochs)
            predictionTrain = Keras_test.CNN().predictClasses(features=f,
                                                              model=model)
            predictionTest = Keras_test.CNN().predictClasses(
                features=features_test, model=model)
        # TODO: CNN minibatch learning curves. Implementation: read 10% of the
        # data at a time and train with cnn.train on all of it.
        # elif classifier == "CNN_minibatches":
        elif classifier == "MLP":
            model = Keras_test.MLP().train(features=f, labels=l,
                                           outputClasses=outputClasses,
                                           learning_curves_OR_Cross_Val=True,
                                           nb_epoch=nb_epochs)
            predictionTrain = Keras_test.MLP().predict(features=f, model=model,
                                                       ShowAccuracy=False)
            predictionTest = Keras_test.MLP().predict(features=features_test,
                                                      model=model,
                                                      ShowAccuracy=False)
        elif classifier == "SimpleRNN":
            model = Keras_test.RNN().trainRNN(features=f, labels=l,
                                              outputClasses=outputClasses,
                                              learning_curves=True)
            predictionTrain = Keras_test.RNN().predict(features=f, model=model,
                                                       ShowAccuracy=False)
            predictionTest = Keras_test.RNN().predict(features=features_test,
                                                      model=model,
                                                      ShowAccuracy=False)
        elif classifier == "RNN_LSTM":
            model = Keras_test.RNN().trainRnnLSTM(features=f, labels=l,
                                                  outputClasses=outputClasses,
                                                  learning_curves=True)
            predictionTrain = Keras_test.RNN().predict(features=f, model=model,
                                                       ShowAccuracy=False)
            predictionTest = Keras_test.RNN().predict(features=features_test,
                                                      model=model,
                                                      ShowAccuracy=False)
        # get the error of the majority classifier as a baseline
        predictionMajority = MajorityClassifier.predictMaj(labels_test)
        if method == "error":
            train.append(measures.error(l, predictionTrain))
            test.append(measures.error(labels_test, predictionTest))
            maj_clas.append(measures.error(labels_test, predictionMajority))
        elif method == "avgF1":
            train.append(measures.avgF1(l, predictionTrain, a, b))
            test.append(measures.avgF1(labels_test, predictionTest, a, b))
            maj_clas.append(measures.avgF1(labels_test, predictionMajority, a, b))
    print test[9]
    x = np.arange(len(train)) * 10
    plt.plot(x, train, color="blue", linewidth="2.0", label=classifier)
    plt.plot(x, test, color="blue", linestyle="dashed", linewidth="2.0")
    plt.plot(x, maj_clas, color="red", linewidth="2.0")
    plt.ylim(0, 1)
    plt.ylabel(method)
    plt.xlabel("% of messages")
    if method == "error":
        plt.legend(loc="upper left")
    elif method == "avgF1":
        plt.legend(loc="lower left")
    if saveFigure:
        assert savePath != None, "Give an image path to save the image"
        # with a figure handle the image can be saved anywhere
        # fig1 = plt.gcf()
        plt.savefig(savePath)
        # clear the current canvas; showing and saving together would
        # otherwise interfere
        plt.clf()
    if showFigure:
        plt.show()
dataSet = []
labels = []
fileIn = open('/Users/lixurong/Downloads/DataSet.txt')
for line in fileIn.readlines():
    lineArr = line.strip().split('\t')
    dataSet.append([float(lineArr[0]), float(lineArr[1])])
    labels.append(float(lineArr[2]))

dataSet = mat(dataSet)
labels = mat(labels).T
train_x = dataSet[0:81, :]
train_y = labels[0:81, :]
test_x = dataSet[80:101, :]
test_y = labels[80:101, :]

## step 2: training...
print "step 2: training..."
C = 0.6
toler = 0.001
maxIter = 50
svmClassifier = SVM.trainSVM(train_x, train_y, C, toler, maxIter,
                             kernelOption=('linear', 0))

## step 3: testing
print "step 3: testing..."
accuracy = SVM.testSVM(svmClassifier, test_x, test_y)

## step 4: show the result
print "step 4: show the result..."
print 'The classify accuracy is: %.3f%%' % (accuracy * 100)
SVM.showSVM(svmClassifier)
#!/usr/bin/env python
import SVM
from numpy import *

dataMat, labelMat = SVM.loadDataSet('testSet.txt')
print(dataMat)
print(labelMat)
b, alphas = SVM.smoSimple(dataMat, labelMat, 0.6, 0.001, 40)
print(b)
print(alphas)
print("The number of Support Vectors:")
print(shape(alphas[alphas > 0]))
for i in range(100):
    if alphas[i] > 0.0:
        print(dataMat[i], labelMat[i])
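# In the SMO dual solution, only points with alpha_i > 0 are support vectors;
# the decision function is f(x) = sum_i alpha_i * y_i * <x_i, x> + b. A
# minimal sketch of evaluating it from smoSimple's output (the column-vector
# shape conventions here are assumptions):
import numpy as np

def decision_value(x, dataMat, labelMat, alphas, b):
    X = np.asarray(dataMat, dtype=float)
    y = np.asarray(labelMat, dtype=float).ravel()
    a = np.asarray(alphas, dtype=float).ravel()
    return float(np.sum(a * y * (X @ np.asarray(x, dtype=float))) + b)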
def run():
    # sciPpn.run()
    # logReg.run()
    svm.run()
def rate(review):
    review = SVM.asciify(review)
    inputData = []
    # 1st element: bayes with unigrams
    bayesClassifier.loadData("U")
    inputData.append(bayesClassifier.percentPositive(review))
    # bayes with adjectives
    bayesClassifier.loadData("A")
    inputData.append(bayesClassifier.percentPositive(review))
    # bayes with POS
    bayesClassifier.loadData("P")
    bayesClassifier.partOfSpeech = True
    inputData.append(bayesClassifier.percentPositive(review))
    # SVM with unigrams
    review = review.split()
    SVM.loadModule("U")
    SVM.loadWords("U")
    X = SVM.intersection(SVM.wordList, review)
    inputData.append(SVM.movieReviewer.predict(X)[0])
    # SVM with adjectives
    SVM.loadModule("A")
    SVM.loadWords("A")
    X = SVM.intersection(SVM.wordList, review)
    inputData.append(SVM.movieReviewer.predict(X)[0])
    return Tree.predict(inputData)[0]
#!/usr/bin/python
import SVM
from sklearn.externals import joblib
import os
from sys import argv

os.system("echo -n 'loading keywords...\t\t'")
SVM.loadModule(argv[1])
SVM.loadWords(argv[1])
os.system("echo -n '[done]\n'")
os.system("echo -n 'testing files:\n[00%'")
files = os.listdir('./test/pos')[:3000]
total = float(len(files)) * 2
done = 0
progress = 0
correct = 0
for i in files:
    f = SVM.asciify(open('./test/pos/' + i, 'r').read()).split()
    result = SVM.movieReviewer.predict(SVM.intersection(SVM.wordList, f))[0]
    if result > 0:
        correct += 1
    done += 1
    if done / total * 100 >= progress + 5:
        progress += 5
        # os.system("echo -n '\b\b\b=%2d%%'" % progress)
files = os.listdir('./test/neg')[:3000]
def process(self):
    '''
    Description: Pull everything together, load the data, and process it for
    the GUI. First the scores, tuned parameters, cross-validation parameters,
    and data parameters are read from the SVM frame, and the mask-row list
    from getMaskRow marks rows with missing values. Imputers are built for
    the scalar IVs, categorical IVs, and binary DV, and are applied (or the
    masked rows are deleted) according to the chosen cleanup strategy.
    '''
    scoring = self.svmFrame.getScores()
    tuned_parameters = self.svmFrame.getTunedParameters()
    cv_parameters = self.svmFrame.getCVParameters()
    data_parameters = self.svmFrame.getDataParameters()
    maskRow = self.getMaskRow()
    imputerSIV = preprocessing.Imputer(missing_values='NaN',
                                       strategy=data_parameters['impute'],
                                       axis=0, copy=True)
    imputerCIV = preprocessing.Imputer(missing_values='NaN',
                                       strategy='most_frequent',
                                       axis=0, copy=True)
    imputerBDV = preprocessing.Imputer(missing_values='NaN',
                                       strategy='most_frequent',
                                       axis=1, copy=True)
    numRow = len(self.csv) - 1  # -1 because of the header row in the csv
    numSIV = 0
    numCIV = 0
    for variable in self.variables:
        if variable.selectedType.get() == 'Scalar IV':
            numSIV += 1
        elif variable.selectedType.get() == 'Categorical IV':
            numCIV += 1
    SIV = np.empty(shape=(numRow, numSIV))
    i = 0
    for variable in self.variables:
        if variable.selectedType.get() == 'Scalar IV':
            SIV[:, i] = np.asarray(variable.values).T
            i += 1
    CIV = np.empty(shape=(numRow, numCIV))
    i = 0
    for variable in self.variables:
        if variable.selectedType.get() == 'Categorical IV':
            variable.catDict = variable.makeCatDict()
            temp = []
            for v in variable.values:
                temp.append(variable.catDict.get(v, None))
            CIV[:, i] = np.asarray(temp).T
            i += 1
    self.variables[self.indexDV].catDict = self.dvFrame.makeCatDict()
    temp = []
    for v in self.variables[self.indexDV].values:
        temp.append(self.variables[self.indexDV].catDict[v])
    y = np.asarray(temp).T
    if data_parameters['cleanup'] == 'delete':
        SIV = np.delete(SIV, maskRow, axis=0)
        CIV = np.delete(CIV, maskRow, axis=0)
        y = np.delete(y, maskRow, axis=0)
    else:
        imputerSIV.fit(SIV)
        SIV = imputerSIV.transform(SIV)
        imputerCIV.fit(CIV)
        CIV = imputerCIV.transform(CIV)
        imputerBDV.fit(y)
        y = imputerBDV.transform(y)[0]
    if data_parameters['scale']:
        self.stdScaler = preprocessing.StandardScaler().fit(SIV)
        SIV = self.stdScaler.transform(SIV)
    if data_parameters['oneHot']:
        self.encScaler = preprocessing.OneHotEncoder().fit(CIV)
        CIV = self.encScaler.transform(CIV).toarray()
    X = np.concatenate((SIV, CIV), axis=1)
    SVM.skSVM(X, y, scoring, tuned_parameters, data_parameters, cv_parameters)