import sys

import rf   # local wrapper around a random forest classifier
import svm  # local wrapper around an SVM classifier


def classify(data, labels):
    method = sys.argv[1].upper()
    method_name = {'SVM': 'SVM', 'RF': 'Random Forest'}
    test_size_arr = [0.6, 0.5, 0.4]
    print('Method: %s' % method_name[method])
    for test_size in test_size_arr:
        if method == 'SVM':
            train_score, test_score = svm.run(data, labels, test_size)
        elif method == 'RF':
            train_score, test_score = rf.run(data, labels, test_size)
        else:
            # Fall back to SVM for unrecognized method names.
            train_score, test_score = svm.run(data, labels, test_size)
        print('=============================================')
        print('Train set %s | Test set %s' % (1 - test_size, test_size))
        print('Train accuracy: %s' % train_score)
        print('Test accuracy: %s' % test_score)
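# svm.run and rf.run above are project-local helpers, not shown here.
# A minimal sketch of what svm.run(data, labels, test_size) might look
# like, assuming it wraps scikit-learn and returns (train_score,
# test_score); rf.run would swap in RandomForestClassifier:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def run(data, labels, test_size):
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=test_size)
    clf = SVC()
    clf.fit(x_train, y_train)
    # score() reports mean accuracy on the given split.
    return clf.score(x_train, y_train), clf.score(x_test, y_test)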
def runSVM(self):
    # Read the SVM hyperparameters from the GUI widgets, train, and
    # report the results in a message box.
    if self.fileName != "":
        self.splitSize = int(self.split_lineEdit.text())
        self.kernelType = self.kernel_cb.currentText()
        self.degree = int(self.degree_lineEdit.text())
        self.regParam = float(self.regParam_lineEdit.text())
        self.tol = float(self.tol_lineEdit.text())
        if self.splitSize <= 40:
            self.results = svm.run(self.fileName, self.splitSize,
                                   self.kernelType, self.degree,
                                   self.tol, self.regParam)
            QMessageBox.about(self, "Results:", self.results)
        else:
            # A test split above 40% leaves too little data to train on.
            QMessageBox.about(self, "Error", "Cannot train on such a small dataset.")
    else:
        QMessageBox.about(self, "Error", "Incorrect file name!")
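# A sketch of the six-argument svm.run the GUI calls above; everything
# here is an assumption about the helper, not the project's actual code.
# It presumably loads the CSV (numeric columns, label last), holds out
# splitSize percent for testing, and fits an SVC with the chosen kernel:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def run(fileName, splitSize, kernelType, degree, tol, regParam):
    data = np.loadtxt(fileName, delimiter=',')
    X, y = data[:, :-1], data[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=splitSize / 100.0)
    clf = SVC(kernel=kernelType, degree=degree, tol=tol, C=regParam)
    clf.fit(X_train, y_train)
    return "Test accuracy: %.3f" % clf.score(X_test, y_test)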
import math

import numpy as np

import svm         # local helper: trains an SVM, returns the classifier
import svmpredict  # local helper: predicts with a trained classifier


def run(k, X, y, gamma):
    # k-fold cross-validation: train on k-1 folds, predict the held-out
    # fold, and return the overall misclassification rate.
    n = np.shape(X)[0]
    d = np.shape(X)[1]
    y_pred = np.zeros(shape=(n, 1))
    for i in range(k):
        # boundaries for each fold
        lower = math.floor(n * i / k)
        upper = math.floor((n * (i + 1) / k) - 1)
        # testing set: indices inside the fold
        T = list(range(lower, upper + 1))
        # training set: everything outside the fold
        S = [*range(lower), *range(upper + 1, n)]
        X_train = np.zeros(shape=(len(S), d))
        Y_train = np.zeros(shape=(len(S), 1))
        # fill X_train and Y_train from the training indices
        for j in range(len(S)):
            row = S[j]
            X_train[j] = X[row]
            Y_train[j] = y[row]
        # training
        clf = svm.run(X_train, Y_train.reshape((len(S))), gamma)
        # predict labels for the held-out fold
        for t in T:
            test_point = X[t].reshape((1, d))
            y_pred[t] = svmpredict.run(clf, test_point)
    # reshape y so the comparison is elementwise rather than broadcast
    error = np.mean(y.reshape((n, 1)) != y_pred)
    return error
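# Hypothetical usage of the cross-validation routine above on synthetic
# +/-1-labelled data (the gamma value is arbitrary):
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(100, 2))
    y_demo = np.sign(X_demo[:, 0]).reshape((100, 1))
    print(run(5, X_demo, y_demo, gamma=0.1))  # 5-fold CV error rate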
def result():
    sns.set(palette="muted", color_codes=True)
    # 3a
    print('=============================================')
    print('HW1-3a')
    print('=============================================')
    # fittingGaussian('average_age')
    normalTest(df['average_age'].values, name='Col[7]')
    # 3b
    print('=============================================')
    print('HW1-3b')
    print('=============================================')
    arr_7 = grouping('average_age')
    for i in range(0, 5):
        normalTest(arr_7[i], name='Col[7] in Group %s' % (i + 1))
    print('---------------------------------------------')
    leveneTest(arr_7, name='Col[7]')
    # 3c
    print('=============================================')
    print('HW1-3c')
    print('=============================================')
    oneway(arr_7, name='Col[7]')
    # 4
    print('=============================================')
    print('HW1-4')
    print('=============================================')
    selectCol = ['message_number', 'variance_age', 'conversation_number']
    for key in selectCol:
        normalTest(df[key].values, name='Col[%s]' % key)
    print('---------------------------------------------')
    for key in selectCol:
        normalTest(np.log(df[key].values), name='log Col[%s]' % key)
    # 5b
    print('=============================================')
    print('HW1-5')
    print('=============================================')
    # without box-cox
    for key in selectCol:
        arr = grouping(key)
        oneway(arr, 'Col[%s]' % key)
    print('---------------------------------------------')
    # with box-cox
    for key in selectCol:
        arr = grouping(key, mode=True)
        oneway(arr, 'box-cox Col[%s]' % key)
    # 6
    print('=============================================')
    print('HW1-6')
    print('=============================================')
    testSizeArr = [0.6, 0.5, 0.4]
    print("SVM ['average_age', 'variance_age']")
    for size in testSizeArr:
        train_score, test_score = svm.run(
            df[['average_age', 'variance_age']], df['group_category'], size)
        print('[train:test = %s:%s] train accuracy=%s, test accuracy=%s.'
              % (1 - size, size, train_score, test_score))
    print('---------------------------------------------')
    print("Random Forest ['average_age', 'variance_age']")
    for size in testSizeArr:
        train_score, test_score = rf.run(
            df[['average_age', 'variance_age']], df['group_category'], size)
        print('[train:test = %s:%s] train accuracy=%s, test accuracy=%s.'
              % (1 - size, size, train_score, test_score))
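# normalTest, leveneTest, oneway, and grouping are defined elsewhere in
# this homework script; minimal sketches of two of them, assuming they
# wrap scipy.stats, might look like this:
from scipy import stats


def normalTest(values, name=''):
    # D'Agostino-Pearson test of the null hypothesis of normality.
    stat, p = stats.normaltest(values)
    print('%s: normaltest stat=%.4f, p=%.4f' % (name, stat, p))


def oneway(groups, name=''):
    # One-way ANOVA across the given groups.
    stat, p = stats.f_oneway(*groups)
    print('%s: ANOVA F=%.4f, p=%.4f' % (name, stat, p))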
if __name__ == "__main__":
    # requires numpy as np plus the local svm/rf modules and loan_type
    loan_2 = 'data/loan_2.csv'
    sourceData = np.loadtxt(loan_2, dtype=float, delimiter=',',
                            converters={0: loan_type}, skiprows=1)
    # x holds the feature data, y the labels (first column)
    y, x = np.split(sourceData, (1,), axis=1)
    method = sys.argv[1].upper()
    method_name = {'SVM': 'SVM', 'RF': 'Random Forest'}
    test_size_arr = [0.6, 0.5, 0.4]
    print('Method: %s' % method_name[method])
    for test_size in test_size_arr:
        if method == 'SVM':
            train_score, test_score = svm.run(x, y, test_size)
        elif method == 'RF':
            train_score, test_score = rf.run(x, y, test_size)
        else:
            # Fall back to SVM for unrecognized method names.
            train_score, test_score = svm.run(x, y, test_size)
        print('=============================================')
        print('Train set %s | Test set %s' % (1 - test_size, test_size))
        print('Train accuracy: %s' % train_score)
        print('Test accuracy: %s' % test_score)
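# loan_type, the np.loadtxt converter used above, is defined elsewhere.
# A plausible sketch, assuming column 0 holds a categorical loan label
# that must become numeric (the mapping below is purely illustrative):
def loan_type(value):
    # Depending on the NumPy version, the raw field arrives as bytes or str.
    if isinstance(value, bytes):
        value = value.decode()
    mapping = {'A': 0.0, 'B': 1.0}  # hypothetical category codes
    return mapping.get(value, -1.0)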
def run(arg_model, arg_modelname, arg_train, arg_test, arg_features,
        arg_featurename, arg_name, arg_preprocess, arg_labels, arg_dev=True,
        arg_hyperopt=False, arg_dataset="mds", arg_n_feats=398,
        arg_anova="f_classif", arg_nodes=297,
        dataloc="/u/sjeblee/research/va/data/datasets", arg_rebalance=""):
    dataloc = dataloc + "/" + arg_dataset
    trainname = arg_train + "_cat"  # all, adult, child, or neonate
    devname = arg_test + "_cat"
    pre = arg_preprocess
    print("pre: " + pre)
    labels = arg_labels
    featureset = arg_featurename  # name of the feature set for the feature file
    features = arg_features  # type, checklist, narr_bow, narr_tfidf, narr_count, narr_vec, kw_bow, kw_tfidf, symp_train
    modeltype = arg_model  # svm, knn, nn, lstm, nb, rf
    modelname = arg_modelname
    # Locations of results, tools, and scripts
    resultsloc = "/u/sjeblee/research/va/data/" + arg_name
    heideldir = "/u/sjeblee/tools/heideltime/heideltime-standalone"
    scriptdir = "/u/sjeblee/research/va/git/verbal-autopsy"

    # Setup
    if not os.path.exists(resultsloc):
        os.mkdir(resultsloc)
    trainset = dataloc + "/train_" + trainname + ".xml"
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset
    element = "narrative"

    # Preprocessing
    spname = "spell"
    print("Preprocessing...")
    if "spell" in pre:
        print("Running spelling correction...")
        trainsp = dataloc + "/train_" + trainname + "_" + spname + ".xml"
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + spname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + spname + ".xml"
        if not os.path.exists(trainsp):
            print("spellcorrect on train data...")
            spellcorrect.run(trainset, trainsp)
        if not os.path.exists(devsp):
            print("spellcorrect on test data...")
            spellcorrect.run(devset, devsp)
        trainset = trainsp
        devset = devsp
        devname = devname + "_" + spname
        trainname = trainname + "_" + spname
    if "heidel" in pre:
        print("Running Heideltime...")
        with cd(heideldir):
            trainh = dataloc + "/train_" + trainname + "_ht.xml"
            if not os.path.exists(trainh):
                heidel_tag.run(trainset, trainh)
                fixtags(trainh)
            trainset = trainh
            if arg_dev:
                devh = dataloc + "/dev_" + devname + "_ht.xml"
            else:
                devh = dataloc + "/test_" + devname + "_ht.xml"
            if not os.path.exists(devh):
                heidel_tag.run(devset, devh)
                fixtags(devh)
            devset = devh
        devname = devname + "_ht"
        trainname = trainname + "_ht"
    if "medttk" in pre:
        print("Running medttk...")
        trainh = dataloc + "/train_" + trainname + "_medttk.xml"
        if not os.path.exists(trainh):
            medttk_tag.run(trainset, trainh)
            fixtags(trainh)
        trainset = trainh
        if arg_dev:
            devh = dataloc + "/dev_" + devname + "_medttk.xml"
        else:
            devh = dataloc + "/test_" + devname + "_medttk.xml"
        if not os.path.exists(devh):
            medttk_tag.run(devset, devh)
            fixtags(devh)
        devset = devh
        devname = devname + "_medttk"
        trainname = trainname + "_medttk"
        element = "narr_medttk"
    if "symp" in pre:
        print("Tagging symptoms...")
        sympname = "symp"
        trainsp = dataloc + "/train_" + trainname + "_" + sympname + ".xml"
        if not os.path.exists(trainsp):
            print("tag_symptoms on train data...")
            tag_symptoms.run(trainset, trainsp)
        trainset = trainsp
        # (symptom tagging of the dev/test set is currently disabled)
        trainname = trainname + "_" + sympname  # keep the name in sync with the tagged file

    # Feature extraction
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset
    print("trainfeatures: " + trainfeatures)
    print("devfeatures: " + devfeatures)
    stem = "stem" in pre
    lemma = "lemma" in pre
    print("stem: " + str(stem) + " lemma: " + str(lemma))
    if not (os.path.exists(trainfeatures) and os.path.exists(devfeatures)):
        print("Extracting features...")
        extract_features_dirichlet.run(trainset, trainfeatures, devset,
                                       devfeatures, features, labels,
                                       stem, lemma, element)

    # Rebalance the dataset?
    if arg_rebalance != "":
        rebalancedfeatures = trainfeatures + "." + arg_rebalance
        rebalance.run(trainfeatures, rebalancedfeatures, labels, arg_rebalance)
        trainfeatures = rebalancedfeatures

    # Model
    if arg_hyperopt:
        print("Running hyperopt...")
        model_dirichlet.hyperopt(modeltype, trainfeatures, devfeatures,
                                 devresults, resultsloc, labels)
    else:
        print("Creating model...")
        if modeltype == "nb":
            svm.run(modeltype, trainfeatures, devfeatures, devresults, labels)
        #elif modeltype == "cnn":
        #    model_temp.run(modeltype, modelname, trainfeatures, devfeatures, devresults, resultsloc, labels, arg_n_feats, arg_anova, arg_nodes)
        else:
            model_dirichlet.run(modeltype, modelname, trainfeatures,
                                devfeatures, devresults, resultsloc, labels,
                                arg_n_feats, arg_anova, arg_nodes)

    # Results statistics
    print("Calculating scores...")
    results_stats.run(devresults, devresults + ".stats")
    print("Done")
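# A hypothetical invocation of the pipeline above; every argument value
# here is illustrative only (the model types and feature names come from
# the comments inside run() itself):
if __name__ == "__main__":
    run(arg_model="svm", arg_modelname="svm1", arg_train="adult",
        arg_test="adult", arg_features="narr_tfidf", arg_featurename="tfidf",
        arg_name="exp1", arg_preprocess="spell", arg_labels="cat_labels")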
from time import time

import boosted  # local learner modules, defined elsewhere
import dt
import knn
import network
import svm

startTime = time()

# Datasets to evaluate; the single-entry lists below can be swapped in
# to run one dataset at a time.
# dataTypes = ['los', 'heart', 'adult']
# dataTypes = ['heart']
# dataTypes = ['adult']
dataTypes = ['Heart', 'Adult']

scores = []
for dataType in dataTypes:
    print(dataType)
    scores.append(runAll(dataType))
    print('DT')
    dt.run(dataType)
    print('Neural Network')
    network.run(dataType)
    print('Boosted')
    boosted.run(dataType)
    print('SVM')
    svm.run(dataType)
    print('KNN')
    knn.run(dataType)
endTime = time()
print(str(endTime - startTime))
print(scores)
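# runAll is defined elsewhere; a minimal sketch, assuming it simply
# collects one score per learner module for the given dataset (the
# return value of each module's run() is an assumption):
def runAll(dataType):
    return [module.run(dataType)
            for module in (dt, network, boosted, svm, knn)]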
# Kernel perceptron: train, then predict each test point.
# (X_train, y_train, X_test, y_test, subset, perceptron_error, and
# svm_error are set up earlier in the surrounding experiment loop.)
alpha, iterr = kerperceptron.run(5, X_train, y_train)
test_length = len(X_test)
y_pred_perceptron = np.zeros((test_length, 1))
for index, t in enumerate(X_test):
    y_pred_perceptron[index] = kerpredict.run(alpha, X_train, y_train, t)
perceptron_err = np.mean(y_test != y_pred_perceptron)
perceptron_error.append(perceptron_err)

# SVM: train on the same split, then predict the whole test set at once.
(training_length, features) = np.shape(X_train)
clf = svm.run(X_train, y_train.reshape((training_length)), 0.001)
y_pred_svm = svmpredict.run(clf, X_test)
y_pred_svm = y_pred_svm.reshape((test_length, 1))
svm_err = np.mean(y_test != y_pred_svm)
svm_error.append(svm_err)

print(svm_error)
print(perceptron_error)

# Plot error rate against sample size for both models.
fig, axs = plt.subplots(2)
fig.suptitle('Error Rate vs. Sample Size')
axs[0].plot(subset, svm_error)
axs[0].set_ylabel("SVM Error Rate")
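# svmpredict.run and kerpredict.run are project-local helpers; minimal
# sketches under assumed names, an RBF kernel, and +/-1 labels:
import numpy as np


def svmpredict_run(clf, X):
    # Thin wrapper over a fitted scikit-learn classifier.
    return clf.predict(X)


def kerpredict_run(alpha, X_train, y_train, t, gamma=0.1):
    # Kernel perceptron decision rule: the sign of the alpha-weighted
    # kernel sum over the training points.
    k = np.exp(-gamma * np.sum((X_train - t) ** 2, axis=1))
    return np.sign(np.sum(alpha * y_train.flatten() * k))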