def test_main():
    """Load an email dataset, build TF-IDF features, and cross-validate k-NN.

    Side effects: prints the cross-validation scores via pretty_print_scores.
    """
    # FIX: the original assigned `directory` three times in a row ('ds2',
    # 'dataset', 'ds3'); only the last took effect. The dead stores are kept
    # as comments so the quick-switch intent is preserved.
    # directory = 'ds2'
    # directory = 'dataset'
    directory = 'ds3'

    # Load the dataset from disk (load_files: one subdirectory per class).
    files = sklearn.datasets.load_files(directory)
    # Refine the raw email texts in place (project helper).
    refine_all_emails(files.data)
    # Bag-of-words counts (project helper).
    word_counts = bagOfWords(files.data)
    # TF-IDF re-weighting of the raw counts.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    # Classifier candidates kept for quick switching:
    # clf = sklearn.naive_bayes.MultinomialNB()
    # clf = sklearn.svm.LinearSVC()
    n_neighbors = 5
    weights = 'uniform'
    # weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # 5-fold cross validation (project helper), then report.
    scores = cross_validation(X, files.target, clf, cv=5)
    pretty_print_scores(scores)
def read(self, filename):
    """Load a two-line data file and return train/test splits.

    The file is expected to contain exactly two lines (x, then y);
    splitting into train/test is delegated to cross_validation.
    """
    with open(filename, 'r') as src:
        lines = src.readlines()
    x, y = lines
    train_x, train_y, test_x, test_y = cross_validation(x, y)
    return train_x, train_y, test_x, test_y
def test_main():
    """Run the email-classification pipeline: load, refine, TF-IDF, k-NN CV.

    Prints the resulting cross-validation scores; returns nothing.
    """
    # FIX: `directory` was reassigned three times and only the final value
    # ('ds3') ever took effect — dead stores kept as comments as toggles.
    # directory = 'ds2'
    # directory = 'dataset'
    directory = 'ds3'

    # Load the dataset from disk.
    files = sklearn.datasets.load_files(directory)
    # Refine the email bodies in place (project helper).
    refine_all_emails(files.data)
    # Bag-of-words representation (project helper).
    word_counts = bagOfWords(files.data)
    # TF-IDF transform of the counts.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X_tfidf = tf_transformer.transform(word_counts)
    X = X_tfidf

    # Alternative classifiers kept for quick switching:
    # clf = sklearn.naive_bayes.MultinomialNB()
    # clf = sklearn.svm.LinearSVC()
    n_neighbors = 5
    weights = 'uniform'
    # weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # 5-fold CV via the project helper, then pretty-print.
    scores = cross_validation(X, files.target, clf, cv=5)
    pretty_print_scores(scores)
def cross_validation_Halfaker(training, validation):
    """Stratified 5-fold CV of a random forest over the pooled data.

    Pools training + validation, undersamples to balance classes, then
    scores both ROC AUC and PR AUC through the project cross_validation
    helper.
    """
    logger.debug("Cross validation...")
    # Pool the two splits and rebalance via undersampling.
    pool = training.append(validation)
    pool = pool.undersample(1)
    logger.debug("Data size: %d" % len(pool))
    logger.debug("Vandalism: %d" % pool.getY().sum())
    # Random forest with fixed seed for reproducibility; folds are shuffled.
    forest = sklearn.ensemble.RandomForestClassifier(verbose=0, n_jobs=-1,
                                                     random_state=1)
    folds = sklearn.cross_validation.StratifiedKFold(pool.getY(), n_folds=5,
                                                     shuffle=True,
                                                     random_state=None)
    # Score with both metrics; 'pr_auc' comes from the project SCORERS table.
    cross_validation(forest, pool, 'roc_auc', folds)
    cross_validation(forest, pool, SCORERS['pr_auc'], folds)
def one_at_a_time(frame, columns, label, norm=False, **kwargs):
    """Cross-validate each column on its own and return the mean scores.

    Each column in *columns* is evaluated independently against *label*;
    extra keyword arguments are forwarded to cross_validation. When *norm*
    is true the scores are scaled by their maximum.
    """
    scores = np.array([
        cross_val_performance(cross_validation(frame, [col], label, **kwargs)).mean()
        for col in columns
    ])
    if norm:
        scores = scores / float(scores.max())
    return scores
def video_cross_validation(video_list):
    # Leave-one-video-out cross validation driver (Python 2 syntax).
    # Each round holds out one video as the test set and trains on the rest;
    # cross_validation(...) is presumably expected to append per-round metrics
    # to the module-level arrays below — TODO confirm against its definition.
    global NB_accuracy_array, NB_precision_array, NB_recall_array
    global SVM_accuracy_array, SVM_precision_array, SVM_recall_array
    global LDA_accuracy_array, LDA_precision_array, LDA_recall_array
    # Reset all accumulators before the run.
    NB_accuracy_array = []
    NB_precision_array = []
    NB_recall_array = []
    SVM_accuracy_array = []
    SVM_precision_array = []
    SVM_recall_array = []
    LDA_accuracy_array = []
    LDA_precision_array = []
    LDA_recall_array = []
    for i in range(0,len(video_list)) :
        test = video_list[i]
        train = video_list[0:i] + video_list[i+1:]
        print '\n-------------------------------------------------------------\nRound ',i,':'
        cross_validation(train, test)
    # Average the accumulated per-round metrics.
    # NOTE(review): float(sum(...) / len(...)) converts AFTER dividing; if the
    # arrays ever held ints this would truncate under Python 2 — presumably
    # the metrics are floats, verify.
    NB_acc = float(sum(NB_accuracy_array) / len(NB_accuracy_array))
    #SVM_acc = float(sum(SVM_accuracy_array) / len(SVM_accuracy_array))
    LDA_acc = float(sum(LDA_accuracy_array) / len(LDA_accuracy_array))
    NB_prec = float(sum(NB_precision_array) / len(NB_precision_array))
    LDA_prec = float(sum(LDA_precision_array) / len(LDA_precision_array))
    NB_rec = float(sum(NB_recall_array) / len(NB_recall_array))
    LDA_rec = float(sum(LDA_recall_array) / len(LDA_recall_array))
    # NOTE(review): the SVM arrays are reset and (apparently) filled but never
    # averaged or reported here — only NB and LDA make it into the summary.
    print '\nTotal Results: \n- NB accuracy = {} % NB precision {} % NB recall {} % \n- LDA accuracy = {} % LDA precision {} % LDA recall {} % '.format(NB_acc, NB_prec, NB_rec, LDA_acc, LDA_prec, LDA_rec)
def train_topic_classifier_cv(topics, classes, full_selection, max_depth,
                              features, classifier_fn, instance_weight_fn,
                              cross_validation, evaluation_measure, param_grid,
                              classifier_params) -> GridSearchCV:
    """Grid-search a CategorySelectionClassifier and return the fitted search.

    The estimator is built from the supplied selection/feature/classifier
    configuration, then tuned over ``param_grid`` with GridSearchCV using
    ``evaluation_measure`` as the scoring function.
    """
    # NOTE(review): the `max_depth` parameter is never used — the classifier
    # is built with the private attribute full_selection._max_depth instead.
    # Confirm which of the two is intended before relying on this argument.
    tuned_clf = CategorySelectionClassifier(
        full_selection=full_selection,
        features=features,
        classifier_fn=classifier_fn,
        max_depth=full_selection._max_depth,
        instance_weight=instance_weight_fn,
        **classifier_params)
    # `cross_validation` is used as a factory: it is called with the labels to
    # build the CV splitter handed to GridSearchCV.
    clf = GridSearchCV(estimator=tuned_clf,
                       param_grid=param_grid,
                       cv=cross_validation(classes),
                       scoring=evaluation_measure)
    clf.fit(topics, classes)
    return clf
from os.path import isfile, join

# df = gd.GetData(gd.f)

# Per-estimator running MSE totals, accumulated element-wise over all files.
# Order: identity, regression, regression-ey, log-regression, 1/2/3-NN.
MSEId, MSERegr, MSERegr_ey, MSE_log, MSE1NN, MSE2NN, MSE3NN = 0, 0, 0, 0, 0, 0, 0
MSE = [0, 0, 0, 0, 0, 0, 0]
path = 'data_all/'
#onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
#path = ''
onlyfiles1 = ['aaa', 'data_test.csv']
onlyfiles = ['pilotaz'+str(i+1)+'.csv' for i in range(5)]
print(onlyfiles)

# FIX: loop variable renamed `file` -> `fname` (it shadowed the Python 2
# builtin `file` and was unclear); element-wise accumulation now uses zip
# instead of index arithmetic.
for fname in onlyfiles:
    DF = gd.get_data(path+fname)
    # cross_validation returns one MSE per estimator for this file.
    CV = cross_validation((DF['stimulus']), (DF['converted']))
    MSE2 = MSE
    MSE = [prev + cur for prev, cur in zip(MSE2, CV)]

l = len(onlyfiles)
msn_labels = ['MSEId:', 'MSERegr:', 'MSERegr_ey:', 'MSE_log:', 'MSE1NN:', 'MSE2NN:', 'MSE3NN:']
# Report the mean MSE per estimator across files.
for index, mse in enumerate(MSE):
    print(msn_labels[index] + " ", mse/l)
# print(cross_validation((df['stimulus']), (df['converted'])))
# Benchmark the vectorized distance computation, sanity-check it against the
# loop version, then evaluate NN / k-NN classifiers and cross-validate.
# (Python 2 syntax; `t` is the time module, `loop_dist`, `data`, `target`
# and the classifiers come from earlier in the file.)
t_2 = t.time()
vec_dist = dist_vec(x_train,x_test)
t_3 = t.time()
t_vec = t_3 - t_2
print "Calculate distance matrix with vectorization in", t_vec, "s"
# assert that both method yield the same results
assert(vec_dist.shape == loop_dist.shape)
assert( np.array_equal(vec_dist,loop_dist) )
print "Both methods yield the same result."
print "Nearest Neighbor Classifier:"
# Binary problems: first 1-vs-3, then 1-vs-7.
labels = (1,3)
error_rate = test_classifier(nn_classifier, data, target, labels)
print "Classification rate for distinguishing 1 and 3:", error_rate
labels = (1,7)
error_rate = test_classifier(nn_classifier, data, target, labels)
print "Classification rate for distinguishing 1 and 7:", error_rate
print "k-Nearest Neighbor Classifier"
for k in (1,3,5,9,17,33):
    # NOTE(review): `k` is never passed to test_classifier, so all six
    # iterations evaluate the identical call — presumably k should reach
    # knn_classifier somehow; confirm against test_classifier's signature.
    error_rate = test_classifier(knn_classifier, data, target, labels)
    print "k =", k, ": Classification rate for distinguishing 1 and 7:", error_rate
print "Cross validation"
# n-fold cross validation for several fold counts.
for n in (2,5,10):
    mean, var = cross_validation(data, target, n)
    print "For n =", n, ": mean classifcation rate =", mean, "with variance = ", var
training_data = Preparation.read_in(sys.argv[1], sys.argv[2]) random.shuffle(training_data) # Unzip labels and texts into separate lists: labels, feature_vectors = zip(*training_data) vectorizer = DictVectorizer() X = vectorizer.fit_transform(feature_vectors) y = labels # Train a classifier print("Starting the cross-validation...") cross_validation() print("Done!") #################################################################### # Print Instructions # #################################################################### else: print("\nUSAGE:\n") print("python3 cross_validation.py spam_data.json ham_data.json \n") print("spam_data.json: labeled spam data in JSON format.") print("ham_data.json: labeled ham data in JSON format.\n\n")
X_valid = np.transpose([kf_A_valid[i], kf_B_valid[i]]) y_valid = np.array(kf_L_valid[i]) result = method(X, y) current_score = result.score(X_valid, y_valid) # Print the score for this fold print "Validation score: ", current_score # Store the best result and best score # We will use this fold and test data to obtain the test score if current_score > best_score: best_result = result best_score = current_score print "Best Score:", best_score # Plot the boundary with best result plot_boundary(best_result, X, y) print "Test score: ", best_result.score(X_test, y_test) C = 2.0 # SVM regularization parameter degree = 6 # Polynomial degree cross_validation(rbf_svc) cross_validation(lin_svc) cross_validation(poly_svc) cross_validation(logistic_regression) cross_validation(random_forest_classifier) cross_validation(perceptron)
## use above function print_faces(faces.images, faces.target, 40) #### Split data (or holdout) x_train, x_test, y_train, y_test = train_test_split(faces.data, faces.target, test_size=0.25, random_state=0) ## Holdout #### Prepare classifer clf = svm.SVC(kernel='linear') #### Cross-validation def cross_validation(clf, x, y, k): cv = KFold(len(y), k, shuffle=True, random_state=0) ## create a k-fold cross validation iterator scores = cross_val_score(clf, x, y, cv=cv) ## by default the score used is the one returned by score method of the estimator (accuracy) print(scores) cross_validation(clf, x_train, y_train, 5) #### Train and Evaluate from sklearn import metrics def train_and_evaluate(clf, x_train, x_test, y_train, y_test): ## Train clf.fit(x_train, y_train) print("Accuracy on training set:") print(clf.score(x_train, y_train)) ## Evaluate y_pred = clf.predict(x_test) print("Classification Report:") print(metrics.classification_report(y_test, y_pred)) print("Confusion Matrix:")
print """ Linear SVM, 3-class 5-fold CV, no-class weights """ thresh = 0.000005/2 print "thresh = {}".format(thresh) hl = 100 K = 5 quotes['label'] = 0 quotes.ix[quotes['log_returns_100+'] > thresh, 'label'] = 1 quotes.ix[quotes['log_returns_100+'] < -thresh, 'label'] = -1 #clf = LRclf(thresh) clf = svm.LinearSVC(class_weight='auto') cv_results = cross_validation(quotes, clf, feature_names, label='label', K=5) y = quotes['label'].values #reg = sm.OLS(y, quotes[feature_names]).fit() #print reg.summary() clf_output(cv_results, y, K, feature_names) print """ Linear Regression Classifier, 3-class 5-fold CV, no-class weights """ thresh = 0.000005*100 print "thresh = {}".format(thresh) hl = 100 K = 5 quotes['label'] = 0
def main():
    """Entry point: run 10-fold cross validation on the enron2 dataset."""
    # save_likelihood("likelihood")
    cross_validation("data/enron2/", k=10, verbose=True)
# Time the vectorized distance matrix, verify it matches the loop-based one,
# then run NN / k-NN classification experiments and n-fold cross validation.
# (Python 2 syntax; `t` is the time module, `loop_dist`, `data`, `target`
# and the classifier functions are defined earlier in the file.)
t_2 = t.time()
vec_dist = dist_vec(x_train, x_test)
t_3 = t.time()
t_vec = t_3 - t_2
print "Calculate distance matrix with vectorization in", t_vec, "s"
# assert that both method yield the same results
assert (vec_dist.shape == loop_dist.shape)
assert (np.array_equal(vec_dist, loop_dist))
print "Both methods yield the same result."
print "Nearest Neighbor Classifier:"
# Two binary tasks: digits 1-vs-3, then 1-vs-7.
labels = (1, 3)
error_rate = test_classifier(nn_classifier, data, target, labels)
print "Classification rate for distinguishing 1 and 3:", error_rate
labels = (1, 7)
error_rate = test_classifier(nn_classifier, data, target, labels)
print "Classification rate for distinguishing 1 and 7:", error_rate
print "k-Nearest Neighbor Classifier"
for k in (1, 3, 5, 9, 17, 33):
    # NOTE(review): `k` is not passed to test_classifier — every iteration
    # makes the same call, so the printed rates cannot actually vary with k.
    # Presumably k should be forwarded to knn_classifier; verify.
    error_rate = test_classifier(knn_classifier, data, target, labels)
    print "k =", k, ": Classification rate for distinguishing 1 and 7:", error_rate
print "Cross validation"
# Cross-validate at several fold counts.
for n in (2, 5, 10):
    mean, var = cross_validation(data, target, n)
    print "For n =", n, ": mean classifcation rate =", mean, "with variance = ", var
from os import listdir
from os.path import isfile, join

# df = gd.GetData(gd.f)

# Running MSE totals for the seven estimators, summed element-wise per file.
# Order: identity, regression, regression-ey, log-regression, 1/2/3-NN.
MSEId, MSERegr, MSERegr_ey, MSE_log, MSE1NN, MSE2NN, MSE3NN = 0, 0, 0, 0, 0, 0, 0
MSE = [0, 0, 0, 0, 0, 0, 0]
path = 'data_all/'
#onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
#path = ''
onlyfiles1 = ['aaa', 'data_test.csv']
onlyfiles = ['pilotaz' + str(i + 1) + '.csv' for i in range(5)]
print(onlyfiles)

# FIX: renamed loop variable `file` -> `fname` (shadowed the Python 2 builtin
# and is ambiguous); element-wise accumulation via zip instead of indexing.
for fname in onlyfiles:
    DF = gd.get_data(path + fname)
    # One MSE per estimator for this file.
    CV = cross_validation((DF['stimulus']), (DF['converted']))
    MSE2 = MSE
    MSE = [prev + cur for prev, cur in zip(MSE2, CV)]

l = len(onlyfiles)
msn_labels = [
    'MSEId:', 'MSERegr:', 'MSERegr_ey:', 'MSE_log:', 'MSE1NN:', 'MSE2NN:',
    'MSE3NN:'
]
# Print the mean MSE per estimator across all files.
for index, mse in enumerate(MSE):
    print(msn_labels[index] + " ", mse / l)
# print(cross_validation((df['stimulus']), (df['converted'])))
c = 1 #run 2 thresh = 0.000005 hl = 100 K = 5 filtered_data['label'] = 0 filtered_data.loc[filtered_data[pred_col] > thresh, 'label'] = 1 filtered_data.loc[filtered_data[pred_col] < -thresh, 'label'] = -1 #clf = LRclf(thresh) clf = svm.LinearSVC(C=c, class_weight='auto') cv_results = cross_validation(filtered_data, clf, feature_names, label='label', K=5) y = filtered_data['label'].values #reg = sm.OLS(y, quotes[feature_names]).fit() #print reg.summary() print """ Filtered-data Linear SVM, 3-class 5-fold CV, auto class weights """ print "crossover_hls = {}".format(crossover_hls) print "Pred_col = {}".format(pred_col) print "thresh = {}".format(thresh) print "C = {}".format(c) clf_output(cv_results, y, K) #run 3
# NOTE(review): this chunk starts mid-script — `names` and the MSE* totals
# printed by the first loop are defined outside this view.
for i, mse in enumerate(
        [MSEId, MSERegr, MSERegr_ey, MSERegr_log, MSE1NN, MSE2NN, MSE3NN]):
    print(names[i] + ": ", mse)

N = 31
path = 'data_all/'
# Load the pickled list of per-subject data dicts.
with open(path + "data_all.pkl", 'rb') as f:
    data_all = pickle.load(f)

# Pool the cross-validation outputs (predictions per estimator) across all
# subjects, then compute one overall MSE per estimator.
allX, allId, allY, allRegr, allRegr_ey, allRegr_log, all1NN, all2NN, all3NN = [], [], [], [], [], [], [], [], []
for data in data_all:
    resultsX, resultsId, resultsY, resultsRegr, resultsRegr_ey, resultsRegr_log, results1NN, results2NN, results3NN = cross_validation(
        data['stimulus'], data['converted'])
    allX += resultsX
    allId += resultsId
    allY += resultsY
    allRegr += resultsRegr
    allRegr_ey += resultsRegr_ey
    allRegr_log += resultsRegr_log
    all1NN += results1NN
    all2NN += results2NN
    all3NN += results3NN
# NOTE(review): `mse` below is called as a function, but the enumerate loop
# at the top rebound `mse` to a number; if both run in this order in the same
# scope this call would fail — presumably the print loop actually executes
# after these assignments in the full file. Verify the real ordering.
MSEId, MSERegr, MSERegr_ey, MSERegr_log, MSE1NN, MSE2NN, MSE3NN = mse(
    allX, allId), mse(allY, allRegr), mse(allY, allRegr_ey), mse(
        allY, allRegr_log), mse(allY, all1NN), mse(allY, all2NN), mse(allY, all3NN)