Beispiel #1
0
def test_main():
    """Load an email corpus, build TF-IDF features and cross-validate a k-NN classifier.

    Fix: `directory` was assigned three times in a row ('ds2', 'dataset',
    'ds3'); only the last assignment ever took effect, so the dead stores
    are removed and kept here as a comment instead.
    """
    # directory = 'ds2' / 'dataset' were earlier experiments; 'ds3' is live.
    directory = 'ds3'

    # load the dataset from disk
    files = sklearn.datasets.load_files(directory)

    # refine the raw email bodies (return value ignored; presumably mutates
    # files.data in place -- confirm against refine_all_emails)
    refine_all_emails(files.data)

    # calculate the bag-of-words representation
    word_counts = bagOfWords(files.data)

    # re-weight the raw counts with TF-IDF
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    # cross validation; MultinomialNB and LinearSVC were earlier candidates
    n_neighbors = 5
    weights = 'uniform'  # 'distance' is the alternative weighting
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    scores = cross_validation(X, files.target, clf, cv=5)
    pretty_print_scores(scores)
    def read(self, filename):
        """Load a two-line data file and split it into train/test folds.

        The file must contain exactly two lines (features, then labels);
        unpacking raises ValueError otherwise.  Splitting is delegated to
        the cross_validation helper.
        """
        with open(filename, 'r') as handle:
            feature_line, label_line = handle.readlines()

        train_x, train_y, test_x, test_y = cross_validation(feature_line, label_line)
        return train_x, train_y, test_x, test_y
Beispiel #3
0
def test_main():
    """Load an email corpus, build TF-IDF features and cross-validate a k-NN classifier.

    Fix: `directory` was assigned three times in a row ('ds2', 'dataset',
    'ds3'); only the last assignment ever took effect, so the dead stores
    are removed and kept as a comment instead.
    """
    # directory = 'ds2' / 'dataset' were earlier experiments; 'ds3' is live.
    directory = 'ds3'

    # load the dataset from disk
    files = sklearn.datasets.load_files(directory)

    # refine the raw email bodies (return value ignored; presumably mutates
    # files.data in place -- confirm against refine_all_emails)
    refine_all_emails(files.data)

    # calculate the bag-of-words representation
    word_counts = bagOfWords(files.data)

    # re-weight the raw counts with TF-IDF
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    # cross validation; MultinomialNB and LinearSVC were earlier candidates
    n_neighbors = 5
    weights = 'uniform'  # 'distance' is the alternative weighting
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    scores = cross_validation(X, files.target, clf, cv=5)
    pretty_print_scores(scores)
Beispiel #4
0
def cross_validation_Halfaker(training, validation):
    """Pool training+validation data, balance it, and score a random forest.

    The pooled set is undersampled to a 1:1 class ratio, then evaluated
    with stratified 5-fold CV under both ROC-AUC and PR-AUC scorers.
    """
    logger.debug("Cross validation...")
    pooled = training.append(validation)
    pooled = pooled.undersample(1)  # balance the classes 1:1
    logger.debug("Data size: %d" % len(pooled))
    logger.debug("Vandalism: %d" % pooled.getY().sum())

    forest = sklearn.ensemble.RandomForestClassifier(
        verbose=0, n_jobs=-1, random_state=1)

    folds = sklearn.cross_validation.StratifiedKFold(
        pooled.getY(), n_folds=5, shuffle=True, random_state=None)
    cross_validation(forest, pooled, 'roc_auc', folds)
    cross_validation(forest, pooled, SCORERS['pr_auc'], folds)
Beispiel #5
0
def one_at_a_time(frame, columns, label, norm=False, **kwargs):
    """Cross-validate each column individually and return the mean scores.

    Every column is used as the sole feature for cross_validation; the
    mean of cross_val_performance is collected per column.  With
    norm=True the scores are rescaled so the best column scores 1.0.
    """
    scores = np.array([
        cross_val_performance(cross_validation(frame, [column], label, **kwargs)).mean()
        for column in columns
    ])

    if norm:
        scores = scores / float(scores.max())

    return scores
Beispiel #6
0
def one_at_a_time(frame, columns, label, norm=False, **kwargs):
    """Score every column on its own via cross validation.

    Each column in *columns* is used alone as the feature set; the mean
    of cross_val_performance over the folds is recorded per column.
    When *norm* is true the result is divided by its maximum.
    """
    per_column = []
    for feature in columns:
        fold_results = cross_validation(frame, [feature], label, **kwargs)
        per_column.append(cross_val_performance(fold_results).mean())

    result = np.array(per_column)
    return result / float(result.max()) if norm else result
def video_cross_validation(video_list):
    global NB_accuracy_array, NB_precision_array, NB_recall_array
    global SVM_accuracy_array, SVM_precision_array, SVM_recall_array
    global LDA_accuracy_array, LDA_precision_array, LDA_recall_array

    NB_accuracy_array = []
    NB_precision_array = []
    NB_recall_array = []

    SVM_accuracy_array = []
    SVM_precision_array = []
    SVM_recall_array = []

    LDA_accuracy_array = []
    LDA_precision_array = []
    LDA_recall_array = []


    for i in range(0,len(video_list)) :
	test = video_list[i]
	train = video_list[0:i] + video_list[i+1:]

	print '\n-------------------------------------------------------------\nRound ',i,':'
	cross_validation(train, test)
	

    NB_acc = float(sum(NB_accuracy_array) / len(NB_accuracy_array))
    #SVM_acc = float(sum(SVM_accuracy_array) / len(SVM_accuracy_array))
    LDA_acc = float(sum(LDA_accuracy_array) / len(LDA_accuracy_array))

    NB_prec = float(sum(NB_precision_array) / len(NB_precision_array))
    LDA_prec = float(sum(LDA_precision_array) / len(LDA_precision_array))

    NB_rec = float(sum(NB_recall_array) / len(NB_recall_array))
    LDA_rec = float(sum(LDA_recall_array) / len(LDA_recall_array))

    print '\nTotal Results: \n- NB accuracy = {} % NB precision {} % NB recall {} % \n- LDA accuracy = {} % LDA precision {} % LDA recall {} % '.format(NB_acc, NB_prec, NB_rec, LDA_acc, LDA_prec, LDA_rec)
Beispiel #8
0
def train_topic_classifier_cv(topics, classes, full_selection, max_depth,
                              features, classifier_fn, instance_weight_fn,
                              cross_validation, evaluation_measure, param_grid,
                              classifier_params) -> GridSearchCV:
    """Grid-search a category-selection classifier over param_grid with CV.

    Builds the base classifier, wraps it in a GridSearchCV driven by the
    supplied cross_validation factory and scoring measure, fits it on the
    topics, and returns the fitted search object.
    """
    base_estimator = CategorySelectionClassifier(
        full_selection=full_selection,
        features=features,
        classifier_fn=classifier_fn,
        # NOTE(review): the max_depth parameter of this function is unused;
        # the depth is taken from full_selection._max_depth instead.
        max_depth=full_selection._max_depth,
        instance_weight=instance_weight_fn,
        **classifier_params)
    search = GridSearchCV(estimator=base_estimator,
                          param_grid=param_grid,
                          cv=cross_validation(classes),
                          scoring=evaluation_measure)
    search.fit(topics, classes)
    return search
Beispiel #9
0
from os.path import isfile, join
# df = gd.GetData(gd.f)


# Script fragment: accumulate cross-validation errors for seven predictors
# over a fixed list of CSV files, then print the per-predictor average.
MSEId, MSERegr, MSERegr_ey, MSE_log, MSE1NN, MSE2NN, MSE3NN = 0, 0, 0, 0, 0, 0, 0
MSE = [0, 0, 0, 0, 0, 0, 0]  # running sums, one slot per predictor
path = 'data_all/'
#onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
#path = ''
onlyfiles1 = ['aaa', 'data_test.csv']

# pilotaz1.csv ... pilotaz5.csv
onlyfiles = ['pilotaz'+str(i+1)+'.csv' for i in range(5)]
print(onlyfiles)

for file in onlyfiles:
	DF = gd.get_data(path+file)

	# Add this file's cross-validation results element-wise into MSE.
	CV = cross_validation((DF['stimulus']), (DF['converted']))
	MSE2 = MSE
	MSE = [MSE2[i] + CV[i] for i in range(len(CV))]

l = len(onlyfiles)
msn_labels = ['MSEId:', 'MSERegr:', 'MSERegr_ey:', 'MSE_log:', 'MSE1NN:', 'MSE2NN:', 'MSE3NN:']
for index, mse in enumerate(MSE):
	# average over the number of files
	print(msn_labels[index] + " ", mse/l)



	# print(cross_validation((df['stimulus']), (df['converted'])))

Beispiel #10
0
	# Fragment of a timing/benchmark routine (the def line, x_train/x_test,
	# loop_dist and the earlier timestamps are defined above this excerpt).
	# Times the vectorized distance computation and checks it matches the
	# loop-based version, then evaluates NN/k-NN classifiers.
	t_2 = t.time()
	vec_dist = dist_vec(x_train,x_test)
	t_3 = t.time()
	t_vec = t_3 - t_2
	print "Calculate distance matrix with vectorization in", t_vec, "s"

	# assert that both method yield the same results
	assert(vec_dist.shape == loop_dist.shape)
	assert( np.array_equal(vec_dist,loop_dist) )
	print "Both methods yield the same result."

	print "Nearest Neighbor Classifier:"

	labels = (1,3)
	error_rate = test_classifier(nn_classifier, data, target, labels)
	print "Classification rate for distinguishing 1 and 3:", error_rate

	labels = (1,7)
	error_rate = test_classifier(nn_classifier, data, target, labels)
	print "Classification rate for distinguishing 1 and 7:", error_rate

	print "k-Nearest Neighbor Classifier"
	for k in (1,3,5,9,17,33):
		# NOTE(review): k is never passed to test_classifier, so every
		# iteration evaluates the same configuration -- looks like a bug.
		error_rate = test_classifier(knn_classifier, data, target, labels)
		print "k =", k, ": Classification rate for distinguishing 1 and 7:", error_rate 

	print "Cross validation"
	for n in (2,5,10):
		mean, var = cross_validation(data, target, n)
		print "For n =", n, ": mean classifcation rate =", mean, "with variance = ", var
Beispiel #11
0
    # Branch body of a CLI script (the matching `if` that checks the
    # argument count lies above this fragment): load two labeled JSON
    # files, vectorize the features, and run cross validation.
    training_data = Preparation.read_in(sys.argv[1], sys.argv[2])

    random.shuffle(training_data)

    # Unzip labels and texts into separate lists:
    labels, feature_vectors = zip(*training_data)

    vectorizer = DictVectorizer()

    X = vectorizer.fit_transform(feature_vectors)
    y = labels

    # Train a classifier
    print("Starting the cross-validation...")
    cross_validation()

    print("Done!")

####################################################################
#     Print Instructions                                           #
####################################################################

else:

    # Wrong/missing command-line arguments: print usage instructions.
    print("\nUSAGE:\n")

    print("python3 cross_validation.py spam_data.json ham_data.json \n")
    print("spam_data.json: labeled spam data in JSON format.")
    print("ham_data.json: labeled ham data in JSON format.\n\n")
Beispiel #12
0
        # Interior of a k-fold loop (fold index `i`); X, y, method,
        # best_score and the kf_*_valid fold arrays are defined above
        # this fragment.
        X_valid = np.transpose([kf_A_valid[i], kf_B_valid[i]])
        y_valid = np.array(kf_L_valid[i])

        # `method` is expected to return a fitted estimator with .score()
        result = method(X, y)
        current_score = result.score(X_valid, y_valid)

        # Print the score for this fold
        print "Validation score: ", current_score

        # Store the best result and best score
        # We will use this fold and test data to obtain the test score
        if current_score > best_score:
            best_result = result
            best_score = current_score

    print "Best Score:", best_score

    # Plot the boundary with best result
    plot_boundary(best_result, X, y)
    print "Test score: ", best_result.score(X_test, y_test)


C = 2.0  # SVM regularization parameter
degree = 6  # Polynomial degree
# Run the cross-validation routine above for each candidate model.
cross_validation(rbf_svc)
cross_validation(lin_svc)
cross_validation(poly_svc)
cross_validation(logistic_regression)
cross_validation(random_forest_classifier)
cross_validation(perceptron)
Beispiel #13
0
## use above function: show 40 face images with their target labels
print_faces(faces.images, faces.target, 40)

#### Split data (or holdout): 75% train / 25% test
x_train, x_test, y_train, y_test = train_test_split(faces.data, faces.target, test_size=0.25, random_state=0) ## Holdout

#### Prepare classifier: linear-kernel SVM
clf = svm.SVC(kernel='linear')

def cross_validation(clf, x, y, k):
    """Run k-fold cross validation on clf and print the per-fold scores.

    Uses the estimator's default .score() metric (accuracy for
    classifiers); folds are shuffled with a fixed random seed.
    """
    folds = KFold(len(y), k, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, x, y, cv=folds)
    print(fold_scores)

# 5-fold cross validation of the SVM on the training split
cross_validation(clf, x_train, y_train, 5)

#### Train and Evaluate
from sklearn import metrics
def train_and_evaluate(clf, x_train, x_test, y_train, y_test):
    """Fit clf on the training split and print evaluation metrics.

    NOTE(review): this fragment appears truncated -- nothing is printed
    after the "Confusion Matrix:" header.
    """

    ## Train
    clf.fit(x_train, y_train)
    print("Accuracy on training set:")
    print(clf.score(x_train, y_train))

    ## Evaluate
    y_pred = clf.predict(x_test)
    print("Classification Report:")
    print(metrics.classification_report(y_test, y_pred))
    print("Confusion Matrix:")
# Experiment: linear SVM on ternary labels derived from 100-tick log returns.
print """
Linear SVM, 3-class 5-fold CV, no-class weights
"""
thresh = 0.000005/2  # label threshold on log_returns_100+
print "thresh = {}".format(thresh)
hl = 100
K = 5  # number of CV folds

# Ternary labels: +1 above +thresh, -1 below -thresh, otherwise 0.
quotes['label'] = 0
quotes.ix[quotes['log_returns_100+'] > thresh, 'label'] = 1
quotes.ix[quotes['log_returns_100+'] < -thresh, 'label'] = -1

#clf = LRclf(thresh)

# NOTE(review): DataFrame.ix and class_weight='auto' are deprecated in
# modern pandas/scikit-learn (use .loc and class_weight='balanced').
clf = svm.LinearSVC(class_weight='auto')
cv_results = cross_validation(quotes, clf, feature_names, label='label', K=5)
y = quotes['label'].values
#reg = sm.OLS(y, quotes[feature_names]).fit()
#print reg.summary()
clf_output(cv_results, y, K, feature_names)


# Second experiment: regression-based classifier with a larger threshold.
print """
Linear Regression Classifier, 3-class 5-fold CV, no-class weights
"""
thresh = 0.000005*100
print "thresh = {}".format(thresh)
hl = 100
K = 5

quotes['label'] = 0
Beispiel #15
0
def main():
    """Entry point: run 10-fold cross validation over the enron2 corpus."""
    # save_likelihood("likelihood")
    cross_validation("data/enron2/", k=10, verbose=True)
Beispiel #16
0
    # Fragment of a timing/benchmark routine (the def line, x_train/x_test,
    # loop_dist and the earlier timestamps are defined above this excerpt).
    # Times the vectorized distance computation and checks it matches the
    # loop-based version, then evaluates NN/k-NN classifiers.
    t_2 = t.time()
    vec_dist = dist_vec(x_train, x_test)
    t_3 = t.time()
    t_vec = t_3 - t_2
    print "Calculate distance matrix with vectorization in", t_vec, "s"

    # assert that both method yield the same results
    assert (vec_dist.shape == loop_dist.shape)
    assert (np.array_equal(vec_dist, loop_dist))
    print "Both methods yield the same result."

    print "Nearest Neighbor Classifier:"

    labels = (1, 3)
    error_rate = test_classifier(nn_classifier, data, target, labels)
    print "Classification rate for distinguishing 1 and 3:", error_rate

    labels = (1, 7)
    error_rate = test_classifier(nn_classifier, data, target, labels)
    print "Classification rate for distinguishing 1 and 7:", error_rate

    print "k-Nearest Neighbor Classifier"
    for k in (1, 3, 5, 9, 17, 33):
        # NOTE(review): k is never passed to test_classifier, so every
        # iteration evaluates the same configuration -- looks like a bug.
        error_rate = test_classifier(knn_classifier, data, target, labels)
        print "k =", k, ": Classification rate for distinguishing 1 and 7:", error_rate

    print "Cross validation"
    for n in (2, 5, 10):
        mean, var = cross_validation(data, target, n)
        print "For n =", n, ": mean classifcation rate =", mean, "with variance = ", var
Beispiel #17
0
from os import listdir
from os.path import isfile, join
# df = gd.GetData(gd.f)

# Script fragment: accumulate cross-validation errors for seven predictors
# over a fixed list of CSV files, then print the per-predictor average.
MSEId, MSERegr, MSERegr_ey, MSE_log, MSE1NN, MSE2NN, MSE3NN = 0, 0, 0, 0, 0, 0, 0
MSE = [0, 0, 0, 0, 0, 0, 0]  # running sums, one slot per predictor
path = 'data_all/'
#onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
#path = ''
onlyfiles1 = ['aaa', 'data_test.csv']

# pilotaz1.csv ... pilotaz5.csv
onlyfiles = ['pilotaz' + str(i + 1) + '.csv' for i in range(5)]
print(onlyfiles)

for file in onlyfiles:
    DF = gd.get_data(path + file)

    # Add this file's cross-validation results element-wise into MSE.
    CV = cross_validation((DF['stimulus']), (DF['converted']))
    MSE2 = MSE
    MSE = [MSE2[i] + CV[i] for i in range(len(CV))]

l = len(onlyfiles)
msn_labels = [
    'MSEId:', 'MSERegr:', 'MSERegr_ey:', 'MSE_log:', 'MSE1NN:', 'MSE2NN:',
    'MSE3NN:'
]
for index, mse in enumerate(MSE):
    # average over the number of files
    print(msn_labels[index] + " ", mse / l)

    # print(cross_validation((df['stimulus']), (df['converted'])))
Beispiel #18
0
c = 1  # SVM regularization parameter C

#run 2
thresh = 0.000005
hl = 100
K = 5  # number of CV folds

# Ternary labels from pred_col: +1 / -1 beyond +/-thresh, otherwise 0.
filtered_data['label'] = 0
filtered_data.loc[filtered_data[pred_col] > thresh, 'label'] = 1
filtered_data.loc[filtered_data[pred_col] < -thresh, 'label'] = -1

#clf = LRclf(thresh)

# NOTE(review): class_weight='auto' is deprecated in modern scikit-learn
# (use class_weight='balanced').
clf = svm.LinearSVC(C=c, class_weight='auto')
cv_results = cross_validation(filtered_data, clf, feature_names, label='label', K=5)
y = filtered_data['label'].values
#reg = sm.OLS(y, quotes[feature_names]).fit()
#print reg.summary()

print """
Filtered-data Linear SVM, 3-class 5-fold CV, auto class weights
"""
print "crossover_hls = {}".format(crossover_hls)
print "Pred_col = {}".format(pred_col)
print "thresh = {}".format(thresh)
print "C = {}".format(c)
clf_output(cv_results, y, K)


#run 3
    # NOTE(review): dangling fragment -- this indented loop has no enclosing
    # function or loop in view, so it cannot parse at module level as-is.
    # It prints one labelled error value per predictor.
    for i, mse in enumerate(
        [MSEId, MSERegr, MSERegr_ey, MSERegr_log, MSE1NN, MSE2NN, MSE3NN]):
        print(names[i] + ":  ", mse)


N = 31
path = 'data_all/'

# Load the pre-pickled list of datasets.
with open(path + "data_all.pkl", 'rb') as f:
    data_all = pickle.load(f)

# Pool the per-dataset cross-validation outputs into flat lists.
allX, allId, allY, allRegr, allRegr_ey, allRegr_log, all1NN, all2NN, all3NN = [], [], [], [], [], [], [], [], []

for data in data_all:

    resultsX, resultsId, resultsY, resultsRegr, resultsRegr_ey, resultsRegr_log, results1NN, results2NN, results3NN = cross_validation(
        data['stimulus'], data['converted'])
    allX += resultsX
    allId += resultsId
    allY += resultsY
    allRegr += resultsRegr
    allRegr_ey += resultsRegr_ey
    allRegr_log += resultsRegr_log
    all1NN += results1NN
    all2NN += results2NN
    all3NN += results3NN

# Error of each predictor against the pooled targets via the mse() helper
# (presumably mean squared error -- confirm against its definition).
MSEId, MSERegr, MSERegr_ey, MSERegr_log, MSE1NN, MSE2NN, MSE3NN = mse(
    allX, allId), mse(allY, allRegr), mse(allY, allRegr_ey), mse(
        allY, allRegr_log), mse(allY, all1NN), mse(allY,
                                                   all2NN), mse(allY, all3NN)