import json
import time

import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline


def runCrossValidationTest(classifier_name, classifier_args=None, ngram=2, folds=5):
    """Evaluate on a single 75/25 hold-out split and report micro-averaged scores.

    Despite the name, this performs one train/test split, not k-fold
    cross-validation (the folds argument is unused here); see runTest()
    for the k-fold version.
    """
    if classifier_args is None:
        classifier_args = {}
    classifier = valid_classifiers[classifier_name](**classifier_args)
    X, y = load_non_preprocessed_data()

    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        ('Classifier', classifier),
    ])

    # train_test_split moved to sklearn.model_selection; the old
    # sklearn.cross_validation module was removed in scikit-learn 0.20.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0)

    ml_pipeline.fit(X_train, y_train)
    predictions = ml_pipeline.predict(X_test)

    confusion = confusion_matrix(y_test, predictions)
    # pos_label is irrelevant under micro averaging, so it is dropped here.
    f1 = f1_score(y_test, predictions, average='micro')
    precision = precision_score(y_test, predictions, average='micro')
    recall = recall_score(y_test, predictions, average='micro')

    print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print("F1 score: " + str(f1))
    print("precision score: " + str(precision))
    print("recall score: " + str(recall))
    print(confusion)
    numpy.savetxt("data/test_results_confusion_matrix_" + classifier_name + ".csv",
                  confusion, delimiter=",")
    return (f1, precision, recall)
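# The test harnesses and entry points in this file assume two module-level
# helpers that never appear here: a valid_classifiers registry mapping names
# to classifier classes, and load_non_preprocessed_data() returning parallel
# lists of review texts and integer sentiment labels (main() additionally
# expects a load_preprocessed_data() counterpart, not sketched). A minimal
# sketch follows, assuming LinearSVC/SGD/k-NN as candidate models and a JSON
# training file; the classifier choices, key names, and path are assumptions,
# not the project's actual definitions.

from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

valid_classifiers = {
    'svm': LinearSVC,       # hypothetical registry entries
    'sgd': SGDClassifier,
    'knn': KNeighborsClassifier,
}


def load_non_preprocessed_data():
    # Hypothetical loader: returns raw review texts and 0/1/2 sentiment labels.
    with open('data/training_data.json') as f:  # assumed path
        records = json.load(f)
    X = [r['review'] for r in records]
    y = [r['sentiment'] for r in records]
    return X, y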
def runTest(classifier_name, classifier_args=None, ngram=2, folds=5):
    """Run k-fold cross-validation and report micro-averaged scores."""
    print()
    print('running test')
    if classifier_args is None:
        classifier_args = {}
    classifier = valid_classifiers[classifier_name](**classifier_args)
    X, y = load_non_preprocessed_data()

    # KFold now takes n_splits and yields indices via .split(); the old
    # KFold(n=..., n_folds=...) constructor was removed in scikit-learn 0.20.
    kfold = KFold(n_splits=folds)
    print(kfold)

    f1_scores = []
    precision_scores = []
    recall_scores = []
    # Running total over folds; the data set has three sentiment classes,
    # hence the 3x3 matrix.
    confusion = numpy.zeros((3, 3), dtype=int)

    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        ('Classifier', classifier),
    ])

    for train_i, test_i in kfold.split(X):
        print(".")
        X_train = [X[i] for i in train_i]
        y_train = [y[i] for i in train_i]
        X_test = [X[i] for i in test_i]
        y_test = [y[i] for i in test_i]

        ml_pipeline.fit(X_train, y_train)
        predictions = ml_pipeline.predict(X_test)

        # Pin the label order so every fold yields a 3x3 matrix even when a
        # class happens to be absent from that fold's test split.
        confusion += confusion_matrix(y_test, predictions, labels=[0, 1, 2])
        f1_scores.append(f1_score(y_test, predictions, average='micro'))
        precision_scores.append(precision_score(y_test, predictions, average='micro'))
        recall_scores.append(recall_score(y_test, predictions, average='micro'))

    average_f1_score = sum(f1_scores) / len(f1_scores)
    average_precision_score = sum(precision_scores) / len(precision_scores)
    average_recall_score = sum(recall_scores) / len(recall_scores)

    print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print("F1 score: " + str(average_f1_score))
    print("precision score: " + str(average_precision_score))
    print("recall score: " + str(average_recall_score))
    print(confusion)
    numpy.savetxt("data/test_results_confusion_matrix.csv", confusion, delimiter=",")
    return (average_f1_score, average_precision_score, average_recall_score)
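# Aside: current scikit-learn can collapse the manual fold loop above into a
# single cross_validate() call. The sketch below is an equivalent formulation
# of runTest()'s scoring only (it does not accumulate the confusion matrix or
# write the CSV); runTestCompact is a name invented here, not the project's.

from sklearn.model_selection import cross_validate


def runTestCompact(classifier_name, classifier_args=None, ngram=2, folds=5):
    if classifier_args is None:
        classifier_args = {}
    classifier = valid_classifiers[classifier_name](**classifier_args)
    X, y = load_non_preprocessed_data()
    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        ('Classifier', classifier),
    ])
    scores = cross_validate(ml_pipeline, X, y, cv=KFold(n_splits=folds),
                            scoring=['f1_micro', 'precision_micro', 'recall_micro'])
    return (scores['test_f1_micro'].mean(),
            scores['test_precision_micro'].mean(),
            scores['test_recall_micro'].mean())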
def main(classifier_name, classifier_args=None, ngram=2, folds=5,
         preprocessed=False, preprocess_records=None):
    """Grid-search hyper-parameters for the chosen classifier over tf-idf features."""
    if preprocess_records:
        X, y = preprocess_records
    elif preprocessed:
        X, y = load_preprocessed_data()
    else:
        X, y = load_non_preprocessed_data()

    # StratifiedKFold keeps the class proportions in every fold, so no
    # unfortunate data split can starve a fold of a class. It is passed to
    # GridSearchCV below as the cv argument (the old StratifiedKFold(y, folds)
    # constructor is gone, and the splitter was previously created but unused).
    skf = StratifiedKFold(n_splits=folds)

    ###############################
    # Training and testing models #
    ###############################
    print()
    print('training classifier')
    if classifier_args is None:
        classifier_args = {}
    classifier = valid_classifiers[classifier_name](**classifier_args)

    # Hyper-parameter grid; uncomment the entries relevant to the chosen
    # classifier (see the usage sketch after this function).
    params = {
        # "tfidf__ngram_range": [(1, 2)],
        # "Classifier__class_weight": [{0: 1, 1: 100, 2: 1}, {0: 1, 1: 1, 2: 1}],
        # "Classifier__C": [.01, .1, 1, 10, 100],
        # "Classifier__kernel": ['rbf', 'linear', 'poly', 'sigmoid'],
        # "Classifier__penalty": ['l1', 'l2', 'elasticnet'],
        # "Classifier__loss": ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        # "Classifier__n_neighbors": [3, 5, 7, 11],
        # "Classifier__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }

    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        # ('Vectorization', CountVectorizer(binary='false')),
        # ('Feature Refinement', TfidfTransformer(use_idf=False)),
        # ('Feature Selection', SelectKBest(chi2, 1000)),
        # Project-specific transformer, assumed to be defined elsewhere.
        ('Feature Reduction', ClassifierOvOFeaturesReduction()),
        ('Classifier', classifier),
    ])

    # f1_scorer = make_scorer(f1_score)
    gs = GridSearchCV(ml_pipeline, params, cv=skf, verbose=2, n_jobs=-1)
    gs.fit(X, y)
    # print(gs.best_params_)
    print(gs.best_score_)
    print('>>>>>>>>>>')
    # print(gs.cv_results_)  # grid_scores_ was renamed to cv_results_ in 0.18
    return gs.best_score_
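# Usage sketch for the grid search above. GridSearchCV keys follow the
# pipeline's step names: "Classifier__C" means "the C parameter of the step
# named 'Classifier'". Uncomment only the entries that exist on the chosen
# estimator -- e.g. Classifier__C / Classifier__kernel for an SVC, or
# Classifier__n_neighbors / Classifier__algorithm for k-NN -- since a key the
# estimator lacks makes GridSearchCV raise an error, which is presumably why
# the template ships fully commented out. The 'svm' name below comes from the
# hypothetical valid_classifiers sketch near the top of this file.
#
#     main('svm', ngram=2, folds=5)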
def label_reviews(classifier_name, classifier_args=None, ngram=2):
    """Fit on the full data set, then sentiment-label every review in the dump.

    Renamed from a second main() so it no longer shadows the grid-search
    entry point above.
    """
    X, y = load_non_preprocessed_data()

    ###############################
    # Training and testing models #
    ###############################
    print()
    print('training classifier')
    if classifier_args is None:
        classifier_args = {}
    classifier = valid_classifiers[classifier_name](**classifier_args)

    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        ('Classifier', classifier),
    ])
    ml_pipeline.fit(X, y)

    print('labeling data')
    with open('data/tomato_db.json') as data_file:
        data = json.load(data_file)
    reviews = data["reviews"]

    total_time = 0
    for review in reviews:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended replacement for timing short spans.
        start_time = time.perf_counter()
        # .item() converts the numpy scalar to a plain int so json can
        # serialize it (numpy.asscalar was removed in NumPy 1.23).
        review['sentiment'] = ml_pipeline.predict([review['review']])[0].item()
        total_time += time.perf_counter() - start_time

    with open('data/tomato_db_labeled.json', 'w', encoding='UTF-8') as f:
        f.write(json.dumps(data, indent=4))

    print("Time taken per record: %f" % (total_time / len(reviews)))
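# A possible entry point tying the pieces together. The 'svm' classifier name
# depends on the hypothetical valid_classifiers registry sketched near the top
# of this file and is an assumption, not the project's actual configuration.
if __name__ == '__main__':
    runTest('svm', ngram=2, folds=5)   # k-fold evaluation on the training data
    label_reviews('svm', ngram=2)      # sentiment-label the Rotten Tomatoes dump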