def main():
    """Evaluate the pickled model ``mlp_model.pkl`` on the test split.

    Loads the ``test`` set through ``prep_dataset``, runs ``predict`` with
    the path of the saved model, prints the overall accuracy and displays a
    row-normalized confusion matrix over the tag strings.
    """
    DATASET_LOCATION = '../datasets/'
    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)
    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)
    CUTOFF = 10000

    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "mlp_model.pkl"
    # NOTE: `classifier` is the path to the saved model, not a model object;
    # `predict` receives the path.
    classifier = os.path.join(MODELS_PATH, MODEL_NAME)

    test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                which_sets=['test'], cutoff=CUTOFF,
                                cast_y=False)
    test_X, test_y = test_dataset
    # Both halves expose `.get_value()` (presumably shared variables --
    # confirm against prep_dataset); labels are cast to int32 so they can be
    # used as keys into the idx2tag mapping below.
    test_y = test_y.get_value().astype('int32')

    predictions = predict(classifier, test_X.get_value())

    # Indexes for mapping tags <--> ints.
    CORPUS_INDICES = 'brown_pos_dataset.indices'
    # Pickle files must be opened in binary mode: text mode corrupts binary
    # pickle protocols on Windows and is rejected outright on Python 3.
    with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES), 'rb') as indices_file:
        corpus_indices = cPickle.load(indices_file)

    # map tag ids back to strings
    idx2tag = corpus_indices['idx2tag']
    y_test_actual = [idx2tag[tag_idx] for tag_idx in test_y]
    y_test_hat = [idx2tag[tag_idx] for tag_idx in predictions]

    # Quick evaluation: fraction of predictions that match the gold tag ids.
    acc = sum(y == p for y, p in zip(predictions, test_y)) / float(len(predictions))
    print("ACC: {}".format(acc))

    # Class names are taken from the gold labels only, so every row of the
    # confusion matrix has at least one sample. Sorting gives the matrix a
    # stable, reproducible axis order instead of arbitrary set order.
    class_names = sorted(set(y_test_actual))

    # Compute confusion matrix
    cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names)

    # Normalize the confusion matrix by row (i.e. by the number of samples
    # in each class)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure()
    plot_confusion_matrix(cm_normalized, class_names,
                          title='Normalized confusion matrix')
    plt.show()
def main():
    """Evaluate the pickled model ``logistic_regression_model.pkl`` on the test split.

    Loads the ``test`` set through ``prep_dataset``, runs ``predict`` with
    the path of the saved model, prints the overall accuracy, displays a
    row-normalized confusion matrix over the tag strings, and finally prints
    one F1 score per tag.
    """
    CUTOFF = 100000

    DATASET_LOCATION = "../datasets/"
    POS_DATASET_NAME = "brown_pos_dataset.hdf5"
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)
    WORD_BY_WORD_MATRIX = "brown.word-by-word.normalized.npy"
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)

    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "logistic_regression_model.pkl"
    # NOTE: `classifier` is the path to the saved model, not a model object;
    # `predict` receives the path.
    classifier = os.path.join(MODELS_PATH, MODEL_NAME)

    test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                which_sets=["test"], cutoff=CUTOFF,
                                cast_y=False)
    test_X, test_y = test_dataset
    # Both halves expose `.get_value()` (presumably shared variables --
    # confirm against prep_dataset); labels are cast to int32 so they can be
    # used as keys into the idx2tag mapping below.
    test_y = test_y.get_value().astype("int32")

    predictions = predict(classifier, test_X.get_value())

    # Indexes for mapping words and tags <--> ints
    CORPUS_INDICES = "brown_pos_dataset.indices"
    # Pickle files must be opened in binary mode: text mode corrupts binary
    # pickle protocols on Windows and is rejected outright on Python 3.
    with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES), "rb") as indices_file:
        corpus_indices = cPickle.load(indices_file)

    # map tag ids back to strings
    idx2tag = corpus_indices["idx2tag"]
    y_test_actual = [idx2tag[tag_idx] for tag_idx in test_y]
    y_test_hat = [idx2tag[tag_idx] for tag_idx in predictions]

    # quick check of our accuracy on the test dataset
    acc = sum(y == p for y, p in zip(predictions, test_y)) / float(len(predictions))
    print("ACC: {}".format(acc))

    # Class names are taken from the gold labels only, so every row of the
    # confusion matrix has at least one sample. Sorting gives the matrix and
    # the F1 listing a stable, reproducible order instead of set order.
    class_names = sorted(set(y_test_actual))

    # Compute confusion matrix
    cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names)
    # Normalize each row by the number of samples in that class.
    cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    plt.figure()
    plot_confusion_matrix(cm_normalized, class_names,
                          title="Normalized confusion matrix")
    plt.show()

    # average=None returns one score per entry of `labels`, in that order,
    # so the scores line up with class_names.
    f1_scores = sklearn.metrics.f1_score(y_test_actual, y_test_hat,
                                         labels=class_names, average=None)
    for class_name, score in zip(class_names, f1_scores):
        print("{}:\t{}".format(class_name, score))