def main():
    """Train a logistic-regression POS classifier on the Brown windows dataset.

    Loads the 'train' and 'dev' splits, compiles the training/validation
    functions via ``initialize_logistic_regression``, runs training, and
    pickles the resulting classifier to disk.
    """
    # Input dataset paths: POS windows + normalized word-by-word vectors.
    DATASET_LOCATION = '../datasets/'
    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)
    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)

    # Cap on the number of instances taken from each split.
    CUTOFF = 100000

    # Where the trained model will be serialized.
    MODELS_PATH = "../trained_models/"
    MODEL_NAME = "logistic_regression_model.pkl"
    save_path = os.path.join(MODELS_PATH, MODEL_NAME)

    # Build the train and dev datasets.
    train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                 which_sets=['train'], cutoff=CUTOFF)
    dev_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                               which_sets=['dev'], cutoff=CUTOFF)

    # Get the functions and params that we need for our models.
    initialization_data = initialize_logistic_regression(
        train_dataset, dev_dataset, learning_rate=0.1, batch_size=100)
    (classifier, train_model_func, validate_model_func,
     n_train_batches, n_valid_batches) = initialization_data

    # Run the training loop, validating as we go.
    train_model(train_model_func, n_train_batches,
                validate_model=validate_model_func,
                n_valid_batches=n_valid_batches,
                training_epochs=200)

    # Persist the trained classifier.
    save_model(classifier, save_path)
def main():
    """Train a 2-hidden-unit denoising autoencoder on the POS windows dataset
    and plot the learned 2-d codes for the first 1000 training instances,
    colored by (normalized) tag id, with a little Gaussian jitter so that
    identical codes remain visible.
    """
    DATASET_LOCATION = '../../datasets/'
    # the pos dataset consists of windows around words
    POS_DATASET_NAME = 'brown_pos_dataset.hdf5'
    POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME)
    CORPUS_INDICES = 'brown_pos_dataset.indices'  # NOTE(review): unused in this function
    WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy'

    # load the training data
    VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX)
    CUTOFF = 10000
    train_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                                 which_sets=['train'], cutoff=CUTOFF)

    # compile the training/validation functions for the dA
    initialization_data = initialize_dA(train_dataset, learning_rate=0.1,
                                        corruption_level=0.3, batch_size=50,
                                        n_hidden=2)
    (classifier, train_model_func, validate_model_func,
     n_train_batches, n_valid_batches) = initialization_data

    train_model(train_model_func, n_train_batches,
                validate_model=validate_model_func,
                n_valid_batches=n_valid_batches,
                training_epochs=10)

    # make a theano function to get predictions from a trained model
    training_data = theano.tensor.matrix('training_X')
    predictions = classifier.predict(training_data)
    get_predictions = theano.function([training_data], predictions)

    # get predictions and evaluate
    p = get_predictions(train_dataset[0].get_value())

    # get train_y without the cast
    train_y = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH,
                           which_sets=['train'], cast_y=False,
                           cutoff=CUTOFF)[1].get_value().astype('int32')

    # take the first 1000 instances and map tag ids into [0, 1] for coloring
    CUTOFF_BEGIN = 0
    CUTOFF_END = 1000
    y_vals = train_y[CUTOFF_BEGIN:CUTOFF_END]
    norm_y_vals = y_vals / float(np.amax(y_vals))

    # small Gaussian jitter on each axis so overlapping points stay visible
    num_points = CUTOFF_END - CUTOFF_BEGIN
    jitter1 = np.random.normal(loc=0.0, scale=0.05, size=num_points)
    jitter2 = np.random.normal(loc=0.0, scale=0.05, size=num_points)
    x1 = p[CUTOFF_BEGIN:CUTOFF_END, 0] + jitter1
    x2 = p[CUTOFF_BEGIN:CUTOFF_END, 1] + jitter2

    plt.scatter(x1, x2, c=norm_y_vals, s=20)
    plt.show()
def main(): """ main """ DATASET_LOCATION = '../datasets/' POS_DATASET_NAME = 'brown_pos_dataset.hdf5' POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME) WORD_BY_WORD_MATRIX = 'brown.word-by-word.normalized.npy' VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX) CUTOFF = 10000 MODELS_PATH = "../trained_models/" MODEL_NAME = "mlp_model.pkl" classifier = os.path.join(MODELS_PATH, MODEL_NAME) test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH, which_sets=['test'], cutoff=CUTOFF, cast_y=False) test_X, test_y = test_dataset test_y = test_y.get_value().astype('int32') predictions = predict(classifier, test_X.get_value()) CORPUS_INDICES = 'brown_pos_dataset.indices' with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES)) as indices_file: corpus_indices = cPickle.load(indices_file) # map tag ids back to strings y_test_actual = [corpus_indices['idx2tag'][tag_idx] for tag_idx in test_y] y_test_hat = [corpus_indices['idx2tag'][tag_idx] for tag_idx in predictions] # Quick Evaluation acc = sum([y==p for y,p in zip(predictions, test_y)]) / float(len(predictions)) print "ACC: {}".format((acc)) # get class names class_names = list(set(y_test_actual)) # Compute confusion matrix cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names) # Normalize the confusion matrix by row (i.e by the number of samples in each class) cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] plt.figure() plot_confusion_matrix(cm_normalized, class_names, title='Normalized confusion matrix') plt.show()
def main(): CUTOFF = 100000 DATASET_LOCATION = "../datasets/" POS_DATASET_NAME = "brown_pos_dataset.hdf5" POS_DATASET_PATH = os.path.join(DATASET_LOCATION, POS_DATASET_NAME) WORD_BY_WORD_MATRIX = "brown.word-by-word.normalized.npy" VECTOR_INDEX_PATH = os.path.join(DATASET_LOCATION, WORD_BY_WORD_MATRIX) MODELS_PATH = "../trained_models/" MODEL_NAME = "logistic_regression_model.pkl" classifier = os.path.join(MODELS_PATH, MODEL_NAME) test_dataset = prep_dataset(POS_DATASET_PATH, VECTOR_INDEX_PATH, which_sets=["test"], cutoff=CUTOFF, cast_y=False) test_X, test_y = test_dataset test_y = test_y.get_value().astype("int32") predictions = predict(classifier, test_X.get_value()) CORPUS_INDICES = "brown_pos_dataset.indices" # Indexes for mapping words and tags <--> ints with open(os.path.join(DATASET_LOCATION, CORPUS_INDICES)) as indices_file: corpus_indices = cPickle.load(indices_file) # map tag ids back to strings y_test_actual = [corpus_indices["idx2tag"][tag_idx] for tag_idx in test_y] y_test_hat = [corpus_indices["idx2tag"][tag_idx] for tag_idx in predictions] # quick check of our accuracy on the test dataset acc = sum([y == p for y, p in zip(predictions, test_y)]) / float(len(predictions)) print "ACC: {}".format(acc) # get class names class_names = list(set(y_test_actual)) # Compute confusion matrix cm = confusion_matrix(y_test_actual, y_test_hat, labels=class_names) cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] plt.figure() plot_confusion_matrix(cm_normalized, class_names, title="Normalized confusion matrix") plt.show() f1_scores = sklearn.metrics.f1_score(y_test_actual, y_test_hat, labels=class_names, average=None) for class_name, score in zip(class_names, f1_scores): print "{}:\t{}".format(class_name, score)