def max_credit_model():
    '''
    Computes the accuracy of a "max credit" model where each statement is
    labeled with the most frequent label of the previous statements by the
    speaker.
    '''
    print("NUM CORRECT", "\t", "ACCURACY")

    train_labels, _, _, _, train_credit = get_data(var.TRAINING_DATA_PATH)
    val_labels, _, _, _, val_credit = get_data(var.VALIDATION_DATA_PATH)
    test_labels, _, _, _, test_credit = get_data(var.TEST_DATA_PATH)

    # 9: barely true counts.
    # 10: false counts.
    # 11: half true counts.
    # 12: mostly true counts.
    # 13: pants on fire counts.
    credit_mapping = [
        "barely-true", "false", "half-true", "mostly-true", "pants-fire"
    ]

    train_correct = 0.0
    train_credit = clean_credit(train_labels, train_credit)
    for i in range(len(train_labels)):
        max_credit_index = train_credit[i].index(max(train_credit[i]))
        if credit_mapping[max_credit_index] == train_labels[i]:
            train_correct += 1.0
    print(train_correct, "\t\t", train_correct / len(train_labels),
          "\tTraining Data")

    val_correct = 0.0
    val_credit = clean_credit(val_labels, val_credit)
    for i in range(len(val_labels)):
        max_credit_index = val_credit[i].index(max(val_credit[i]))
        if credit_mapping[max_credit_index] == val_labels[i]:
            val_correct += 1.0
    print(val_correct, "\t\t", val_correct / len(val_labels),
          "\tValidation Data")

    test_correct = 0.0
    test_credit = clean_credit(test_labels, test_credit)
    for i in range(len(test_labels)):
        max_credit_index = test_credit[i].index(max(test_credit[i]))
        if credit_mapping[max_credit_index] == test_labels[i]:
            test_correct += 1.0
    print(test_correct, "\t\t", test_correct / len(test_labels),
          "\tTest Data")
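The three per-split loops above differ only in the data they touch. A minimal refactoring sketch (the helper name _split_accuracy is hypothetical; it reuses clean_credit and credit_mapping from above):

def _split_accuracy(labels, credit, credit_mapping):
    # Hypothetical helper: accuracy of predicting each statement's label as
    # the label with the highest (cleaned) credit-history count for its speaker.
    credit = clean_credit(labels, credit)
    correct = sum(
        1.0 for label, counts in zip(labels, credit)
        if credit_mapping[counts.index(max(counts))] == label)
    return correct, correct / len(labels)

# Example usage (mirrors the training-split block above):
# train_correct, train_acc = _split_accuracy(train_labels, train_credit, credit_mapping)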
def main():
    initialize()
    data = parse_data.get_data()

    all_predictions = []
    all_gold = []
    for target in data:
        if config.verbose:
            print target

        train_X, train_Y, feat_dict = create_feature_matrix(
            data[target]["train"], feat_dict=None)
        tune_X, tune_Y, _ = create_feature_matrix(
            data[target]["tune"], feat_dict=feat_dict)
        test_X, test_Y, _ = create_feature_matrix(
            data[target]["test"], feat_dict=feat_dict)

        model = train_model(train_X, train_Y, tune_X, tune_Y)
        predictions = model.predict(test_X)
        _, _, f1 = compute_f1(predictions, test_Y)
        if config.verbose:
            print "\t", "F1:", round(f1 * 100, 2)

        all_predictions += list(predictions)
        all_gold += list(test_Y)

    f1_against, f1_favor, f1_overall = compute_f1(all_predictions, all_gold)
    print "F1 Against:", round(f1_against * 100, 2)
    print "F1 Favor:", round(f1_favor * 100, 2)
    print "---> Overall F1:", round(f1_overall * 100, 2)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from parse_data import get_data
from draw_confusion_matrix import plot_confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
import time
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, DistanceMetric
from sklearn.model_selection import KFold

if __name__ == '__main__':
    data = get_data()
    X = data['normalized_feature_matrix']
    Y = data['target_vector']

    # Note: the classifier uses the euclidean metric (a kd-tree cannot use
    # cosine distance), so the timing messages report euclidean distance.
    KNN_rs = KNeighborsClassifier(n_neighbors=5,
                                  algorithm='kd_tree',
                                  metric='euclidean')

    start_time = time.time()
    KNN_rs.fit(X, Y)
    used_time = time.time() - start_time
    print("KD Tree using euclidean distance with resubstitution method "
          "training time is %s seconds" % used_time)

    start_time = time.time()
    predicted_rs = KNN_rs.predict(X)
    used_time = time.time() - start_time
    print("KD Tree using euclidean distance with resubstitution method "
          "querying time is %s seconds" % used_time)

    cfm_rs = confusion_matrix(Y, predicted_rs, labels=range(1, 11))
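The script imports KFold and accuracy_score but stops at the resubstitution confusion matrix. A hedged sketch of a cross-validated accuracy estimate that could follow inside the same __main__ block; the 5-fold split, shuffling, and seed are arbitrary choices, not taken from the original:

    # Hypothetical continuation: 5-fold cross-validated accuracy for the same classifier.
    X = np.asarray(X)
    Y = np.asarray(Y)
    fold_acc = []
    for train_idx, test_idx in KFold(n_splits=5, shuffle=True,
                                     random_state=0).split(X):
        knn_cv = KNeighborsClassifier(n_neighbors=5,
                                      algorithm='kd_tree',
                                      metric='euclidean')
        knn_cv.fit(X[train_idx], Y[train_idx])
        fold_acc.append(accuracy_score(Y[test_idx], knn_cv.predict(X[test_idx])))
    print("5-fold mean accuracy: %.4f" % np.mean(fold_acc))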
def train_model():
    '''
    Trains a model using the parameters specified in var.py. The model
    training checkpoints are saved to var.FOLDER_NAME. A copy of the var.py
    used is also saved in that folder. Only the lowest val loss model for
    each trained model is saved.
    '''
    copyfile('var.py', os.path.join(var.FOLDER_NAME, 'var.py'))

    print("Reading word vectors... ")
    embeddings = None
    if var.USE_WORD2VEC:
        embeddings = KeyedVectors.load_word2vec_format(var.WORD2VEC_BIN_PATH,
                                                       binary=True)
    else:
        embeddings = get_glove_vectors(var.GLOVE_VECTOR_PATH)
    print("--- DONE ---")

    print("Getting input data... ")
    train_labels, train_sentences, train_subjects, train_party, train_credit = get_data(
        var.TRAINING_DATA_PATH)
    val_labels, val_sentences, val_subjects, val_party, val_credit = get_data(
        var.VALIDATION_DATA_PATH)
    print("--- DONE ---")

    if var.NO_STOP_WORDS:
        train_sentences = remove_stop_words(train_sentences)
        val_sentences = remove_stop_words(val_sentences)

    print("Preparing data for model... ")
    # Convert the input sentences into sequences of integers of length
    # MAX_SEQUENCE_LENGTH, where each integer maps to a word and the sequence
    # only considers the first MAX_SEQUENCE_LENGTH words in the statement
    # being evaluated.
    tokenizer = Tokenizer(num_words=var.MAX_NUM_WORDS)
    tokenizer.fit_on_texts(train_sentences)
    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    val_sequences = tokenizer.texts_to_sequences(val_sentences)

    word_index = tokenizer.word_index

    x_train = pad_sequences(train_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)
    x_val = pad_sequences(val_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)

    y_train = to_categorical(
        np.asarray([var.LABEL_MAPPING[label] for label in train_labels]))
    y_val = to_categorical(
        np.asarray([var.LABEL_MAPPING[label] for label in val_labels]))

    # Get the Part of Speech frequencies
    x_train_pos = np.asarray(get_pos_freqs(train_sentences))
    x_val_pos = np.asarray(get_pos_freqs(val_sentences))

    # Populate SUBJECT_MAPPING with freq information from training data
    var.SUBJECT_MAPPING = get_mapping(train_subjects)

    # Create one hot vectors for the subject
    x_train_subject = np.asarray(
        get_one_hot_vectors(train_subjects, var.NUM_SUBJECTS,
                            var.SUBJECT_MAPPING))
    x_val_subject = np.asarray(
        get_one_hot_vectors(val_subjects, var.NUM_SUBJECTS,
                            var.SUBJECT_MAPPING))

    # Convert party to list of list format
    train_party = [[party] for party in train_party]
    val_party = [[party] for party in val_party]

    # Populate PARTY_MAPPING with training data
    var.PARTY_MAPPING = get_mapping(train_party)

    # Get One Hot Vectors representing Party
    x_train_party = np.asarray(
        get_one_hot_vectors(train_party, var.NUM_PARTIES, var.PARTY_MAPPING))
    x_val_party = np.asarray(
        get_one_hot_vectors(val_party, var.NUM_PARTIES, var.PARTY_MAPPING))

    # Remove current label from credit vector
    train_credit = clean_credit(train_labels, train_credit)
    val_credit = clean_credit(val_labels, val_credit)

    # Normalize Credit Vector
    x_train_credit = np.asarray(normalize_vectors(train_credit))
    x_val_credit = np.asarray(normalize_vectors(val_credit))

    # Create embedding matrix for embedding layer. Matrix will be
    # (num_words + 1) x EMBEDDING_DIM since word_index starts at 1.
    num_words = min(var.MAX_NUM_WORDS, len(word_index))
    embedding_matrix = np.zeros((num_words + 1, var.EMBEDDING_DIM))
    for word, rank in word_index.items():
        if rank <= var.MAX_NUM_WORDS:
            embedding = None
            if var.USE_WORD2VEC:
                if word in embeddings.vocab:
                    embedding = embeddings[word]
            else:
                embedding = embeddings.get(word, None)
            if embedding is not None:
                embedding_matrix[rank] = embedding
    print("--- DONE ---")

    for i in range(var.NUM_MODELS):
        print("Creating model " + str(i + 1) + " out of " +
              str(var.NUM_MODELS) + " ...")
        if var.MODEL_TYPE == "CNN":
            model = cnn_model(embedding_matrix,
                              num_words,
                              pooling=var.POOLING,
                              subject=var.USE_SUBJECTS,
                              party=var.USE_PARTY,
                              credit=var.USE_CREDIT,
                              pos=var.USE_POS)
        elif var.MODEL_TYPE == "BI_LSTM":
            model = bi_lstm_model(embedding_matrix,
                                  num_words,
                                  subject=var.USE_SUBJECTS,
                                  party=var.USE_PARTY,
                                  credit=var.USE_CREDIT,
                                  pos=var.USE_POS)
        elif var.MODEL_TYPE == "BI_LSTM_CNN":
            model = bi_lstm_cnn_model(embedding_matrix,
                                      num_words,
                                      pooling=var.POOLING,
                                      subject=var.USE_SUBJECTS,
                                      party=var.USE_PARTY,
                                      credit=var.USE_CREDIT,
                                      pos=var.USE_POS)
        elif var.MODEL_TYPE == "CNN_BI_LSTM":
            model = cnn_bi_lstm_model(embedding_matrix,
                                      num_words,
                                      subject=var.USE_SUBJECTS,
                                      party=var.USE_PARTY,
                                      credit=var.USE_CREDIT,
                                      pos=var.USE_POS)
        elif var.MODEL_TYPE == "PARALLEL":
            model = parallel_cnn_bi_lstm_model(embedding_matrix,
                                               num_words,
                                               pooling=var.POOLING,
                                               subject=var.USE_SUBJECTS,
                                               party=var.USE_PARTY,
                                               credit=var.USE_CREDIT,
                                               pos=var.USE_POS)
        else:
            raise Exception("Invalid MODEL_TYPE")
        print("--- DONE ---")

        print("Training model... ")
        # Save trained model after each epoch
        checkpoint_file = os.path.join(var.FOLDER_NAME,
                                       str(i).zfill(2) + '_' + var.FILE_NAME)
        checkpoint = ModelCheckpoint(checkpoint_file,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True)
        callbacks = [checkpoint]

        train_input = [x_train]
        val_input = [x_val]
        if var.USE_SUBJECTS:
            train_input.append(x_train_subject)
            val_input.append(x_val_subject)
        if var.USE_PARTY:
            train_input.append(x_train_party)
            val_input.append(x_val_party)
        if var.USE_CREDIT:
            train_input.append(x_train_credit)
            val_input.append(x_val_credit)
        if var.USE_POS:
            train_input.append(x_train_pos)
            val_input.append(x_val_pos)
        print("train_input_size", len(train_input))

        model.fit(train_input,
                  y_train,
                  validation_data=(val_input, y_val),
                  epochs=var.NUM_EPOCHS,
                  batch_size=var.BATCH_SIZE,
                  callbacks=callbacks)
        print("--- DONE ---")

        model.summary()
        del model
""" @copyright: 2013 by Pauli Rikula <*****@*****.**> @license: MIT <http://www.opensource.org/licenses/mit-license.php> """ #for parsing from parse_data import get_data #for audio from signal_tools import generate_wavs if __name__ == "__main__": print "generating wavs" generate_wavs(get_data(),"temperature")
def main(args_parser):
    # Dataset
    parser = args_parser
    args = parser.parse_args()

    train_image_data, train_label_data, train_filename, valid_image_data, \
        valid_label_data, valid_filename, unique_classes = get_data()

    # tf.reset_default_graph()

    DATASET_PATH = args.datasetPath
    LEARNING_RATE_1 = args.learningRate
    EPOCHS = args.epochs
    BATCH_SIZE = args.batchSize
    NUM_CLASSES = len(unique_classes)
    Z_SCORE = args.zScore
    WEIGHT_DECAY_1 = args.weightDecay

    print("Current Setup:-")
    print("Starting Learning Rate: {}, Epochs: {}, Batch Size: {}, "
          "Confidence Interval Z-Score {}, Number of classes: {}, "
          "Starting Weight Decay: {}".format(LEARNING_RATE_1, EPOCHS,
                                             BATCH_SIZE, Z_SCORE, NUM_CLASSES,
                                             WEIGHT_DECAY_1))

    # Placeholders
    learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate')
    weight_decay = tf.placeholder(tf.float32, shape=[], name="weight_decay")

    # Datasets. The original listed only two output_types for three outputs;
    # tf.string is assumed here for the filename element so the tuples match.
    training_dataset = tf.data.Dataset.from_generator(
        lambda: itertools.zip_longest(train_image_data, train_label_data,
                                      train_filename),
        output_types=(tf.float32, tf.float32, tf.string),
        output_shapes=(tf.TensorShape([None, None, 3]),
                       tf.TensorShape([None]), tf.TensorShape([None])))
    training_dataset = training_dataset.repeat(EPOCHS).batch(
        BATCH_SIZE).prefetch(1)
    train_iterator = training_dataset.make_initializable_iterator()
    train_features, train_labels, train_filename = train_iterator.get_next()

    valid_dataset = tf.data.Dataset.from_generator(
        lambda: itertools.zip_longest(valid_image_data, valid_label_data,
                                      valid_filename),
        output_types=(tf.float32, tf.float32, tf.string),
        output_shapes=(tf.TensorShape([None, None, 3]),
                       tf.TensorShape([None]), tf.TensorShape([None])))
    valid_dataset = valid_dataset.repeat(EPOCHS).batch(BATCH_SIZE).prefetch(1)
    valid_iterator = valid_dataset.make_initializable_iterator()
    valid_features, valid_labels, valid_filename = valid_iterator.get_next()

    # Model
    _, train_op, train_cross_entropy, train_conf_matrix_op, train_accuracy = initiate_vgg_model(
        train_features,
        train_labels,
        train_filename,
        NUM_CLASSES,
        weight_decay,
        learning_rate,
        handle="training",
        reuse_model=None)
    _, _, valid_cross_entropy, valid_conf_matrix_op, valid_accuracy = initiate_vgg_model(
        valid_features,
        valid_labels,
        valid_filename,
        NUM_CLASSES,
        weight_decay,
        learning_rate,
        handle="validation",
        reuse_model=True)

    saver = tf.train.Saver()

    if not os.path.exists(os.path.join("./short_dl_research_train/")):
        os.mkdir(os.path.join("./short_dl_research_train/"))

    with tf.Session() as sess:
        with np.printoptions(threshold=np.inf):
            train_writer = tf.summary.FileWriter(
                "./short_tensorboard_training_logs/")
            valid_writer = tf.summary.FileWriter(
                "./short_tensorboard_validation_logs/")
            train_writer.add_graph(sess.graph)
            valid_writer.add_graph(sess.graph)
            train_highest_acc = 0
            valid_highest_acc = 0
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])

            for epoch in range(EPOCHS):
                print("Current Epoch: {}/{}".format(epoch, EPOCHS))
                i = 0
                try:
                    sess.run(train_iterator.initializer)
                    while True:
                        print("Current Training Iteration : {}/{}".format(
                            i, floor(int(157252) / BATCH_SIZE)))
                        train_acc, _, _, train_ce = util.training(
                            BATCH_SIZE, NUM_CLASSES, learning_rate,
                            weight_decay, sess, train_op,
                            train_conf_matrix_op, LEARNING_RATE_1,
                            WEIGHT_DECAY_1, train_cross_entropy,
                            train_accuracy)
                        train_value1, train_value2 = util.confidence_interval(
                            train_acc, Z_SCORE, BATCH_SIZE)
                        print("Training Accuracy : {}".format(train_acc))
                        print("Training Loss (Cross Entropy) : {}".format(
                            train_ce))
                        print("Training Confidence Interval: [{} , {}]".format(
                            train_value2, train_value1))

                        if train_highest_acc <= train_acc:
                            train_highest_acc = train_acc
                            print("Highest Training Accuracy Reached: {}".format(
                                train_highest_acc))

                            # For every epoch, we will save the model
                            saver.save(
                                sess,
                                os.path.join("./short_dl_research_train/",
                                             "model.ckpt"))
                            print("Latest Model is saving and Tensorboard "
                                  "Logs are updated")

                        train_writer.add_summary(
                            tf.summary.merge_all().eval(),
                            epoch * (floor(int(157252) / BATCH_SIZE)) + i)
                        i = i + 1
                except tf.errors.OutOfRangeError:
                    print("End of the training dataset, proceed to validation")
                    pass

                j = 0
                try:
                    sess.run(valid_iterator.initializer)
                    while True:
                        print("Current Validation Iteration : {}/{}".format(
                            j, floor(int(19657) / BATCH_SIZE)))
                        valid_acc, _, valid_ce = util.validation(
                            BATCH_SIZE, NUM_CLASSES, learning_rate,
                            weight_decay, sess, valid_conf_matrix_op,
                            LEARNING_RATE_1, WEIGHT_DECAY_1,
                            valid_cross_entropy, valid_accuracy)
                        valid_value1, valid_value2 = util.confidence_interval(
                            valid_acc, Z_SCORE, BATCH_SIZE)
                        print("Validation Accuracy : {}".format(valid_acc))
                        print("Validation Loss (Cross Entropy) : {}".format(
                            valid_ce))
                        print("Validation Confidence Interval: [{} , {}]".format(
                            valid_value2, valid_value1))

                        if valid_highest_acc <= valid_acc:
                            valid_highest_acc = valid_acc
                            print("Highest Validation Accuracy Reached: {}".format(
                                valid_highest_acc))

                        valid_writer.add_summary(
                            tf.summary.merge_all().eval(),
                            epoch * (floor(int(19657) / BATCH_SIZE)) + j)
                        j = j + 1
                except tf.errors.OutOfRangeError:
                    print("End of validation dataset, go to the next epoch")
                    pass
import iris.coord_categorisation
from cf_units import Unit
import iris.quickplot as qplt
import iris.plot as iplt
import iris.analysis as ia
import numpy as np
from scipy import interpolate

import parse_data

data = parse_data.get_data()
data = {k: data[k] for k in ["altitude", "windspeed"]}

percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]

import matplotlib.pyplot as plt
from matplotlib import style, ticker
import seaborn as sns
from labellines import labelLines

sns.set(font_scale=1)
style.use("default")

fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=200)
colors = plt.cm.rainbow(np.linspace(0, 1, len(percentiles)))

altitude_mean = np.array(data["altitude"].collapsed(
    ["latitude", "longitude", "time"], ia.MEAN).data)
np.save("analysis_wind_speed_percentiles_at_altitude/altitude.npy",
        altitude_mean)

for i, percentile in enumerate(percentiles):
    windspeed_pct = np.array(
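The snippet is cut off inside the percentile loop. A hedged reconstruction of that loop and a closing plot, assuming the windspeed cube supports the same iris collapse used for altitude; the PERCENTILE aggregator call, the axis labels, and the output filename are guesses rather than the author's code:

# Hypothetical reconstruction of the truncated loop and the final plot.
for i, percentile in enumerate(percentiles):
    windspeed_pct = np.array(data["windspeed"].collapsed(
        ["latitude", "longitude", "time"], ia.PERCENTILE,
        percent=percentile).data)
    ax.plot(windspeed_pct, altitude_mean, color=colors[i],
            label="{}th".format(percentile))

labelLines(ax.get_lines())  # inline labels on each percentile curve
ax.set_xlabel("Wind speed")
ax.set_ylabel("Altitude")
fig.savefig("analysis_wind_speed_percentiles_at_altitude/percentiles.png")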
from pulp import *
from parse_data import get_data

NUTRITION_FACTS = get_data()

FOODS = ["avocado", "beans", "cheese", "rice", "spinach"]

pound_per_gram = 1 / 453.5924

# cost per pound * pounds per gram * number of grams the nutrition data is based on
FOOD_COSTS = {
    "avocado": (1.25 * 2) * pound_per_gram * 136,  # $1.25 per avocado * 2 avocados per pound
    "beans": 1.0 * pound_per_gram * 130,
    "cheese": 7.71 * pound_per_gram * 28,
    "rice": 1.0 * pound_per_gram * 42,
    "spinach": 7.0 * pound_per_gram * 340
}

DAILY_NUTRITION = {
    'protein': ['gte', 56],
    'calories': ['eq', 2000],
    'sodium': ['lte', 2400],
    'vitaminC': ['gte', 90],
    'vitaminA': ['gte', 700],
    'saturatedFat': ['lte', 20]
}

my_lp_problem = LpProblem("My LP Problem", LpMinimize)

# Set up variables (the individual food items are the variables)
food_vars = LpVariable.dicts("Foods", FOODS, 0)
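The snippet stops after creating the decision variables. A hedged sketch of how the objective and nutrient constraints could be added and solved with pulp, assuming NUTRITION_FACTS is keyed as NUTRITION_FACTS[food][nutrient] per serving (that layout is an assumption):

# Objective: minimize total cost (servings of each food * cost per serving).
my_lp_problem += lpSum([FOOD_COSTS[f] * food_vars[f] for f in FOODS]), "Total Cost"

# One constraint per nutrient, using the assumed NUTRITION_FACTS[food][nutrient] layout.
for nutrient, (op, amount) in DAILY_NUTRITION.items():
    total = lpSum([NUTRITION_FACTS[f][nutrient] * food_vars[f] for f in FOODS])
    if op == 'gte':
        my_lp_problem += total >= amount, nutrient
    elif op == 'lte':
        my_lp_problem += total <= amount, nutrient
    else:  # 'eq'
        my_lp_problem += total == amount, nutrient

my_lp_problem.solve()
print(LpStatus[my_lp_problem.status])
for f in FOODS:
    print(f, food_vars[f].varValue)
print("Total cost:", value(my_lp_problem.objective))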
""" from parse_data import get_data import numpy as np import pylab as pl from sklearn import cluster #, datasets #from sklearn.metrics import euclidean_distances #from sklearn.neighbors import kneighbors_graph from sklearn.preprocessing import Scaler from sklearn.decomposition import PCA data = get_data()[:5000] temperature_0 = [ r[1] for r in data[:-50]] temperature_1 = [ r[1] for r in data[50:]] X = np.c_[temperature_0,temperature_1] #X = Scaler().fit_transform(X) pca = PCA(n_components=3) X = pca.fit(X).transform(X) #spectral = cluster.SpectralClustering(n_clusters=2, mode='arpack', # affinity="nearest_neighbors")
""" @copyright: 2013 by Pauli Rikula <*****@*****.**> @license: MIT <http://www.opensource.org/licenses/mit-license.php> """ from parse_data import get_data import numpy as np import matplotlib.pyplot as plt data = get_data() temperature_0 = [ r[1] for r in data[:-1]] temperature_1 = [ r[1] for r in data[1:]] H, xedges, yedges = np.histogram2d(temperature_0,temperature_1, bins=(128,128)) H_log = np.log(H) extent = [yedges[0], yedges[-1], xedges[-1], xedges[0]] plt.imshow(H_log, extent=extent, interpolation='nearest') plt.colorbar() plt.xlabel("T_n")
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# detection of face + use of webcam
import cv2

# Helper libraries
import numpy as np
import keyboard

# parsing of profiles
from parse_data import get_data
from train_data import get_train_data

train_images, train_labels, names = get_data()

# normalization
train_images = np.asarray(train_images) / 255.0

# create a neural network
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(len(names), activation=tf.nn.softmax)
])

# giving the network rules
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
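The model is compiled but never trained in the snippet. A minimal hedged training sketch, assuming train_labels are integer class indices into names (as sparse_categorical_crossentropy requires) and picking an arbitrary five epochs:

# Hypothetical training step: labels are assumed to be integer indices into `names`.
train_labels = np.asarray(train_labels)
model.fit(train_images, train_labels, epochs=5)

# Sanity check on the training data itself (resubstitution accuracy).
loss, acc = model.evaluate(train_images, train_labels)
print("training accuracy:", acc)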
# Imports needed by this snippet (assumed to sit at the top of the file).
import numpy as np
from collections import Counter
import parse_data

dataset = {'k': [[1, 2], [2, 3], [3, 1]], 'r': [[6, 5], [7, 7], [8, 6]]}
new_features = [2, 5]


def kNN(data, predict, k=3):
    if len(data) >= k:
        print('k is less than total voting groups')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(
                np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    vote_result = Counter(votes).most_common(1)[0][0]
    confidence = Counter(votes).most_common(1)[0][1]  # vote count for the winning class
    return vote_result, confidence


train_set, test_set = parse_data.get_data()

total = correct = 0
for group in test_set:
    for data in test_set[group]:
        vote, confidence = kNN(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1

print("Accuracy: ", correct * 1.0 / total)
import os  # needed for os.environ below (missing from the original snippet)
import numpy as np
import tflearn
import csv
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression

import parse_data  # provides image_dim and get_data (missing from the original snippet)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # not required since this is not a Jupyter notebook

import tensorflow as tf
tf.reset_default_graph()

image_dim = parse_data.image_dim
training_data, testing_data = parse_data.get_data()
model_path = '11.cogs-and-dats.model'

# Make training set and testing set from the entire labeled training data
test_set = int(len(training_data) * 0.8)

X = np.array([t[0] for t in training_data[:test_set]])
Y = np.array([t[1] for t in training_data[:test_set]])
X = X.reshape([-1, image_dim, image_dim, 1])

test_x = np.array([t[0] for t in training_data[test_set:]])
test_y = np.array([t[1] for t in training_data[test_set:]])
test_x = test_x.reshape([-1, image_dim, image_dim, 1])

# Make array from testing data that can be passed for prediction
predict_this_X = np.array([t[0] for t in testing_data])
predict_this_X = predict_this_X.reshape([-1, image_dim, image_dim, 1])
predict_this_X_id = np.array([t[1] for t in testing_data])
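The network definition itself is not shown. A hedged sketch of a small tflearn convnet matching the prepared [image_dim, image_dim, 1] inputs, using only the layers already imported; the layer sizes, learning rate, epoch count, and the assumption that Y holds one-hot two-class labels are all guesses:

# Hypothetical network definition for the prepared [image_dim, image_dim, 1] inputs.
convnet = input_data(shape=[None, image_dim, image_dim, 1], name='input')
convnet = conv_2d(convnet, 32, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 64, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = fully_connected(convnet, 1024, activation='relu')
convnet = dropout(convnet, 0.8)
convnet = fully_connected(convnet, 2, activation='softmax')  # assumes 2 classes (cats vs dogs)
convnet = regression(convnet, optimizer='adam', learning_rate=1e-3,
                     loss='categorical_crossentropy', name='targets')

model = tflearn.DNN(convnet)
model.fit({'input': X}, {'targets': Y}, n_epoch=3,
          validation_set=({'input': test_x}, {'targets': test_y}),
          show_metric=True)
model.save(model_path)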
#!/usr/bin/env python3
import numpy as np
import parse_data as ps
from sklearn.naive_bayes import GaussianNB

if __name__ == '__main__':
    train_data = ps.get_data('train_clean.csv')
    test_data = ps.get_test_matrix('test_clean.csv')

    X = train_data['feature_matrix']
    Y = train_data['target_vector']
    label = test_data['label']
    TestX = test_data['feature_matrix']

    GNB = GaussianNB()
    GNB.fit(X, Y)
    Predict_Y = GNB.predict(X)
    TestY = GNB.predict(TestX)

    for i in range(len(label)):
        print("%s is %s" % (label[i][0], TestY[i]))
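Predict_Y (the resubstitution predictions) is computed but never reported. A small hedged addition that could sit at the end of the __main__ block to turn it into a training-accuracy check (the np.asarray conversion is defensive, in case Y is a plain list):

    # Resubstitution accuracy: how well the model fits its own training data.
    train_acc = np.mean(Predict_Y == np.asarray(Y))
    print("Training (resubstitution) accuracy: %.4f" % train_acc)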
def test_model():
    '''
    Evaluates a saved model against the validation and test sets. The trained
    model folder path should be the first argument. All saved models inside
    that folder will be evaluated.
    '''
    paths = None
    if len(sys.argv) < 2:
        raise Exception(
            "Must specify the path to the folder of the saved model to test")

    model_path = sys.argv[1]
    folder = ''
    if os.path.isdir(model_path):
        paths = [
            os.path.join(model_path, f) for f in os.listdir(model_path)
            if os.path.isfile(os.path.join(model_path, f))
            and f.endswith(".hdf5")
        ]
        folder = model_path
    elif os.path.isfile(model_path):
        paths = [model_path]

    print()
    print("List of files to test")
    print(paths)
    print()

    model_results = []
    for i, path in enumerate(paths):
        print()
        print("Testing file " + str(i + 1) + " out of " + str(len(paths)))
        model = load_model(path)

        # Recreate the input tokenizer
        train_labels, train_sentences, train_subjects, train_party, train_credit = get_data(
            var.TRAINING_DATA_PATH)
        if var.NO_STOP_WORDS:
            train_sentences = remove_stop_words(train_sentences)
        train_party = [[party] for party in train_party]
        tokenizer = Tokenizer(num_words=var.MAX_NUM_WORDS)
        tokenizer.fit_on_texts(train_sentences)

        # Get the val input via tokenizer and val labels
        val_labels, val_sentences, val_subjects, val_party, val_credit = get_data(
            var.VALIDATION_DATA_PATH)
        if var.NO_STOP_WORDS:
            val_sentences = remove_stop_words(val_sentences)
        val_party = [[party] for party in val_party]
        val_sequences = tokenizer.texts_to_sequences(val_sentences)
        x_val = pad_sequences(val_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)
        y_val = to_categorical(
            np.asarray([var.LABEL_MAPPING[label] for label in val_labels]))

        # Get the test input via tokenizer and test labels
        test_labels, test_sentences, test_subjects, test_party, test_credit = get_data(
            var.TEST_DATA_PATH)
        if var.NO_STOP_WORDS:
            test_sentences = remove_stop_words(test_sentences)
        test_party = [[party] for party in test_party]
        test_sequences = tokenizer.texts_to_sequences(test_sentences)
        x_test = pad_sequences(test_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)
        y_test = to_categorical(
            np.asarray([var.LABEL_MAPPING[label] for label in test_labels]))

        x_test_pos = np.asarray(get_pos_freqs(test_sentences))
        x_val_pos = np.asarray(get_pos_freqs(val_sentences))

        var.SUBJECT_MAPPING = get_mapping(train_subjects)
        x_test_subjects = np.asarray(
            get_one_hot_vectors(test_subjects, var.NUM_SUBJECTS,
                                var.SUBJECT_MAPPING))
        x_val_subjects = np.asarray(
            get_one_hot_vectors(val_subjects, var.NUM_SUBJECTS,
                                var.SUBJECT_MAPPING))

        var.PARTY_MAPPING = get_mapping(train_party)
        x_test_party = np.asarray(
            get_one_hot_vectors(test_party, var.NUM_PARTIES,
                                var.PARTY_MAPPING))
        x_val_party = np.asarray(
            get_one_hot_vectors(val_party, var.NUM_PARTIES,
                                var.PARTY_MAPPING))

        test_credit = clean_credit(test_labels, test_credit)
        val_credit = clean_credit(val_labels, val_credit)
        x_test_credit = np.asarray(normalize_vectors(test_credit))
        x_val_credit = np.asarray(normalize_vectors(val_credit))

        test_input = [x_test]
        val_input = [x_val]
        if var.USE_SUBJECTS:
            test_input.append(x_test_subjects)
            val_input.append(x_val_subjects)
        if var.USE_PARTY:
            test_input.append(x_test_party)
            val_input.append(x_val_party)
        if var.USE_CREDIT:
            test_input.append(x_test_credit)
            val_input.append(x_val_credit)
        if var.USE_POS:
            test_input.append(x_test_pos)
            val_input.append(x_val_pos)

        test_score = model.evaluate(test_input, y_test,
                                    batch_size=var.BATCH_SIZE)
        val_score = model.evaluate(val_input, y_val,
                                   batch_size=var.BATCH_SIZE)

        print()
        print("model = " + str(path))
        print("val loss = %0.4f, val acc = %0.4f" %
              (val_score[0], val_score[1]))
        print("test loss = %0.4f, test acc = %0.4f" %
              (test_score[0], test_score[1]))

        model_results.append((os.path.basename(path), round(val_score[0], 4),
                              round(val_score[1], 4), round(test_score[0], 4),
                              round(test_score[1], 4)))

    average_val_loss = round(
        sum([info[1] for info in model_results]) / len(model_results), 4)
    average_val_acc = round(
        sum([info[2] for info in model_results]) / len(model_results), 4)
    average_test_loss = round(
        sum([info[3] for info in model_results]) / len(model_results), 4)
    average_test_acc = round(
        sum([info[4] for info in model_results]) / len(model_results), 4)

    best_val = sorted(model_results, key=lambda x: x[2], reverse=True)[:5]
    best_test = sorted(model_results, key=lambda x: x[4], reverse=True)[:5]

    with open(os.path.join(folder, "stats.txt"), "w") as f:
        f.write("Num models = " + str(len(model_results)) + "\n")
        f.write("Average val_loss = " + str(average_val_loss) + "\n")
        f.write("Average val_acc = " + str(average_val_acc) + "\n")
        f.write("Average test_loss = " + str(average_test_loss) + "\n")
        f.write("Average test_acc = " + str(average_test_acc) + "\n")
        f.write("\n")
        f.write("Top 5 validation accuracies:\n")
        for model in best_val:
            f.write("\t" + str(model[2]) + "\t" + str(model[0]) + "\n")
        f.write("\n")
        f.write("Top 5 test accuracies:\n")
        for model in best_test:
            f.write("\t" + str(model[4]) + "\t" + str(model[0]) + "\n")
        f.write("\n")
        f.write("Results for all models:\n")
        for model in sorted(model_results, key=lambda x: x[0]):
            f.write("\t" + str(model) + "\n")

    print()
    print("---------------------- OVERALL STATISTICS -----------------------")
    print()
    with open(os.path.join(folder, "stats.txt"), "r") as f:
        print(f.read())
""" @copyright: 2013 by Pauli Rikula <*****@*****.**> @license: MIT <http://www.opensource.org/licenses/mit-license.php> """ #for parsing from parse_data import get_data from plot_tools import makeplot, daily, monthly if __name__ == "__main__": signal = get_data() makeplot(signal, daily) makeplot(signal, monthly)