Example #1
def max_credit_model():
    '''
    Computes the accuracy of a "max credit" model where each statement is
    labeled with the most frequent label of the previous statements by the
    speaker.
    '''
    print("NUM CORRECT", "\t", "ACCURACY")

    train_labels, _, _, _, train_credit = get_data(var.TRAINING_DATA_PATH)
    val_labels, _, _, _, val_credit = get_data(var.VALIDATION_DATA_PATH)
    test_labels, _, _, _, test_credit = get_data(var.TEST_DATA_PATH)

    # 9: barely true counts.
    # 10: false counts.
    # 11: half true counts.
    # 12: mostly true counts.
    # 13: pants on fire counts.
    credit_mapping = [
        "barely-true", "false", "half-true", "mostly-true", "pants-fire"
    ]

    train_correct = 0.0

    train_credit = clean_credit(train_labels, train_credit)
    for i in range(len(train_labels)):
        max_credit_index = train_credit[i].index(max(train_credit[i]))
        if credit_mapping[max_credit_index] == train_labels[i]:
            train_correct += 1.0

    print(train_correct, "\t\t", train_correct / len(train_labels),
          "\tTraining Data")

    val_correct = 0.0

    val_credit = clean_credit(val_labels, val_credit)
    for i in range(len(val_labels)):
        max_credit_index = val_credit[i].index(max(val_credit[i]))
        if credit_mapping[max_credit_index] == val_labels[i]:
            val_correct += 1.0

    print(val_correct, "\t\t", val_correct / len(val_labels),
          "\tValidation Data")

    test_correct = 0.0

    test_credit = clean_credit(test_labels, test_credit)
    for i in range(len(test_labels)):
        max_credit_index = test_credit[i].index(max(test_credit[i]))
        if credit_mapping[max_credit_index] == test_labels[i]:
            test_correct += 1.0

    print(test_correct, "\t\t", test_correct / len(test_labels), "\tTest Data")
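Note: `clean_credit` and `get_data` are project helpers that are not shown on this page. A minimal, hypothetical sketch of what `clean_credit` presumably does, assuming it removes the current statement's own label from the speaker's credit-history counts so that only previous statements are counted:

# Hypothetical sketch of the clean_credit helper (not the original
# implementation): subtract the current statement's own label from its
# credit-history counts.
CREDIT_LABELS = ["barely-true", "false", "half-true", "mostly-true", "pants-fire"]


def clean_credit(labels, credit):
    cleaned = []
    for label, counts in zip(labels, credit):
        counts = list(counts)
        if label in CREDIT_LABELS:
            index = CREDIT_LABELS.index(label)
            # Never go below zero in case the counts were already adjusted.
            counts[index] = max(counts[index] - 1, 0)
        cleaned.append(counts)
    return cleaned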
def main():
    initialize()
    data = parse_data.get_data()

    all_predictions = []
    all_gold = []
    for target in data:
        if config.verbose:
            print(target)
        train_X, train_Y, feat_dict = create_feature_matrix(data[target]["train"],
                                                            feat_dict=None)
        tune_X, tune_Y, _ = create_feature_matrix(data[target]["tune"],
                                                  feat_dict=feat_dict)
        test_X, test_Y, _ = create_feature_matrix(data[target]["test"],
                                                  feat_dict=feat_dict)
        model = train_model(train_X, train_Y, tune_X, tune_Y)
        predictions = model.predict(test_X)

        _, _, f1 = compute_f1(predictions, test_Y)
        if config.verbose:
            print "\t", "F1:", round(f1 * 100, 2)
        all_predictions += list(predictions)
        all_gold += list(test_Y)
    f1_against, f1_favor, f1_overall = compute_f1(all_predictions, all_gold)
    print "F1 Against:", round(f1_against * 100, 2)
    print "F1 Favor:", round(f1_favor * 100, 2)
    print "---> Overall F1:", round(f1_overall * 100, 2)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from parse_data import get_data
from draw_confusion_matrix import plot_confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
import time
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, DistanceMetric
from sklearn.model_selection import KFold

if __name__ == '__main__':
    data = get_data()
    X = data['normalized_feature_matrix']
    Y = data['target_vector']
    KNN_rs = KNeighborsClassifier(n_neighbors=5,
                                  algorithm='kd_tree',
                                  metric='euclidean')
    start_time = time.time()
    KNN_rs.fit(X, Y)
    used_time = time.time() - start_time
    print(
        "KD Tree using euclidean distance with resubstitution method training time is %s seconds"
        % used_time)
    start_time = time.time()
    predicted_rs = KNN_rs.predict(X)
    used_time = time.time() - start_time
    print(
        "KD Tree using euclidean distance with resubstitution method querying time is %s seconds"
        % used_time)
    cfm_rs = confusion_matrix(Y, predicted_rs, labels=range(1, 11))
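KFold is imported above but unused in this excerpt. A short sketch of a cross-validated accuracy to complement the resubstitution estimate, assuming X and Y are NumPy arrays:

    # Sketch: 10-fold cross-validated accuracy for the same classifier, to
    # complement the (optimistic) resubstitution estimate computed above.
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    cv_scores = []
    for train_index, test_index in kf.split(X):
        knn = KNeighborsClassifier(n_neighbors=5,
                                   algorithm='kd_tree',
                                   metric='euclidean')
        knn.fit(X[train_index], Y[train_index])
        cv_scores.append(accuracy_score(Y[test_index],
                                        knn.predict(X[test_index])))
    print("10-fold cross-validated accuracy is %s" % np.mean(cv_scores))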
Example #4
def train_model():
    '''
    Trains a model using the parameters specified in var.py. The
    model training checkpoints are saved to var.FOLDER_NAME, along
    with a copy of the var.py that was used. Only the checkpoint
    with the lowest validation loss is kept for each trained model.
    '''
    copyfile('var.py', os.path.join(var.FOLDER_NAME, 'var.py'))

    print("Reading word vectors... ")

    embeddings = None
    if var.USE_WORD2VEC:
        embeddings = KeyedVectors.load_word2vec_format(var.WORD2VEC_BIN_PATH,
                                                       binary=True)
    else:
        embeddings = get_glove_vectors(var.GLOVE_VECTOR_PATH)

    print("--- DONE ---")

    print("Getting input data... ")

    train_labels, train_sentences, train_subjects, train_party, train_credit = get_data(
        var.TRAINING_DATA_PATH)
    val_labels, val_sentences, val_subjects, val_party, val_credit = get_data(
        var.VALIDATION_DATA_PATH)

    print("--- DONE ---")

    if var.NO_STOP_WORDS:
        train_sentences = remove_stop_words(train_sentences)
        val_sentences = remove_stop_words(val_sentences)

    print("Preparing data for model... ")

    # Convert the input sentences into sequences of integers
    # length MAX_SEQUENCE_LENGTH where each integer maps to a
    # word and the sequence only considers the first
    # MAX_SEQUENCE_LENGTH words in the statement being evaluated
    tokenizer = Tokenizer(num_words=var.MAX_NUM_WORDS)
    tokenizer.fit_on_texts(train_sentences)
    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    val_sequences = tokenizer.texts_to_sequences(val_sentences)
    word_index = tokenizer.word_index

    x_train = pad_sequences(train_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)
    x_val = pad_sequences(val_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)
    y_train = to_categorical(
        np.asarray([var.LABEL_MAPPING[label] for label in train_labels]))
    y_val = to_categorical(
        np.asarray([var.LABEL_MAPPING[label] for label in val_labels]))

    # Get the Part of Speech frequencies
    x_train_pos = np.asarray(get_pos_freqs(train_sentences))
    x_val_pos = np.asarray(get_pos_freqs(val_sentences))

    # Populate SUBJECT_MAPPING with freq information from training data
    var.SUBJECT_MAPPING = get_mapping(train_subjects)

    # Create one hot vectors for the subject
    x_train_subject = np.asarray(
        get_one_hot_vectors(train_subjects, var.NUM_SUBJECTS,
                            var.SUBJECT_MAPPING))
    x_val_subject = np.asarray(
        get_one_hot_vectors(val_subjects, var.NUM_SUBJECTS,
                            var.SUBJECT_MAPPING))

    # Convert party to list of list format
    train_party = [[party] for party in train_party]
    val_party = [[party] for party in val_party]

    # Populate PARTY_MAPPING with training data
    var.PARTY_MAPPING = get_mapping(train_party)

    # Get One Hot Vectors representing Party
    x_train_party = np.asarray(
        get_one_hot_vectors(train_party, var.NUM_PARTIES, var.PARTY_MAPPING))
    x_val_party = np.asarray(
        get_one_hot_vectors(val_party, var.NUM_PARTIES, var.PARTY_MAPPING))

    # Remove current label from credit vector
    train_credit = clean_credit(train_labels, train_credit)
    val_credit = clean_credit(val_labels, val_credit)

    # Normalize Credit Vector
    x_train_credit = np.asarray(normalize_vectors(train_credit))
    x_val_credit = np.asarray(normalize_vectors(val_credit))

    # Create embedding matrix for embedding layer. Matrix will be
    # (num_words + 1) x EMBEDDING_DIM since word_index starts at 1
    num_words = min(var.MAX_NUM_WORDS, len(word_index))
    embedding_matrix = np.zeros((num_words + 1, var.EMBEDDING_DIM))
    for word, rank in word_index.items():
        if rank <= var.MAX_NUM_WORDS:
            embedding = None
            if var.USE_WORD2VEC:
                if word in embeddings.vocab:
                    embedding = embeddings[word]
            else:
                embedding = embeddings.get(word, None)
            if embedding is not None:
                embedding_matrix[rank] = embedding

    print("--- DONE ---")

    for i in range(var.NUM_MODELS):
        print("Creating model " + str(i + 1) + " out of " +
              str(var.NUM_MODELS) + " ...")
        if var.MODEL_TYPE == "CNN":
            model = cnn_model(embedding_matrix,
                              num_words,
                              pooling=var.POOLING,
                              subject=var.USE_SUBJECTS,
                              party=var.USE_PARTY,
                              credit=var.USE_CREDIT,
                              pos=var.USE_POS)
        elif var.MODEL_TYPE == "BI_LSTM":
            model = bi_lstm_model(embedding_matrix,
                                  num_words,
                                  subject=var.USE_SUBJECTS,
                                  party=var.USE_PARTY,
                                  credit=var.USE_CREDIT,
                                  pos=var.USE_POS)
        elif var.MODEL_TYPE == "BI_LSTM_CNN":
            model = bi_lstm_cnn_model(embedding_matrix,
                                      num_words,
                                      pooling=var.POOLING,
                                      subject=var.USE_SUBJECTS,
                                      party=var.USE_PARTY,
                                      credit=var.USE_CREDIT,
                                      pos=var.USE_POS)
        elif var.MODEL_TYPE == "CNN_BI_LSTM":
            model = cnn_bi_lstm_model(embedding_matrix,
                                      num_words,
                                      subject=var.USE_SUBJECTS,
                                      party=var.USE_PARTY,
                                      credit=var.USE_CREDIT,
                                      pos=var.USE_POS)
        elif var.MODEL_TYPE == "PARALLEL":
            model = parallel_cnn_bi_lstm_model(embedding_matrix,
                                               num_words,
                                               pooling=var.POOLING,
                                               subject=var.USE_SUBJECTS,
                                               party=var.USE_PARTY,
                                               credit=var.USE_CREDIT,
                                               pos=var.USE_POS)
        else:
            raise Exception("Invalid MODEL_TYPE")

        print("--- DONE ---")

        print("Training model... ")

        # Save trained model after each epoch

        checkpoint_file = os.path.join(var.FOLDER_NAME,
                                       str(i).zfill(2) + '_' + var.FILE_NAME)
        checkpoint = ModelCheckpoint(checkpoint_file,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True)
        callbacks = [checkpoint]

        train_input = [x_train]
        val_input = [x_val]
        if var.USE_SUBJECTS:
            train_input.append(x_train_subject)
            val_input.append(x_val_subject)
        if var.USE_PARTY:
            train_input.append(x_train_party)
            val_input.append(x_val_party)
        if var.USE_CREDIT:
            train_input.append(x_train_credit)
            val_input.append(x_val_credit)
        if var.USE_POS:
            train_input.append(x_train_pos)
            val_input.append(x_val_pos)

        print("train_input_size", len(train_input))

        model.fit(train_input,
                  y_train,
                  validation_data=(val_input, y_val),
                  epochs=var.NUM_EPOCHS,
                  batch_size=var.BATCH_SIZE,
                  callbacks=callbacks)

        print("--- DONE ---")

        model.summary()
        del model
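`get_mapping` and `get_one_hot_vectors` are imported from elsewhere in that repository. A hypothetical sketch of the (multi-)hot encoding they appear to perform, inferred only from the call sites above:

# Hypothetical sketches of get_mapping and get_one_hot_vectors (not the
# original implementations), inferred from how they are called above.
def get_mapping(items_per_example):
    # Map every distinct value seen in the training data to a column index.
    mapping = {}
    for items in items_per_example:
        for item in items:
            if item not in mapping:
                mapping[item] = len(mapping)
    return mapping


def get_one_hot_vectors(items_per_example, num_values, mapping):
    # One (multi-)hot row per example; values outside the mapping are skipped.
    vectors = []
    for items in items_per_example:
        vector = [0] * num_values
        for item in items:
            if item in mapping and mapping[item] < num_values:
                vector[mapping[item]] = 1
        vectors.append(vector)
    return vectors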
"""
@copyright: 2013 by Pauli Rikula <*****@*****.**>
@license: MIT <http://www.opensource.org/licenses/mit-license.php>
"""


#for parsing
from parse_data import get_data
#for audio
from signal_tools import generate_wavs





if __name__ == "__main__":
    print "generating wavs"
    generate_wavs(get_data(),"temperature")
Example #6
def main(args_parser):
    #Dataset
    parser = args_parser
    args = parser.parse_args()

    train_image_data, train_label_data, train_filename, valid_image_data, valid_label_data, valid_filename, unique_classes = get_data(
    )
    #tf.reset_default_graph()
    DATASET_PATH = args.datasetPath
    LEARNING_RATE_1 = args.learningRate
    EPOCHS = args.epochs
    BATCH_SIZE = args.batchSize
    NUM_CLASSES = len(unique_classes)
    Z_SCORE = args.zScore
    WEIGHT_DECAY_1 = args.weightDecay

    print("Current Setup:-")
    print(
        "Starting Learning Rate: {}, Epochs: {}, Batch Size: {}, Confidence Interval Z-Score {}, Number of classes: {}, Starting Weight Decay: {}"
        .format(LEARNING_RATE_1, EPOCHS, BATCH_SIZE, Z_SCORE, NUM_CLASSES,
                WEIGHT_DECAY_1))

    #Placeholders
    learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate')
    weight_decay = tf.placeholder(tf.float32, shape=[], name="weight_decay")

    #Dataset
    training_dataset = tf.data.Dataset.from_generator(
        lambda: itertools.zip_longest(train_image_data, train_label_data,
                                      train_filename),
        # the generator also yields the filename, so three output types are needed
        output_types=(tf.float32, tf.float32, tf.string),
        output_shapes=(tf.TensorShape([None, None, 3]), tf.TensorShape([None]),
                       tf.TensorShape([None])))

    training_dataset = training_dataset.repeat(EPOCHS).batch(
        BATCH_SIZE).prefetch(1)
    train_iterator = training_dataset.make_initializable_iterator()
    train_features, train_labels, train_filename = train_iterator.get_next()

    valid_dataset = tf.data.Dataset.from_generator(
        lambda: itertools.zip_longest(valid_image_data, valid_label_data,
                                      valid_filename),
        output_types=(tf.float32, tf.float32, tf.string),
        output_shapes=(tf.TensorShape([None, None, 3]), tf.TensorShape([None]),
                       tf.TensorShape([None])))

    valid_dataset = valid_dataset.repeat(EPOCHS).batch(BATCH_SIZE).prefetch(1)
    valid_iterator = valid_dataset.make_initializable_iterator()
    valid_features, valid_labels, valid_filename = valid_iterator.get_next()

    #Model
    _, train_op, train_cross_entropy, train_conf_matrix_op, train_accuracy = initiate_vgg_model(
        train_features,
        train_labels,
        train_filename,
        NUM_CLASSES,
        weight_decay,
        learning_rate,
        handle="training",
        reuse_model=None)
    _, _, valid_cross_entropy, valid_conf_matrix_op, valid_accuracy = initiate_vgg_model(
        valid_features,
        valid_labels,
        valid_filename,
        NUM_CLASSES,
        weight_decay,
        learning_rate,
        handle="validation",
        reuse_model=True)

    saver = tf.train.Saver()

    if not os.path.exists(os.path.join("./short_dl_research_train/")):
        os.mkdir(os.path.join("./short_dl_research_train/"))

    with tf.Session() as sess:
        with np.printoptions(threshold=np.inf):
            train_writer = tf.summary.FileWriter(
                "./short_tensorboard_training_logs/")
            valid_writer = tf.summary.FileWriter(
                "./short_tensorboard_validation_logs/")
            train_writer.add_graph(sess.graph)
            valid_writer.add_graph(sess.graph)
            train_highest_acc = 0
            valid_highest_acc = 0
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])

            for epoch in range(EPOCHS):
                print("Current Epoch: {}/{}".format(epoch, EPOCHS))
                i = 0
                try:
                    sess.run(train_iterator.initializer)
                    while True:
                        print("Current Training Iteration : {}/{}".format(
                            i, floor(int(157252) / BATCH_SIZE)))
                        train_acc, _, _, train_ce = util.training(
                            BATCH_SIZE, NUM_CLASSES, learning_rate,
                            weight_decay, sess, train_op, train_conf_matrix_op,
                            LEARNING_RATE_1, WEIGHT_DECAY_1,
                            train_cross_entropy, train_accuracy)
                        train_value1, train_value2 = util.confidence_interval(
                            train_acc, Z_SCORE, BATCH_SIZE)
                        print("Training Accuracy : {}".format(train_acc))
                        print("Training Loss (Cross Entropy) : {}".format(
                            train_ce))
                        print("Training Confidence Interval: [{} , {}]".format(
                            train_value2, train_value1))
                        if train_highest_acc <= train_acc:
                            train_highest_acc = train_acc
                            print(
                                "Highest Training Accuracy Reached: {}".format(
                                    train_highest_acc))
                            # Save the model whenever the training accuracy improves
                            saver.save(
                                sess,
                                os.path.join("./short_dl_research_train/",
                                             "model.ckpt"))
                            print(
                                "Latest model saved and TensorBoard logs updated"
                            )
                        train_writer.add_summary(
                            tf.summary.merge_all().eval(),
                            epoch * (floor(int(157252) / BATCH_SIZE)) + i)
                        i = i + 1
                except tf.errors.OutOfRangeError:
                    print("End of the training dataset, proceed to validation")
                    pass

                j = 0
                try:
                    sess.run(valid_iterator.initializer)
                    while True:
                        print("Current Validation Iteration : {}/{}".format(
                            j, floor(int(19657) / BATCH_SIZE)))
                        valid_acc, _, valid_ce = util.validation(
                            BATCH_SIZE, NUM_CLASSES, learning_rate,
                            weight_decay, sess, valid_conf_matrix_op,
                            LEARNING_RATE_1, WEIGHT_DECAY_1,
                            valid_cross_entropy, valid_accuracy)
                        valid_value1, valid_value2 = util.confidence_interval(
                            valid_acc, Z_SCORE, BATCH_SIZE)
                        print("Validation Accuracy : {}".format(valid_acc))
                        print("validation Loss (Cross Entropy) : {}".format(
                            valid_ce))
                        print(
                            "Validation Confidence Interval: [{} , {}]".format(
                                valid_value2, valid_value1))
                        if valid_highest_acc <= valid_acc:
                            valid_highest_acc = valid_acc
                            print("Highest Validation Accuracy Reached: {}".
                                  format(valid_highest_acc))
                        valid_writer.add_summary(
                            tf.summary.merge_all().eval(),
                            epoch * (floor(int(19657) / BATCH_SIZE)) + j)
                        j = j + 1
                except tf.errors.OutOfRangeError:
                    print("End of validation dataset, go to the next epoch")
                    pass
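`util.confidence_interval(acc, Z_SCORE, BATCH_SIZE)` is defined elsewhere in that repository. A hypothetical sketch of the usual normal-approximation binomial interval it presumably computes for an accuracy measured on n predictions (upper bound first, matching how the return values are printed above):

# Hypothetical sketch of util.confidence_interval (not the original code):
# normal-approximation interval acc +/- z * sqrt(acc * (1 - acc) / n).
from math import sqrt


def confidence_interval(acc, z_score, n):
    margin = z_score * sqrt((acc * (1.0 - acc)) / n)
    return acc + margin, acc - margin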
Example #7
import iris.coord_categorisation
from cf_units import Unit
import iris.quickplot as qplt
import iris.plot as iplt
import iris.analysis as ia
import numpy as np
from scipy import interpolate
import parse_data

data = parse_data.get_data()
data = {k: data[k] for k in ["altitude", "windspeed"]}

percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]

import matplotlib.pyplot as plt
from matplotlib import style, ticker
import seaborn as sns
from labellines import labelLines
sns.set(font_scale=1)
style.use("default")

fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=200)
colors = plt.cm.rainbow(np.linspace(0, 1, len(percentiles)))

altitude_mean = np.array(data["altitude"].collapsed(
    ["latitude", "longitude", "time"], ia.MEAN).data)
np.save("analysis_wind_speed_percentiles_at_altitude/altitude.npy",
        altitude_mean)

for i, percentile in enumerate(percentiles):
    windspeed_pct = np.array(
from pulp import *
from parse_data import get_data

NUTRITION_FACTS = get_data()

FOODS = ["avocado", "beans", "cheese", "rice", "spinach"]

pound_per_gram = 1 / 453.5924

FOOD_COSTS = {  # cost per pound * pounds per gram * grams the nutrition data is based on
    "avocado": (1.25 * 2) * pound_per_gram *
    136,  # $1.25 per avocado * 2 avocados per pound * pounds per gram * grams per serving
    "beans": 1.0 * pound_per_gram * 130,
    "cheese": 7.71 * pound_per_gram * 28,
    "rice": 1.0 * pound_per_gram * 42,
    "spinach": 7.0 * pound_per_gram * 340
}

DAILY_NUTRITION = {
    'protein': ['gte', 56],
    'calories': ['eq', 2000],
    'sodium': ['lte', 2400],
    'vitaminC': ['gte', 90],
    'vitaminA': ['gte', 700],
    'saturatedFat': ['lte', 20]
}

my_lp_problem = LpProblem("My LP Problem", LpMinimize)

# setup variables (the individual food items are the variables)
food_vars = LpVariable.dicts("Foods", FOODS, 0)
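The excerpt stops before the objective and constraints are added. A sketch of how they would typically be attached in PuLP, assuming (hypothetically) that `NUTRITION_FACTS[food][nutrient]` gives the amount of each nutrient per serving:

# Sketch: minimize total cost subject to the daily nutrition constraints above
# (assumes NUTRITION_FACTS[food][nutrient] is the amount per serving).
my_lp_problem += lpSum([FOOD_COSTS[f] * food_vars[f] for f in FOODS]), "Total cost"

for nutrient, (sense, amount) in DAILY_NUTRITION.items():
    total = lpSum([NUTRITION_FACTS[f][nutrient] * food_vars[f] for f in FOODS])
    if sense == 'gte':
        my_lp_problem += total >= amount
    elif sense == 'lte':
        my_lp_problem += total <= amount
    else:  # 'eq'
        my_lp_problem += total == amount

my_lp_problem.solve()
print(LpStatus[my_lp_problem.status])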
"""


from parse_data import get_data

import numpy as np
import pylab as pl

from sklearn import cluster #, datasets
#from sklearn.metrics import euclidean_distances
#from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler  # Scaler was renamed in later scikit-learn releases

from sklearn.decomposition import PCA

data = get_data()[:5000]

temperature_0 = [ r[1] for r in data[:-50]]
temperature_1 = [ r[1] for r in data[50:]]


X = np.c_[temperature_0,temperature_1]

#X = StandardScaler().fit_transform(X)

pca = PCA(n_components=3)
X = pca.fit(X).transform(X)


#spectral = cluster.SpectralClustering(n_clusters=2, mode='arpack',
#            affinity="nearest_neighbors")
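The spectral-clustering call is commented out and the excerpt ends here. A sketch that clusters the PCA-reduced points with KMeans as a stand-in, since the spectral setup above is incomplete:

# Sketch: cluster the PCA-reduced points and plot them (KMeans used as a
# stand-in for the commented-out spectral clustering).
kmeans = cluster.KMeans(n_clusters=2)
labels = kmeans.fit_predict(X)

pl.scatter(X[:, 0], X[:, 1], c=labels, s=5)
pl.xlabel("temperature_0")
pl.ylabel("temperature_1")
pl.show()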
"""
@copyright: 2013 by Pauli Rikula <*****@*****.**>
@license: MIT <http://www.opensource.org/licenses/mit-license.php>
"""


from parse_data import get_data




import numpy as np
import matplotlib.pyplot as plt


data = get_data()

temperature_0 = [ r[1] for r in data[:-1]]
temperature_1 = [ r[1] for r in data[1:]]


H, xedges, yedges = np.histogram2d(temperature_0,temperature_1, bins=(128,128))

H_log = np.log(H)

extent = [yedges[0], yedges[-1], xedges[-1], xedges[0]]

plt.imshow(H_log, extent=extent, interpolation='nearest')

plt.colorbar()
plt.xlabel("T_n")
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# face detection + webcam capture
import cv2

# Helper Libraries
import numpy as np
import keyboard

# parsing of profiles
from parse_data import get_data
from train_data import get_train_data

train_images, train_labels, names = get_data()

# normalization
train_images = np.asarray(train_images) / 255.0

# create a neural network
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(len(names), activation=tf.nn.softmax)
])

# giving the network rules
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
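The excerpt ends after `compile`. A minimal sketch of the training step that would typically follow, assuming `train_labels` holds integer class indices (which is what the sparse loss expects):

# Sketch of the training step that would typically follow (assumes
# train_labels are integer class indices matching the names list).
train_labels = np.asarray(train_labels)
model.fit(train_images, train_labels, epochs=10)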
Example #12
import numpy as np
from collections import Counter

import parse_data

dataset = {'k': [[1, 2], [2, 3], [3, 1]], 'r': [[6, 5], [7, 7], [8, 6]]}
new_features = [2, 5]


def kNN(data, predict, k=3):
    if len(data) >= k:
        print('Warning: k is not larger than the number of voting groups')

    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(
                np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    vote_result = Counter(votes).most_common(1)[0][0]
    confidence = Counter(votes).most_common(1)[0][1]
    return vote_result, confidence


train_set, test_set = parse_data.get_data()
total = correct = 0

for group in test_set:
    for data in test_set[group]:
        vote, confidence = kNN(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1
print("Accuracy: ", correct * 1.0 / total)
Example #13
import os

import numpy as np
import parse_data
import tflearn
import csv
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# resetting the default graph is only needed when re-running in a notebook
import tensorflow as tf
tf.reset_default_graph()

image_dim = parse_data.image_dim
training_data, testing_data = parse_data.get_data()
model_path = '11.cogs-and-dats.model'

# Make training set and testing set from the entire labeled training data
test_set = int(len(training_data)*0.8)
X = np.array([t[0] for t in training_data[:test_set]])
Y = np.array([t[1] for t in training_data[:test_set]])
X = X.reshape([-1, image_dim, image_dim, 1])
test_x = np.array([t[0] for t in training_data[test_set:]])
test_y = np.array([t[1] for t in training_data[test_set:]])
test_x = test_x.reshape([-1, image_dim, image_dim, 1])

# Make array from testing data that can be passed for prediction
predict_this_X = np.array([t[0] for t in testing_data])
predict_this_X = predict_this_X.reshape([-1, image_dim, image_dim, 1])
predict_this_X_id = np.array([t[1] for t in testing_data])
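The excerpt stops before the network itself is defined. A sketch of the kind of tflearn convnet these imports suggest, with hypothetical layer sizes and assuming the two-class one-hot labels produced by `parse_data`:

# Sketch of a tflearn convnet matching the imports above (hypothetical layer
# sizes, not the original architecture).
convnet = input_data(shape=[None, image_dim, image_dim, 1], name='input')
convnet = conv_2d(convnet, 32, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 64, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)
convnet = fully_connected(convnet, 1024, activation='relu')
convnet = dropout(convnet, 0.8)
convnet = fully_connected(convnet, 2, activation='softmax')
convnet = regression(convnet, optimizer='adam', learning_rate=1e-3,
                     loss='categorical_crossentropy', name='targets')

model = tflearn.DNN(convnet)
model.fit({'input': X}, {'targets': Y}, n_epoch=3,
          validation_set=({'input': test_x}, {'targets': test_y}),
          snapshot_step=500, show_metric=True, run_id=model_path)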
Example #14
#!/usr/bin/env python3
import numpy as np
import parse_data as ps
from sklearn.naive_bayes import GaussianNB

if __name__ == '__main__':

    train_data = ps.get_data('train_clean.csv')
    test_data = ps.get_test_matrix('test_clean.csv')

    X = train_data['feature_matrix']
    Y = train_data['target_vector']
    label = test_data['label']
    TestX = test_data['feature_matrix']
    GNB = GaussianNB()
    GNB.fit(X, Y)
    Predict_Y = GNB.predict(X)
    TestY = GNB.predict(TestX)
    for i in range(len(label)):
        print("%s is %s" % (label[i][0], TestY[i]))
Example #15
def test_model():
    '''
    Evaluates saved models against the validation and test sets. The path
    to the folder of trained models should be the first argument. All
    saved models inside that folder will be evaluated.
    '''
    paths = None
    if len(sys.argv) < 2:
        raise Exception(
            "Must specify the path to the folder of the saved model to test")
    model_path = sys.argv[1]
    folder = ''

    if os.path.isdir(model_path):
        paths = [
            os.path.join(model_path, f) for f in os.listdir(model_path) if
            os.path.isfile(os.path.join(model_path, f)) and f.endswith(".hdf5")
        ]
        folder = model_path
    elif os.path.isfile(model_path):
        paths = [model_path]

    print()
    print("List of files to test")
    print(paths)
    print()
    model_results = []

    for i, path in enumerate(paths):
        print()
        print("Testing file " + str(i + 1) + " out of " + str(len(paths)))

        model = load_model(path)

        # Recreate the input tokenizer
        train_labels, train_sentences, train_subjects, train_party, train_credit = get_data(
            var.TRAINING_DATA_PATH)
        if var.NO_STOP_WORDS:
            train_sentences = remove_stop_words(train_sentences)
        train_party = [[party] for party in train_party]
        tokenizer = Tokenizer(num_words=var.MAX_NUM_WORDS)
        tokenizer.fit_on_texts(train_sentences)

        # Get the val input via tokenizer and val labels
        val_labels, val_sentences, val_subjects, val_party, val_credit = get_data(
            var.VALIDATION_DATA_PATH)
        if var.NO_STOP_WORDS:
            val_sentences = remove_stop_words(val_sentences)
        val_party = [[party] for party in val_party]
        val_sequences = tokenizer.texts_to_sequences(val_sentences)
        x_val = pad_sequences(val_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)
        y_val = to_categorical(
            np.asarray([var.LABEL_MAPPING[label] for label in val_labels]))

        # Get the test input via tokenizer and test labels
        test_labels, test_sentences, test_subjects, test_party, test_credit = get_data(
            var.TEST_DATA_PATH)
        if var.NO_STOP_WORDS:
            test_sentences = remove_stop_words(test_sentences)
        test_party = [[party] for party in test_party]
        test_sequences = tokenizer.texts_to_sequences(test_sentences)
        x_test = pad_sequences(test_sequences, maxlen=var.MAX_SEQUENCE_LENGTH)
        y_test = to_categorical(
            np.asarray([var.LABEL_MAPPING[label] for label in test_labels]))

        x_test_pos = np.asarray(get_pos_freqs(test_sentences))
        x_val_pos = np.asarray(get_pos_freqs(val_sentences))

        var.SUBJECT_MAPPING = get_mapping(train_subjects)
        x_test_subjects = np.asarray(
            get_one_hot_vectors(test_subjects, var.NUM_SUBJECTS,
                                var.SUBJECT_MAPPING))
        x_val_subjects = np.asarray(
            get_one_hot_vectors(val_subjects, var.NUM_SUBJECTS,
                                var.SUBJECT_MAPPING))

        var.PARTY_MAPPING = get_mapping(train_party)
        x_test_party = np.asarray(
            get_one_hot_vectors(test_party, var.NUM_PARTIES,
                                var.PARTY_MAPPING))
        x_val_party = np.asarray(
            get_one_hot_vectors(val_party, var.NUM_PARTIES, var.PARTY_MAPPING))

        test_credit = clean_credit(test_labels, test_credit)
        val_credit = clean_credit(val_labels, val_credit)
        x_test_credit = np.asarray(normalize_vectors(test_credit))
        x_val_credit = np.asarray(normalize_vectors(val_credit))

        test_input = [x_test]
        val_input = [x_val]
        if var.USE_SUBJECTS:
            test_input.append(x_test_subjects)
            val_input.append(x_val_subjects)
        if var.USE_PARTY:
            test_input.append(x_test_party)
            val_input.append(x_val_party)
        if var.USE_CREDIT:
            test_input.append(x_test_credit)
            val_input.append(x_val_credit)
        if var.USE_POS:
            test_input.append(x_test_pos)
            val_input.append(x_val_pos)

        test_score = model.evaluate(test_input,
                                    y_test,
                                    batch_size=var.BATCH_SIZE)
        val_score = model.evaluate(val_input, y_val, batch_size=var.BATCH_SIZE)

        print()
        print("model = " + str(path))
        print("val loss = %0.4f, val acc = %0.4f" %
              (val_score[0], val_score[1]))
        print("test loss = %0.4f, test acc = %0.4f" %
              (test_score[0], test_score[1]))
        model_results.append((os.path.basename(path), round(val_score[0], 4),
                              round(val_score[1], 4), round(test_score[0], 4),
                              round(test_score[1], 4)))

    average_val_loss = round(
        sum([info[1] for info in model_results]) / len(model_results), 4)
    average_val_acc = round(
        sum([info[2] for info in model_results]) / len(model_results), 4)
    average_test_loss = round(
        sum([info[3] for info in model_results]) / len(model_results), 4)
    average_test_acc = round(
        sum([info[4] for info in model_results]) / len(model_results), 4)

    best_val = sorted(model_results, key=lambda x: x[2], reverse=True)[:5]
    best_test = sorted(model_results, key=lambda x: x[4], reverse=True)[:5]

    with open(os.path.join(folder, "stats.txt"), "w") as f:
        f.write("Num models = " + str(len(model_results)) + "\n")
        f.write("Average val_loss = " + str(average_val_loss) + "\n")
        f.write("Average val_acc = " + str(average_val_acc) + "\n")
        f.write("Average test_loss = " + str(average_test_loss) + "\n")
        f.write("Average test_acc = " + str(average_test_acc) + "\n")
        f.write("\n")
        f.write("Top 5 validation accuracies:\n")
        for model in best_val:
            f.write("\t" + str(model[2]) + "\t" + str(model[0]) + "\n")
        f.write("\n")
        f.write("Top 5 test accuracies:\n")
        for model in best_test:
            f.write("\t" + str(model[4]) + "\t" + str(model[0]) + "\n")
        f.write("\n")
        f.write("Results for all models:\n")
        for model in sorted(model_results, key=lambda x: x[0]):
            f.write("\t" + str(model) + "\n")

    print()
    print("---------------------- OVERALL STATISTICS -----------------------")
    print()
    with open(os.path.join(folder, "stats.txt"), "r") as f:
        print(f.read())
"""
@copyright: 2013 by Pauli Rikula <*****@*****.**>
@license: MIT <http://www.opensource.org/licenses/mit-license.php>
"""


#for parsing
from parse_data import get_data

from plot_tools import makeplot, daily, monthly

    
if __name__ == "__main__":
    signal = get_data()
    makeplot(signal, daily)
    makeplot(signal, monthly)