Code example #1
# This file trains the neural network using the encoder and decoder.
import __init__
import sys
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os
# custom imports
from network.config import CONFIG
from network.classifier import build_cnn_classifier, get_batch, get_feed_dict
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
from preprocessing.dataset import Dataset
from score import Score

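# Dataset wraps the pretrained embedding file and loads the padded question/context pairs.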
D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question,
              max_length_context) = D.load_questions(CONFIG.QUESTION_FILE_V2)
print("Loaded data")

tf.reset_default_graph()
embedding = tf.placeholder(
    shape=[len(index2embedding), CONFIG.EMBEDDING_DIMENSION],
    dtype=tf.float32,
    name='embedding_ph')
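# Build the classifier graph; it returns the training op, the loss tensor and the
# classifier output, with the embedding matrix supplied via the placeholder above.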
train_op, loss, classifier_out = build_cnn_classifier(embedding)

root_path = __init__.root_path
results_path = root_path + '/resultsclassifier'
model_path = root_path + '/modelclassifier'
Code example #2
import numpy as np

import glvq  # provides LgmlvqModel used below (assumed from the sklearn-glvq package)

from feature_extraction.sliding_window import SlidingWindow
from feature_extraction.static_features import StaticFeatures
from lvq.cross_validation import CrossValidateLvq
from preprocessing.dataset import Dataset

splits = 5

sw_file = '../feature_extraction/data/sliding_window/50_0_10_10.json'
sw = SlidingWindow(file_path=sw_file)
#  sw = SlidingWindow()
sf_file = '../feature_extraction/data/static_features.csv'
sf = StaticFeatures(file_path=sf_file)

ds = Dataset()
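# Concatenate the static features and the sliding-window data and bin the labels.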
df, mid_points = ds.generate_dataset(sw, sf, n_label_bins=20)
df = ds.rescale_labels(df, 5, 11)

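# Recover the number of label bins actually present; rescaling may leave gaps,
# so make sure n_bins covers the largest remaining label index.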
n_bins = len(np.unique(df.binned_points))
largest_label = np.unique(df.binned_points)[-1]
n_bins = n_bins if largest_label == n_bins - 1 else largest_label + 1

model = glvq.LgmlvqModel(prototypes_per_class=1)
cv_lvq = CrossValidateLvq(n_bins, splits, model)
cv_lvq.cross_validate(df, gradient=False)

print()
cv_lvq.print_conv_matrix()
tp = np.sum(cv_lvq.conf_matrix.diagonal())
tot = np.sum(np.sum(cv_lvq.conf_matrix))
Code example #3
def test_Dataset_init():
    newDataset = Dataset("someName")

    assert isinstance(newDataset, Dataset)
    assert newDataset.outputName == "someName"
Code example #4
import tensorflow as tf
from functools import reduce
import os
import pickle
from preprocessing.preprocess import answer_span_to_indices

# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

print("Starting testing on dev file...")
D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question,
              max_length_context) = D.load_questions('data/dev.json')
print("Loaded data")

# split padded data by the first word of the question
split_data_pre = dict()
for qas in padded_data:
    first_word = D.index2word[qas["question"][0]].lower()
    if first_word not in split_data_pre:
        split_data_pre[first_word] = []
    split_data_pre[first_word].append(qas)

# Extract the groups that are larger than the batch size
split_data = dict()
Code example #5
# Imports assumed by this fragment
import sys

import nltk
import tensorflow as tf
import wikipedia

# custom imports
from network.config import CONFIG
from preprocessing.dataset import Dataset

# nltk.download('averaged_perceptron_tagger')

question_asked = input(
    "Enter a \'wh\' question, for example: Who is Sachin Ramesh Tendulkar?\n")
text = nltk.word_tokenize(question_asked)
processed_pos = nltk.pos_tag(text)
text_to_search = ''
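# Take the first verb in the question and use everything after it as the
# phrase to look up on Wikipedia.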
for index in range(len(processed_pos)):
    if ("VB" in processed_pos[index][1]):
        text_to_search = ' '.join(question_asked.split(' ')[index + 1:])
        break

summary = wikipedia.summary(text_to_search)

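# Keep only the first MAX_CONTEXT_LENGTH - 2 tokens of the summary as the context.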
context = ' '.join(summary.split()[:CONFIG.MAX_CONTEXT_LENGTH - 2])
D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
question_encoding, context_encoding = D.encode_single_question(
    question_asked, context, CONFIG.MAX_QUESTION_LENGTH,
    CONFIG.MAX_CONTEXT_LENGTH)

embedding_dimension = 300
init = tf.global_variables_initializer()

latest_checkpoint_path = './model/saved-7'
print("restoring from " + latest_checkpoint_path)
saver = tf.train.import_meta_graph(latest_checkpoint_path + '.meta')

config = tf.ConfigProto()
if '--noGPU' in sys.argv[1:]:
    print("Not using the GPU...")
Code example #6
def test_Dataset_resolveWildcardBranch(selector, inbranches, expectBranches):
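    # Given a wildcard selector and the available branches, the resolved list
    # should match the expected branches.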
    newDataset = Dataset("someName") 
    newDataset.filesAdded = True
    newDataset.branches = inbranches
    
    assert expectBranches == newDataset._resolveWildcardBranch(selector)
Code example #7
def convertTreeMulti(config, treeName, category):
    logging.info("Starting conversion using multi method")
    checkNcreateFolder(config.outputFolder)

    # For multi mode we generate a dataset per sample. Inside the loop the output is
    # disabled, and at the end the dataframes of samples[1:] are appended to the first
    # one and saved.

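    # Budget of events that may still be read across the remaining samples.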
    eventsLeft = config.maxEvents
    dfs = []
    baseDataset = None
    for iSample, sample in enumerate(config.samples):
        logging.info("Processing sample %s", sample)
        if iSample == 0:
            datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[
                category].name
        else:
            datasetName = config.outputPrefix + "_" + config.sampleInfo[
                sample].name + "_" + config.categories[category].name
        dataset = Dataset(datasetName, config.outputFolder, treeName)
        logging.info("Setting sample selection: %s",
                     config.sampleInfo[sample].selection)
        dataset.sampleSelection = config.sampleInfo[sample].selection
        logging.info("Setting category selection: %s",
                     config.categories[category].selection)
        dataset.selection = config.categories[category].selection

        if config.excludeBranches is not None:
            dataset.ignoreBranches = config.excludeBranches
        logging.info("Setting files")
        dataset.addFiles(config.sampleInfo[sample].files)

        logging.info("Setting output branches")

        dataset.setOutputBranches(config.outputVariables)

        logging.debug("Setting indexing branches: %s", config.indexVariables)
        dataset.outputIndex = config.indexVariables

        if config.addRatio:
            dataset.setSF(config.sampleInfo[sample].addSF, "sampleRatio")

        logging.info("Starting processing dataset")
        thisSampleDF = dataset.process(eventsLeft, skipOutput=True)
        eventsLeft -= len(thisSampleDF)
        dfs.append(thisSampleDF)
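        # Keep a copy of the first dataset so it can write the merged output at the end.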
        if iSample == 0:
            baseDataset = copy(dataset)

    baseDataset.makeOutput(pd.concat(dfs))
    logging.info("Finished processing")
Code example #8
def convertTree(config, treeName, category):
    """ Wrapper for the functionality of preprocessing.dataset  """
    logging.info("Starting conversion")

    checkNcreateFolder(config.outputFolder)

    datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[
        category].name
    dataset = Dataset(datasetName, config.outputFolder, treeName)

    logging.info("Setting sample selection: %s", config.sampleSelection)
    dataset.sampleSelection = config.sampleSelection
    logging.info("Setting category selection: %s",
                 config.categories[category].selection)
    dataset.selection = config.categories[category].selection

    if config.excludeBranches is not None:
        dataset.ignoreBranches = config.excludeBranches

    logging.info("Setting files")
    dataset.addFiles(config.files)

    logging.info("Setting output branches")
    dataset.setOutputBranches(config.outputVariables)

    logging.debug("Setting indexing branches: %s", config.indexVariables)
    dataset.outputIndex = config.indexVariables

    if config.addRatio:
        dataset.setSF(config.sampleSF, "sampleRatio")

    logging.info("Starting processing dataset")
    dataset.process(config.maxEvents)

    logging.info("Finished processing")
Code example #9
from feature_extraction.sliding_window import SlidingWindow
from feature_extraction.static_features import StaticFeatures
from lvq.predictor import Predictor
from preprocessing.dataset import Dataset

sw_file = '../feature_extraction/data/sliding_window/50_0_10_10.json'
sw = SlidingWindow(file_path=sw_file)
# Gets the sliding window data from the measurements
# Can be generated from the database or extracted from a file
sf_file = '../feature_extraction/data/static_features.csv'
sf = StaticFeatures(file_path=sf_file)
# Gets the static features for the measure points
# Can be generated from the database or extracted from a file


ds = Dataset()
df, mid_points = ds.generate_dataset(sw, sf, n_label_bins=20)
# Concatenates the static data and the sliding window and bins the labels


# df = ds.rescale_labels(df, 5, 11)
# mid_points = np.array(mid_points)[4:]
# optional, removes barely used labels
# Still needs to be optimized

mpdata = df.loc[df.measure_point_id == 767]
df = df[df.measure_point_id != 767]
#  take 1 measure point for testing and remove it from the dataset

x = df.iloc[:, 2:-4]
# the data to train lvq on,
Code example #10
def main():
    # TODO hyperopt for optimal parameter

    # --- OPEN CONFIGURATION FILE AND GET PARAMETERS ---
    with open('config/config.json', 'r') as config_file:
        json_file = json.load(config_file)

    training_path = json_file['Training']
    validation_path = json_file['Validation']
    test_path = json_file['Test']
    parameters = json_file['Parameters']

    # --- CREATE DATASET AND COMPUTE A PRE-PROCESSING---
    dataset = Dataset(training_path, validation_path, test_path, parameters)
    x_train, y_train = dataset.get_training()
    x_val, y_val = dataset.get_validation()
    x_test, y_test = dataset.get_test()

    # Uncomment these lines to perform a pre-processing study
    # after normalization
    plot_pixel_intensity(x_train[0],
                         './pixel_intensity_after_normalization.png')

    # Verify if dataset is balanced
    counters = dataset.get_counters()
    balance(counters)

    datagen = keras.preprocessing.image.ImageDataGenerator()

    train_batches = len(x_train) // parameters[
        'BatchSize']  # // operator indicates a floor division

    # --- DEFINE MODEL ---
    model = custom_cnn_2(num_classes=parameters['NumClass'],
                         heigth=parameters['Height'],
                         width=parameters['Width'],
                         channels=parameters['Channels'])

    lr = parameters['LearningRate']
    decay = lr / parameters['NumEpoch']

    # --- TRAIN MODEL ---
    model.compile(loss=categorical_crossentropy,
                  optimizer=Adadelta(),
                  metrics=['accuracy'])

    # --- CHECKPOINTING TO SAVE BEST NETWORK ---
    filepath = 'models_saved/custom_cnn_2.hdf5'
    checkpoint = ModelCheckpoint(filepath=filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')

    history = model.fit_generator(generator=datagen.flow(
        x_train, y_train, batch_size=parameters['BatchSize']),
                                  steps_per_epoch=train_batches,
                                  validation_data=[x_val, y_val],
                                  epochs=parameters['NumEpoch'],
                                  callbacks=[Metrics('logs'), checkpoint])

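    # Reload the best weights saved by the checkpoint callback before evaluating.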
    model.load_weights('models_saved/custom_cnn_2.hdf5')
    model.compile(loss=categorical_crossentropy,
                  optimizer=Adadelta(),
                  metrics=['accuracy'])

    # --- EVALUATE MODEL ---
    score = model.evaluate(x=x_val, y=y_val, verbose=0)

    # --- PREDICT NEW VALUES ---
    y_pred = model.predict(
        x_val
    )  # Use validation because in this way we can evaluate some metrics with sklearn (use test with true prediction)

    write_to_file(score, y_val, y_pred, dict_elem, parameters['BatchSize'],
                  parameters['NumEpoch'], "Custom CNN 2")

    classes_predicted = np.argmax(y_pred, axis=1)
    classes_true = np.argmax(y_val, axis=1)

    # --- PLOT RESULTS ---
    plot_learning_curve(history)
    plt.show()

    confusion_mtx = metrics.confusion_matrix(classes_true, classes_predicted)
    plot_confusion_matrix(confusion_mtx, classes=list(dict_elem.values()))
    plt.show()

    # --- SAVE MODEL AND WEIGHTS ---
    model_json = model.to_json()
    with open('models_saved/custom_cnn_2.json', 'w') as mod:
        mod.write(model_json)

    print("Model was saved successfully!")
Code example #11
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os
from preprocessing.preprocess import answer_span_to_indices
# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
#padded_data_squad1, (max_length_question, max_length_context) = D.load_questions('data/train.json')
#padded_data_validation = padded_data_squad1[(int) (CONFIG.TRAIN_PERCENTAGE*len(padded_data_squad1)):]
#untrained_contexts = [x["context"] for x in padded_data_validation]
#print("Loaded data from squad one")

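# Load the padded question/context pairs from the SQuAD 2.0 training file.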
padded_data_squad2, (max_length_question_squad2,
                     max_length_context_squad2) = D.load_questions(
                         'data/train-v2.0.json')
print("padded_data_squad2.len = ", len(padded_data_squad2))
print("Max length from squad 2 q and c: ", max_length_question_squad2,
      max_length_context_squad2)
print("Loaded data from squad two")
'''
padded_data_untrained = [x for x in padded_data_squad2 if x["context"] in untrained_contexts]
unanswerable_data = [x for x in padded_data_untrained if x["answer_start"]==-1]
answerable_data = [x for x in padded_data_untrained if x["answer_start"]>=0]
print("Number of unanswerable questions: ",len(unanswerable_data))