# This file trains the neural network using the encoder and decoder.
import __init__
import sys
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os

# custom imports
from network.config import CONFIG
from network.classifier import build_cnn_classifier, get_batch, get_feed_dict
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
from preprocessing.dataset import Dataset
from score import Score

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question, max_length_context) = D.load_questions(CONFIG.QUESTION_FILE_V2)
print("Loaded data")

tf.reset_default_graph()
embedding = tf.placeholder(
    shape=[len(index2embedding), CONFIG.EMBEDDING_DIMENSION],
    dtype=tf.float32,
    name='embedding_ph')
train_op, loss, classifier_out = build_cnn_classifier(embedding)

root_path = __init__.root_path
results_path = root_path + '/resultsclassifier'
model_path = root_path + '/modelclassifier'
import numpy as np
import glvq  # LgmlvqModel comes from the sklearn-glvq package

from feature_extraction.sliding_window import SlidingWindow
from feature_extraction.static_features import StaticFeatures
from lvq.cross_validation import CrossValidateLvq
from preprocessing.dataset import Dataset

splits = 5

sw_file = '../feature_extraction/data/sliding_window/50_0_10_10.json'
sw = SlidingWindow(file_path=sw_file)
# sw = SlidingWindow()

sf_file = '../feature_extraction/data/static_features.csv'
sf = StaticFeatures(file_path=sf_file)

ds = Dataset()
df, mid_points = ds.generate_dataset(sw, sf, n_label_bins=20)
df = ds.rescale_labels(df, 5, 11)

# Number of label bins actually present; fall back to the largest label + 1
# if some bins are empty.
n_bins = len(np.unique(df.binned_points))
largest_label = np.unique(df.binned_points)[-1]
n_bins = n_bins if largest_label == n_bins - 1 else largest_label + 1

model = glvq.LgmlvqModel(prototypes_per_class=1)
cv_lvq = CrossValidateLvq(n_bins, splits, model)
cv_lvq.cross_validate(df, gradient=False)

print(cv_lvq.print_conv_matrix())
tp = np.sum(cv_lvq.conf_matrix.diagonal())
tot = np.sum(np.sum(cv_lvq.conf_matrix))
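# A plausible next step (a sketch, not part of the original snippet): the two sums
# above give overall accuracy, i.e. the confusion-matrix diagonal (correctly
# classified samples) divided by the total number of samples.
accuracy = float(tp) / tot
print('overall accuracy: {:.3f}'.format(accuracy))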
def test_Dataset_init():
    newDataset = Dataset("someName")
    assert isinstance(newDataset, Dataset)
    assert newDataset.outputName == "someName"
import tensorflow as tf
from functools import reduce
import os
import pickle

from preprocessing.preprocess import answer_span_to_indices

# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

print("Starting testing on dev file...")
D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
padded_data, (max_length_question, max_length_context) = D.load_questions('data/dev.json')
print("Loaded data")

# Split padded data by the first word of the question
split_data_pre = dict()
for qas in padded_data:
    first_word = D.index2word[qas["question"][0]].lower()
    if first_word not in split_data_pre:
        split_data_pre[first_word] = []
    split_data_pre[first_word].append(qas)

# Extract data bigger than batch size
split_data = dict()
# Imports required by this snippet (project modules follow the paths used
# elsewhere in this repository).
import sys

import nltk
import wikipedia
import tensorflow as tf

from preprocessing.dataset import Dataset
from network.config import CONFIG

# nltk.download('averaged_perceptron_tagger')
question_asked = input(
    "Enter a 'wh' question, for example: Who is Sachin Ramesh Tendulkar?\n")
text = nltk.word_tokenize(question_asked)
processed_pos = nltk.pos_tag(text)

# Search Wikipedia for everything after the first verb in the question.
text_to_search = ''
for index in range(len(processed_pos)):
    if "VB" in processed_pos[index][1]:
        text_to_search = ' '.join(question_asked.split(' ')[index + 1:])
        break

summary = wikipedia.summary(text_to_search)
context = ' '.join(summary.split()[:CONFIG.MAX_CONTEXT_LENGTH - 2])

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
question_encoding, context_encoding = D.encode_single_question(
    question_asked, context, CONFIG.MAX_QUESTION_LENGTH, CONFIG.MAX_CONTEXT_LENGTH)
embedding_dimension = 300

init = tf.global_variables_initializer()
latest_checkpoint_path = './model/saved-7'
print("restoring from " + latest_checkpoint_path)
saver = tf.train.import_meta_graph(latest_checkpoint_path + '.meta')
config = tf.ConfigProto()
if '--noGPU' in sys.argv[1:]:
    print("Not using the GPU...")
def test_Dataset_resolveWildcardBranch(selector, inbranches, expectBranches):
    newDataset = Dataset("someName")
    newDataset.filesAdded = True
    newDataset.branches = inbranches
    assert expectBranches == newDataset._resolveWildcardBranch(selector)
def convertTreeMulti(config, treeName, category):
    logging.info("Starting conversion using multi method")
    checkNcreateFolder(config.outputFolder)
    # For multi mode we generate a dataset per sample. Inside the loop the output
    # is disabled; at the end the dataframes of the remaining samples are appended
    # to the first one and saved.
    eventsLeft = config.maxEvents
    dfs = []
    baseDataset = None
    for iSample, sample in enumerate(config.samples):
        logging.info("Processing sample %s", sample)
        if iSample == 0:
            datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[category].name
        else:
            datasetName = config.outputPrefix + "_" + config.sampleInfo[sample].name + "_" + config.categories[category].name
        dataset = Dataset(datasetName, config.outputFolder, treeName)
        logging.info("Setting sample selection: %s", config.sampleInfo[sample].selection)
        dataset.sampleSelection = config.sampleInfo[sample].selection
        logging.info("Setting category selection: %s", config.categories[category].selection)
        dataset.selection = config.categories[category].selection
        if config.excludeBranches is not None:
            dataset.ignoreBranches = config.excludeBranches
        logging.info("Setting files")
        dataset.addFiles(config.sampleInfo[sample].files)
        logging.info("Setting output branches")
        dataset.setOutputBranches(config.outputVariables)
        logging.debug("Setting indexing branches: %s", config.indexVariables)
        dataset.outputIndex = config.indexVariables
        if config.addRatio:
            dataset.setSF(config.sampleInfo[sample].addSF, "sampleRatio")
        logging.info("Starting processing dataset")
        thisSampleDF = dataset.process(eventsLeft, skipOutput=True)
        eventsLeft -= len(thisSampleDF)
        dfs.append(thisSampleDF)
        if iSample == 0:
            baseDataset = copy(dataset)
    baseDataset.makeOutput(pd.concat(dfs))
    logging.info("Finished processing")
def convertTree(config, treeName, category):
    """ Wrapper for the functionality of preprocessing.dataset """
    logging.info("Starting conversion")
    checkNcreateFolder(config.outputFolder)
    datasetName = config.outputPrefix + "_" + config.sampleName + "_" + config.categories[category].name
    dataset = Dataset(datasetName, config.outputFolder, treeName)
    logging.info("Setting sample selection: %s", config.sampleSelection)
    dataset.sampleSelection = config.sampleSelection
    logging.info("Setting category selection: %s", config.categories[category].selection)
    dataset.selection = config.categories[category].selection
    if config.excludeBranches is not None:
        dataset.ignoreBranches = config.excludeBranches
    logging.info("Setting files")
    dataset.addFiles(config.files)
    logging.info("Setting output branches")
    dataset.setOutputBranches(config.outputVariables)
    logging.debug("Setting indexing branches: %s", config.indexVariables)
    dataset.outputIndex = config.indexVariables
    if config.addRatio:
        dataset.setSF(config.sampleSF, "sampleRatio")
    logging.info("Starting processing dataset")
    dataset.process(config.maxEvents)
    logging.info("Finished processing")
from feature_extraction.sliding_window import SlidingWindow
from feature_extraction.static_features import StaticFeatures
from lvq.predictor import Predictor
from preprocessing.dataset import Dataset

sw_file = '../feature_extraction/data/sliding_window/50_0_10_10.json'
sw = SlidingWindow(file_path=sw_file)
# Gets the sliding window data from the measurements.
# Can be generated from the database or extracted from a file.

sf_file = '../feature_extraction/data/static_features.csv'
sf = StaticFeatures(file_path=sf_file)
# Gets the static features by the measure points.
# Can be generated from the database or extracted from a file.

ds = Dataset()
df, mid_points = ds.generate_dataset(sw, sf, n_label_bins=20)
# Concatenates the static data and the sliding window and bins the labels.

# df = ds.rescale_labels(df, 5, 11)
# mid_points = np.array(mid_points)[4:]
# Optional, removes barely used labels. Still needs to be optimized.

# Take one measure point for testing and remove it from the dataset.
mpdata = df.loc[df.measure_point_id == 767]
df = df[df.measure_point_id != 767]

x = df.iloc[:, 2:-4]
# the data to train lvq on,
def main():
    # TODO hyperopt for optimal parameters

    # --- OPEN CONFIGURATION FILE AND GET PARAMETERS ---
    json_file = open('config/config.json', 'r')
    json_file = json.load(json_file)
    training_path = json_file['Training']
    validation_path = json_file['Validation']
    test_path = json_file['Test']
    parameters = json_file['Parameters']

    # --- CREATE DATASET AND COMPUTE A PRE-PROCESSING ---
    dataset = Dataset(training_path, validation_path, test_path, parameters)
    x_train, y_train = dataset.get_training()
    x_val, y_val = dataset.get_validation()
    x_test, y_test = dataset.get_test()

    # Uncomment this line to perform a pre-processing study.
    # After normalization:
    plot_pixel_intensity(x_train[0], './pixel_intensity_after_normalization.png')

    # Verify that the dataset is balanced
    counters = dataset.get_counters()
    balance(counters)

    datagen = keras.preprocessing.image.ImageDataGenerator()
    train_batches = len(x_train) // parameters['BatchSize']  # // is floor division

    # --- DEFINE MODEL ---
    model = custom_cnn_2(num_classes=parameters['NumClass'],
                         heigth=parameters['Height'],
                         width=parameters['Width'],
                         channels=parameters['Channels'])
    lr = parameters['LearningRate']
    decay = lr / parameters['NumEpoch']

    # --- TRAIN MODEL ---
    model.compile(loss=categorical_crossentropy, optimizer=Adadelta(), metrics=['accuracy'])

    # --- CHECKPOINTING TO SAVE BEST NETWORK ---
    filepath = 'models_saved/custom_cnn_2.hdf5'
    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')

    history = model.fit_generator(
        generator=datagen.flow(x_train, y_train, batch_size=parameters['BatchSize']),
        steps_per_epoch=train_batches,
        validation_data=[x_val, y_val],
        epochs=parameters['NumEpoch'],
        callbacks=[Metrics('logs'), checkpoint])

    model.load_weights('models_saved/custom_cnn_2.hdf5')
    model.compile(loss=categorical_crossentropy, optimizer=Adadelta(), metrics=['accuracy'])

    # --- EVALUATE MODEL ---
    score = model.evaluate(x=x_val, y=y_val, verbose=0)

    # --- PREDICT NEW VALUES ---
    # Predict on the validation set so that some metrics can be evaluated with
    # sklearn (use the test set for the true prediction).
    y_pred = model.predict(x_val)
    write_to_file(score, y_val, y_pred, dict_elem, parameters['BatchSize'],
                  parameters['NumEpoch'], "Custom CNN 2")
    classes_predicted = np.argmax(y_pred, axis=1)
    classes_true = np.argmax(y_val, axis=1)

    # --- PLOT RESULTS ---
    plot_learning_curve(history)
    plt.show()
    confusion_mtx = metrics.confusion_matrix(classes_true, classes_predicted)
    plot_confusion_matrix(confusion_mtx, classes=list(dict_elem.values()))
    plt.show()

    # --- SAVE MODEL AND WEIGHTS ---
    model_json = model.to_json()
    with open('models_saved/custom_cnn_2.json', 'w') as mod:
        mod.write(model_json)
    print("Model was saved successfully!")
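# A possible follow-up to the evaluation above (a sketch, not part of the original
# script): a per-class precision/recall/F1 report computed from the same predicted
# and true class indices via sklearn's classification_report. The helper name and
# the use of dict_elem values as class names are assumptions.
from sklearn import metrics as sk_metrics


def report_per_class_metrics(classes_true, classes_predicted, class_names):
    """Print per-class precision, recall and F1 for the CNN predictions."""
    print(sk_metrics.classification_report(classes_true, classes_predicted,
                                            target_names=class_names))

# Hypothetical usage, right after the confusion matrix is plotted in main():
# report_per_class_metrics(classes_true, classes_predicted, list(dict_elem.values()))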
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os

from preprocessing.preprocess import answer_span_to_indices

# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens

# Suppress tensorflow verboseness
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding
#padded_data_squad1, (max_length_question, max_length_context) = D.load_questions('data/train.json')
#padded_data_validation = padded_data_squad1[(int) (CONFIG.TRAIN_PERCENTAGE*len(padded_data_squad1)):]
#untrained_contexts = [x["context"] for x in padded_data_validation]
#print("Loaded data from squad one")

padded_data_squad2, (max_length_question_squad2, max_length_context_squad2) = D.load_questions('data/train-v2.0.json')
print("padded_data_squad2.len = ", len(padded_data_squad2))
print("Max length from squad 2 q and c: ", max_length_question_squad2, max_length_context_squad2)
print("Loaded data from squad two")

'''
padded_data_untrained = [x for x in padded_data_squad2 if x["context"] in untrained_contexts]
unanswerable_data = [x for x in padded_data_untrained if x["answer_start"] == -1]
answerable_data = [x for x in padded_data_untrained if x["answer_start"] >= 0]
print("Number of unanswerable questions: ", len(unanswerable_data))