import tensorflow as tf
from tensorflow import keras

# The fh (file handling), ch (classification helpers), mh (metrics helpers) and
# pmh (prediction model helpers) modules are project-specific and assumed to be
# imported at module level elsewhere.


def test_testing_convolution(text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    load_standard_sets = True
    root_location = fh.get_root_location('data/convolutional_outcome/')
    sets_location = fh.join_paths(root_location, "model_sets")
    checkpoint_path = fh.join_paths(root_location, "model_checkpoints")
    model_path = fh.link_paths(checkpoint_path, 'convolution_model')
    weights_path = fh.link_paths(checkpoint_path, 'convolution_weights')

    # load sets and settings
    model_name = text_vectorizer + '/' + class_vectorizer + '/NN'
    # train_data, test_data, val_data, train_labels, test_labels, val_labels, settings = \
    #     ch.load_sets(sets_location, '2020-01-08T22:28:44.757410')
    # classes, n_classes, vocab_processor, len_vocabulary = settings
    # only for test
    train_data, test_data, val_data, train_labels, test_labels, val_labels, _ = ch.load_sets(sets_location)
    # note: a label that appears only in the test/validation split could be a problem
    sequence_length = train_data.shape[1]

    # compute metrics on the test data; passing None for the model makes
    # run_cnn_test load it from model_path/weights_path (train flag is False)
    model, predictions = pmh.run_cnn_test(None, train_data, train_labels, test_data, test_labels,
                                          val_data, val_labels, model_path, weights_path, False)
    binary_predictions = mh.get_binary_0_5(predictions)

    # display testing metrics
    metrics = mh.get_sequential_metrics(test_labels, predictions, binary_predictions)
    mh.display_sequential_metrics('testing convolution sequence', metrics)

    classifier_name, layers = ch.get_sequential_classifier_information(model)
    ch.save_results(classifier_name + ' Test', metrics, layers, model_name,
                    classif_level, classif_type, dataset_location)
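# mh.get_binary_0_5 is a project helper whose source is not shown in this module.
# A minimal sketch of the assumed behaviour, element-wise thresholding of sigmoid
# outputs at 0.5 (the name below is hypothetical, for illustration only):
import numpy as np

def get_binary_0_5_sketch(predictions):
    # 1 where the predicted probability is >= 0.5, else 0
    return (np.asarray(predictions) >= 0.5).astype(int)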
def test_basic_LSTM(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    print('### LSTM - Testing ###')
    root_location = fh.get_root_location('data/lstm_outcome/')
    doc2vec_model_location = fh.link_paths(fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")
    model_location = fh.link_paths(fh.join_paths(root_location, "lstm_model"), "lstm_model")
    sets_location = fh.join_paths(root_location, "model_sets")
    save_results = True
    load_model = True

    # date problem: load_sets picks the latest saved set unless a timestamp is given
    X_train, X_test, X_val, y_train, y_test, y_val, settings = ch.load_sets(sets_location)
    classes, n_classes, vocab_processor, len_vocabulary = settings

    training_docs_list = X_train['patent_id']
    test_docs_list = X_test['patent_id']
    val_docs_list = X_val['patent_id']

    sequence_size, embedding_size = 1, 150
    X_data, Xv_data, Xt_data = ch.get_df_data(3, training_docs_list, val_docs_list, test_docs_list,
                                              sequence_size, embedding_size, doc2vec_model_location)
    print(X_data.shape)
    input_size, output_size = X_data.shape[2], n_classes

    if load_model:
        model = tf.keras.models.load_model(model_location)

    # batch_size and queue_size are not defined in the original; these defaults are assumed
    batch_size, queue_size = 64, 100
    min_delta, patience = 0.00001, 15
    early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=min_delta,
                                                  patience=patience, verbose=1, mode='auto')
    metrics_callback = MetricsCNNCallback(Xv_data, y_val, patience)
    _, predictions, val_metrics = run_lstm(model, X_data, y_train, Xt_data, y_test, Xv_data, y_val,
                                           batch_size, [early_stopper, metrics_callback], queue_size,
                                           training_docs_list, val_docs_list)
    binary_predictions = mh.get_binary_0_5(predictions)

    print('\nGenerating Testing Metrics')
    # score the predictions on the test split against the test labels
    metrics = mh.get_sequential_metrics(y_test, predictions, binary_predictions)
    mh.display_sequential_metrics('testing LSTM sequence', metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        model_name = text_vectorizer + '/' + class_vectorizer + '/' + classifier_name
        ch.save_results(classifier_name + '_LSTM', metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)

    print('end testing step')
def predict_generator(history, model, x_data, y_label, NN_BATCH_SIZE, QUEUE_SIZE, docs_list):
    # Keras 1 style prediction from a generator; one prediction per document in docs_list
    y_pred = model.predict_generator(
        generator=ch.batch_generator(x_data, y_label, NN_BATCH_SIZE, QUEUE_SIZE, is_mlp=False, validate=True),
        max_q_size=QUEUE_SIZE, val_samples=len(docs_list))
    y_pred_binary = mh.get_binary_0_5(y_pred)
    return history, y_pred, y_pred_binary
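# Model.predict_generator (with max_q_size/val_samples) is the Keras 1 API; in
# tf.keras 2.x it was deprecated and later removed in favour of Model.predict,
# which accepts generators directly. A minimal sketch of the equivalent call; the
# function name is hypothetical and the generator argument is whatever
# ch.batch_generator yields:
import math

def predict_from_generator_tf2(model, generator, n_samples, batch_size, queue_size):
    # steps replaces the Keras 1 val_samples argument: the number of batches
    # needed to cover n_samples predictions
    steps = math.ceil(n_samples / batch_size)
    return model.predict(generator, steps=steps, max_queue_size=queue_size)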
def train_testing_convolution(data_frame, text_vectorizer, class_vectorizer):
    save_standard_sets = True
    root_location = fh.get_root_location('data/convolutional_outcome/')
    sets_location = fh.join_paths(root_location, "model_sets")
    checkpoint_path = fh.join_paths(root_location, "model_checkpoints")
    model_path = fh.link_paths(checkpoint_path, 'convolution_model')
    weights_path = fh.link_paths(checkpoint_path, 'convolution_weights')

    # get sets
    model_name = text_vectorizer + '/' + class_vectorizer + '/NN'
    standard_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    train_data, test_data, train_labels, test_labels, classes, n_classes, vocab_processor, len_vocabulary = standard_results
    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(train_data, train_labels)

    # save sets
    # ch.save_sets(sets_location, train_data, test_data, val_data, train_labels, test_labels, val_labels,
    #              [classes, n_classes, vocab_processor, len_vocabulary])
    # this is for test
    train_data, test_data, val_data, train_labels, test_labels, val_labels, _ = ch.load_sets(sets_location)
    # note: a label that appears only in the test/validation split could be a problem
    sequence_length = train_data.shape[1]

    # define the model
    model = pmh.get_cnn_test(len_vocabulary, n_classes, sequence_length)

    # compute metrics on the validation data (the validation split fills both the
    # test and validation slots; train flag is True)
    model, val_predictions = pmh.run_cnn_test(model, train_data, train_labels, val_data, val_labels,
                                              val_data, val_labels, model_path, weights_path, True)
    binary_val_predictions = mh.get_binary_0_5(val_predictions)
    print(val_labels.shape)
    print(val_predictions.shape)

    # display validation metrics
    metrics = mh.get_sequential_metrics(val_labels, val_predictions, binary_val_predictions)
    mh.display_sequential_metrics('validation convolution sequence', metrics)
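# ch.get_train_test_from_data is assumed to wrap scikit-learn's train_test_split to
# carve a validation split out of the training data. A minimal sketch under that
# assumption (the 80/20 ratio and fixed seed are illustrative, not the project's values):
from sklearn.model_selection import train_test_split

def get_train_test_from_data_sketch(data, labels, val_share=0.2):
    # returns train_data, val_data, train_labels, val_labels
    return train_test_split(data, labels, test_size=val_share, random_state=42)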
def train_basic_LSTM(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    print('### LSTM - Training ###')
    root_location = fh.get_root_location('data/lstm_outcome/')
    doc2vec_model_location = fh.link_paths(fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")
    model_location = fh.link_paths(fh.join_paths(root_location, "lstm_model"), "lstm_model")
    sets_location = fh.join_paths(root_location, "model_sets")
    save_results = False
    save_model = True

    model_name = text_vectorizer + '/' + class_vectorizer + '/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    X_train, X_val, y_train, y_val, classes, n_classes, vocab_processor, len_vocabulary = results
    ch.save_sets(sets_location, X_train, None, X_val, y_train, None, y_val,
                 [classes, n_classes, vocab_processor, len_vocabulary])

    # val_path problem
    training_docs_list = X_train['patent_id']
    val_docs_list = X_val['patent_id']

    sequence_size, embedding_size = 1, 150
    X_data, Xv_data, _ = ch.get_df_data(2, training_docs_list, val_docs_list, None,
                                        sequence_size, embedding_size, doc2vec_model_location)
    # sequence size is the middle dimension of the (samples, sequence, embedding) shape
    print(X_data.shape)
    input_size, output_size = X_data.shape[2], n_classes

    parameters = {
        "estimator__epochs": 200,
        "estimator__batch_size": 64,
        "estimator__optimizer_1": 'Adam', "estimator__optimizer_2": 'Adam',
        "estimator__optimizer_3": 'Adam', "estimator__optimizer_4": 'Adam',
        "estimator__optimizer_5": 'Adam', "estimator__optimizer_6": 'Adam',
        "estimator__init_mode_1": 'uniform', "estimator__init_mode_2": 'uniform',
        "estimator__init_mode_3": 'uniform', "estimator__init_mode_4": 'uniform',
        "estimator__init_mode_5": 'uniform', "estimator__init_mode_6": 'uniform',
        "estimator__activation_1": 'softmax', "estimator__activation_2": 'softmax',
        "estimator__activation_3": 'softmax', "estimator__activation_4": 'softmax',
        "estimator__activation_5": 'softmax', "estimator__activation_6": 'softmax',
        "estimator__dropout_rate_1": .0, "estimator__dropout_rate_2": .0,
        "estimator__dropout_rate_3": .0, "estimator__dropout_rate_4": .0,
        "estimator__weight_constraint_1": 1, "estimator__weight_constraint_2": 1,
        "estimator__weight_constraint_3": 1, "estimator__weight_constraint_4": 1,
        "estimator__weight_constraint_5": 1, "estimator__weight_constraint_6": 1,
        "estimator__neurons_1": n_classes*20, "estimator__neurons_2": n_classes*5,
        "estimator__neurons_3": n_classes*2, "estimator__neurons_4": n_classes*1.5,
        "estimator__filters_1": 16, "estimator__filters_2": 16, "estimator__filters_3": 16,
        "estimator__kernel_size_1": 8, "estimator__kernel_size_2": 8, "estimator__kernel_size_3": 8,
        "estimator__strides_1": 8, "estimator__strides_2": 8, "estimator__strides_3": 8,
        "estimator__activation_lstm_1": 'tanh', "estimator__activation_lstm_2": 'tanh',
        "estimator__activation_lstm_3": 'tanh',
        "estimator__recurrent_activation_1": 'hard_sigmoid',
        "estimator__recurrent_activation_2": 'hard_sigmoid',
        "estimator__recurrent_activation_3": 'hard_sigmoid',
        "estimator__w_dropout_do_1": .2, "estimator__w_dropout_do_2": .2, "estimator__w_dropout_do_3": .2,
        "estimator__u_dropout_do_1": .2, "estimator__u_dropout_do_2": .2, "estimator__u_dropout_do_3": .2,
        "estimator__backwards_1": False, "estimator__backwards_2": False, "estimator__backwards_3": False,
        "estimator__unroll_1": False, "estimator__unroll_2": False, "estimator__unroll_3": False
    }

    # strip the grid-search prefix so the values can be passed to get_lstm
    p = {key.replace('estimator__', ''): value for key, value in parameters.items()}
    # the LSTM output sizes are not part of the parameter dict above; reusing the
    # embedding size here is an assumption
    lstm_output_size_1 = lstm_output_size_2 = lstm_output_size_3 = embedding_size

    # check the X_data shape and vocabulary size
    model = get_lstm(p['optimizer_1'], p['init_mode_1'], p['activation_1'],
                     p['init_mode_2'], p['activation_2'], p['init_mode_3'], p['activation_3'],
                     p['init_mode_4'], p['activation_4'], p['init_mode_5'], p['activation_5'],
                     p['init_mode_6'], p['activation_6'],
                     p['weight_constraint_1'], p['weight_constraint_2'], p['weight_constraint_3'],
                     p['weight_constraint_4'], p['weight_constraint_5'], p['weight_constraint_6'],
                     p['dropout_rate_1'], p['dropout_rate_2'], p['dropout_rate_3'], p['dropout_rate_4'],
                     p['neurons_1'], p['neurons_2'], p['neurons_3'],
                     p['filters_1'], p['filters_2'], p['filters_3'],
                     p['kernel_size_1'], p['kernel_size_2'], p['kernel_size_3'],
                     p['strides_1'], p['strides_2'], p['strides_3'],
                     p['activation_lstm_1'], p['activation_lstm_2'], p['activation_lstm_3'],
                     p['recurrent_activation_1'], p['recurrent_activation_2'], p['recurrent_activation_3'],
                     p['w_dropout_do_1'], p['w_dropout_do_2'], p['w_dropout_do_3'],
                     p['u_dropout_do_1'], p['u_dropout_do_2'], p['u_dropout_do_3'],
                     p['backwards_1'], p['backwards_2'], p['backwards_3'],
                     p['unroll_1'], p['unroll_2'], p['unroll_3'],
                     lstm_output_size_1, lstm_output_size_2, lstm_output_size_3,
                     input_size)

    batch_size = parameters["estimator__batch_size"]
    queue_size = 100  # not defined in the original; this default is assumed
    min_delta, patience = 0.00001, 15
    early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=min_delta,
                                                  patience=patience, verbose=1, mode='auto')
    metrics_callback = MetricsCNNCallback(Xv_data, y_val, patience)
    # there is no test split during training, so the validation split fills both slots
    metrics, val_predictions, val_metrics = run_lstm(model, X_data, y_train, Xv_data, y_val, Xv_data, y_val,
                                                     batch_size, [early_stopper, metrics_callback], queue_size,
                                                     training_docs_list, val_docs_list)
    binary_val_predictions = mh.get_binary_0_5(val_predictions)

    # save the trained model (the original saved before training, which would
    # persist untrained weights)
    if save_model:
        model.save(model_location)

    print('\nGenerating Validation Metrics')
    validation_metrics = mh.get_sequential_metrics(y_val, val_predictions, binary_val_predictions)
    mh.display_sequential_metrics('validation LSTM sequence', validation_metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        model_name = text_vectorizer + '/' + class_vectorizer + '/' + classifier_name
        ch.save_results(classifier_name + '_LSTM', validation_metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)

    print('end training step')
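# MetricsCNNCallback is defined elsewhere in the project. A minimal sketch of a
# Keras callback with the same constructor arguments, scoring the held-out split at
# the end of every epoch (what it records is an assumption, not the project code):
from tensorflow import keras

class MetricsCNNCallbackSketch(keras.callbacks.Callback):
    def __init__(self, val_data, val_labels, patience):
        super().__init__()
        self.val_data = val_data
        self.val_labels = val_labels
        self.patience = patience
        self.epoch_history = []

    def on_epoch_end(self, epoch, logs=None):
        # evaluate on the held-out set and keep the per-epoch results
        results = self.model.evaluate(self.val_data, self.val_labels, verbose=0)
        self.epoch_history.append((epoch, results))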