def test_testing_convolution(text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Run the stored convolutional model against the saved test split.

    The train/test/validation sets are read back with ``ch.load_sets`` from
    ``data/convolutional_outcome/model_sets``. ``pmh.run_cnn_test`` is then
    called with the checkpoint model/weights paths, and the predictions it
    returns are scored, displayed and saved.

    Args:
        text_vectorizer: first segment of the model name stored with the results.
        class_vectorizer: second segment of the model name stored with the results.
        classif_level: passed through to ``ch.save_results``.
        classif_type: passed through to ``ch.save_results``.
        dataset_location: passed through to ``ch.save_results``.
    """
    outcome_root = fh.get_root_location('data/convolutional_outcome/')
    stored_sets_dir = fh.join_paths(outcome_root, "model_sets")
    checkpoint_dir = fh.join_paths(outcome_root, "model_checkpoints")
    cnn_model_file = fh.link_paths(checkpoint_dir, 'convolution_model')
    cnn_weights_file = fh.link_paths(checkpoint_dir, 'convolution_weights')

    results_label = '/'.join((text_vectorizer, class_vectorizer, 'NN'))

    # The seventh element returned by load_sets is handed to run_cnn_test
    # untouched (presumably the stored settings bundle -- TODO confirm).
    (train_data, test_data, val_data,
     train_labels, test_labels, val_labels,
     stored_settings) = ch.load_sets(stored_sets_dir)

    # A label might occur only in the test split, which could be a problem.
    # The value is not used below; the shape access is kept so a malformed
    # train_data still fails here, before the model is run.
    sequence_length = train_data.shape[1]

    # Compute predictions on the test split.
    model, predictions = pmh.run_cnn_test(
        stored_settings,
        train_data, train_labels,
        test_data, test_labels,
        val_data, val_labels,
        cnn_model_file, cnn_weights_file,
        False,
    )
    thresholded = mh.get_binary_0_5(predictions)

    # Score and show the test metrics.
    metrics = mh.get_sequential_metrics(test_labels, predictions, thresholded)
    mh.display_sequential_metrics('testing convolution sequence', metrics)

    classifier_name, layers = ch.get_sequential_classifier_information(model)
    ch.save_results(
        classifier_name + ' Test', metrics, layers, results_label,
        classif_level, classif_type, dataset_location,
    )
def second_attempt_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Train and score the text-convolution model on a vectorized data frame.

    The data frame is vectorized with ``ch.apply_df_vectorizer``, the training
    part is split once more with ``ch.get_train_test_from_data``, and the model
    from ``pmh.get_text_convolutional_from_web`` is run through
    ``pmh.run_text_cnn_model``. Metrics are displayed and saved.

    Args:
        data_frame: input handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: text vectorizer name; also part of the model name.
        class_vectorizer: class vectorizer name; also part of the model name.
        classif_level: passed through to ``ch.save_results``.
        classif_type: passed through to ``ch.save_results``.
        dataset_location: passed through to ``ch.save_results``.
    """
    # The returned location is not used afterwards; the call itself is kept.
    root_location = fh.get_root_location('data/convolutional_outcome/')

    # This routine started from a Keras IMDB walkthrough (word index with
    # PAD/START/UNK/UNUSED slots, sequences post-padded to a fixed length).
    # Here the project vectorizer supplies the integer sequences and the
    # label rows instead.
    results_label = '/'.join((text_vectorizer, class_vectorizer, 'NN'))
    vectorized = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, results_label)
    (train_data, test_data, train_labels, test_labels,
     classes, n_classes, vocab_processor, len_vocabulary) = vectorized

    # A second split of the training part; the split-off portion is not
    # consumed below.
    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(train_data, train_labels)

    model = pmh.get_text_convolutional_from_web(len_vocabulary, n_classes)
    metrics, predictions = pmh.run_text_cnn_model(model, train_data, train_labels, test_data, test_labels)

    classifier_name, layers = ch.get_sequential_classifier_information(model)
    first_metric, second_metric, third_metric = metrics[0], metrics[1], metrics[2]
    mh.display_convolutional_metrics(
        classifier_name, first_metric, second_metric, third_metric, test_labels, predictions)
    ch.save_results(
        classifier_name, metrics, layers, results_label,
        classif_level, classif_type, dataset_location)
def apply_word2vec_extratrees(data_frame, classif_level, classif_type, source_path):
    """Evaluate extra-trees on word2vec features with mean and TF-IDF pooling.

    The ``text`` and ``classification`` columns of ``data_frame`` are tokenized
    in place, the frame is expanded to one row per (text, classification) pair,
    and a word2vec model is fitted on the training texts. Two pipelines are then
    fitted and scored, and each result is saved under the name of the
    vectorizer that actually produced it.

    Fix: the original fitted the TF-IDF pipeline but saved it under the
    "MeanEmbeddingVectorizer" name, and vice versa. Each pipeline is now
    paired with its own label.

    Args:
        data_frame: pandas-style frame with ``text`` and ``classification``
            columns; both columns are overwritten with their tokenized form.
        classif_level: passed through to ``ch.save_results``.
        classif_type: passed through to ``ch.save_results``.
        source_path: passed through to ``ch.save_results``.
    """
    data_frame['text'] = data_frame.apply(
        lambda row: th.tokenize_complex_text(row['text']), axis=1)
    data_frame['classification'] = data_frame.apply(
        lambda row: th.tokenize_complex_text(row['classification']), axis=1)

    # The model does not support multiple targets, so every text is duplicated
    # once per classification it carries.
    df_single_classification = ch.get_list_each_text_a_different_classification(data_frame)
    x = df_single_classification['text']
    y = df_single_classification['classification']
    X_train, X_test, y_train, y_test = ch.get_train_test_from_data(x, y)

    model_w2v = wmh.get_word2vec_model(X_train)

    etree_w2v = Pipeline([
        ("word2vec vectorizer", wmh.MeanEmbeddingVectorizer(model_w2v)),
        ("extra trees", pmh.get_extra_tree()),
    ])
    etree_w2v_tfidf = Pipeline([
        ("word2vec vectorizer", wmh.TfidfEmbeddingVectorizer(model_w2v)),
        ("extra trees", pmh.get_extra_tree()),
    ])

    # As before, the classifier description is taken once from the (still
    # unfitted) mean-embedding pipeline and reused for both saved results.
    classifier_name, parameters = ch.get_extratree_classifier_information(str(etree_w2v))

    labelled_pipelines = (
        ('Word2Vec/MeanEmbeddingVectorizer', etree_w2v),
        ('Word2Vec/TfidfEmbeddingVectorizer', etree_w2v_tfidf),
    )
    for vectorizer_label, pipeline in labelled_pipelines:
        y_pred = pmh.fit_predict_functions(pipeline, X_train, y_train, X_test)
        model_name = '[all classes predictions]' + vectorizer_label + '/' + classifier_name

        # TODO (from the original author): compare against every admissible
        # classification of a text, using the original data frame.
        list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
        # Unpacking is kept: it raises ValueError unless exactly four
        # averages come back.
        none_average, binary_average, micro_average, macro_average = list_metrics

        ch.save_results(classifier_name, list_metrics, parameters, model_name,
                        classif_level, classif_type, source_path)
def test_basic_LSTM(data_frame, text_vectorizer, classif_level, classif_type, dataset_location):
    """Load the stored LSTM model and score it on the persisted data sets.

    Sets are read with ``load_sets`` from ``data/lstm_outcome/model_sets``,
    document vectors are produced by ``ch.get_df_data``, the Keras model is
    loaded from ``lstm_model/lstm_model`` and passed to ``run_lstm``. The
    metrics are displayed and saved.

    NOTE(review): this function reads several names it never binds itself:
    ``class_vectorizer``, ``batch_size``, ``queue_size``, and the unqualified
    ``link_paths``, ``join_paths``, ``load_sets``, ``run_lstm`` and
    ``MetricsCNNCallback``. They must exist at module scope for it to run.
    Other functions in this file reach the path helpers through ``fh`` and
    take ``class_vectorizer`` as a parameter. Confirm before relying on it.

    Args:
        data_frame: not used by the body.
        text_vectorizer: first segment of the saved model name.
        classif_level: passed through to ``ch.save_results``.
        classif_type: passed through to ``ch.save_results``.
        dataset_location: passed through to ``ch.save_results``.
    """
    print('### LSTM Doing Testing ###')

    lstm_root = fh.get_root_location('data/lstm_outcome/')
    doc2vec_dir = fh.join_paths(lstm_root, "doc2vec_model/vocab_model/")
    doc2vec_model_location = fh.link_paths(doc2vec_dir, "doc2vec_model")
    model_location = link_paths(join_paths(lstm_root, "lstm_model"), "lstm_model")
    sets_location = join_paths(lstm_root, "model_sets")

    save_results = True
    load_model = True

    # Original author's remark at this point: "date problem".
    X_train, X_test, X_val, y_train, y_test, y_val, settings = load_sets(sets_location)
    classes, n_classes, vocab_processor, len_vocabulary = settings

    training_docs_list, test_docs_list, val_docs_list = (
        X_train['patent_id'], X_test['patent_id'], X_val['patent_id'])

    sequence_size = 1
    embedding_size = 150
    X_data, Xv_data, Xt_data = ch.get_df_data(
        3, training_docs_list, val_docs_list, test_docs_list,
        sequence_size, embedding_size, doc2vec_model_location)

    print(X_data.shape)
    # Neither value is used below; the indexing is kept so a badly shaped
    # X_data still fails here.
    input_size, output_size = X_data.shape[2], n_classes

    if load_model:
        model = tf.keras.models.load_model(model_location)

    min_delta = 0.00001
    patience = 15
    early_stopper = keras.callbacks.EarlyStopping(
        monitor='val_loss', min_delta=min_delta, patience=patience, verbose=1, mode='auto')
    metrics_callback = MetricsCNNCallback(Xv_data, y_val, patience)
    callbacks = [early_stopper, metrics_callback]

    metrics, predictions, val_metrics = run_lstm(
        model, X_data, y_train, Xt_data, y_test, Xv_data, y_val,
        batch_size, callbacks, queue_size, training_docs_list, val_docs_list)
    binary_predictions = mh.get_binary_0_5(predictions)

    print('\nGenerating Testing Metrics')
    # NOTE(review): scored against y_val although the message says "Testing";
    # confirm which split `predictions` belongs to.
    metrics = mh.get_sequential_metrics(y_val, predictions, binary_predictions)
    # NOTE(review): elsewhere in this file the same helper is also called with
    # a leading name argument; confirm the expected signature.
    mh.display_sequential_metrics(metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        model_name = text_vectorizer + '/' + class_vectorizer + '/' + classifier_name
        ch.save_results(classifier_name + '_LSTM', metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)

    print('end testing step')
def apply_svm(X_train_tfidf, y_train, X_test_tfidf, y_test, classif_level, classif_type, source_path):
    """Fit the project SVC on TF-IDF features, then score and save the result.

    Args:
        X_train_tfidf: training features given to ``pmh.fit_predict_functions``.
        y_train: training targets.
        X_test_tfidf: features to predict on.
        y_test: targets the predictions are scored against.
        classif_level: passed through to ``ch.save_results``.
        classif_type: passed through to ``ch.save_results``.
        source_path: passed through to ``ch.save_results``.
    """
    classifier = pmh.get_SVC()
    # Name and parameters are derived from the estimator's string form before fitting.
    classifier_name, parameters = ch.get_classifier_information(str(classifier))

    predicted = pmh.fit_predict_functions(classifier, X_train_tfidf, y_train, X_test_tfidf)

    model_name = '[all classes predictions]label_encoder/tfidf/' + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, predicted)
    # Unpacking raises ValueError unless exactly four averages come back.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = list_metrics

    ch.save_results(
        classifier_name, list_metrics, parameters, model_name,
        classif_level, classif_type, source_path,
    )
def apply_label_powerset(X_train, y_train, X_test, y_test, baseline_name, source_path):
    """Fit a label-powerset wrapper around logistic regression and save its metrics.

    NOTE(review): ``classif_level`` and ``classif_type`` are neither parameters
    nor locals here, so they are looked up at module scope when
    ``ch.save_results`` is called. Confirm they are defined there.

    Args:
        X_train: training features.
        y_train: training targets.
        X_test: features to predict on.
        y_test: targets the predictions are scored against.
        baseline_name: text inserted between the fixed prefix and the
            classifier name in the saved model name.
        source_path: passed through to ``ch.save_results``.
    """
    powerset = ch.get_label_powerset(pmh.get_logistic())
    # The four integers are forwarded as-is; their meaning is defined by the helper.
    classifier_name, parameters = ch.get_complex_classifier_information(str(powerset), 1, 1, 2, 0)

    predicted = pmh.fit_predict_functions(powerset, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + baseline_name + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, predicted)
    # Unpacking raises ValueError unless exactly four averages come back.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = list_metrics

    ch.save_results(
        classifier_name, list_metrics, parameters, model_name,
        classif_level, classif_type, source_path,
    )
def _precision_recall_f1(true_positives, false_positives, tpfn):
    """Return (precision, recall, f1), using 0 wherever a ratio is undefined."""
    if tpfn == 0 or (true_positives + false_positives) == 0:
        return 0, 0, 0
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / tpfn
    if precision + recall == 0:
        # No true positives at all: F1 is defined as 0 instead of dividing by zero.
        return precision, recall, 0
    return precision, recall, 2 * (precision * recall) / (precision + recall)


def apply_onevsrest(X_train, y_train, X_test, y_test, classes, baseline_name, source_path):
    """Fit a one-vs-rest logistic pipeline per class and report its metrics.

    For every entry of ``classes`` the pipeline is fitted on
    ``y_train[_class]``, scored against ``y_test[_class]``, saved under the
    ``[each class predictions]`` model name and summarised with
    ``mh.display_directly_metrics``.

    Fixes over the original:
      * the F1 expression divided by zero when there were no true positives
        but the zero-guard was not triggered; F1 is now 0 in that case;
      * the ``[all classes predictions]`` display label no longer overwrites
        the model name used when saving per-class results;
      * an empty ``classes`` no longer fails on an unbound loop variable.

    NOTE(review): ``classif_level`` and ``classif_type`` are neither parameters
    nor locals, so they are looked up at module scope. Confirm they exist.
    NOTE(review): the source layout did not make clear whether the summary
    display ran once per class or once after the loop. It runs per class
    here, so no output is lost under either reading.

    Args:
        X_train: training features.
        y_train: training targets, indexable by class.
        X_test: features to predict on.
        y_test: test targets, indexable by class.
        classes: iterable of class keys to process.
        baseline_name: text inserted into the model names.
        source_path: passed through to ``ch.save_results``.
    """
    # X_train.sort_indices() would additionally be needed if the SVC variant is used.
    # Alternatives tried by the author in place of get_logistic(): get_SVC,
    # get_multinomialNB, get_decision_tree, get_kneighbors, get_linear_SVC,
    # get_random_forest_classifier, get_SGD_classifier.
    custom_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(pmh.get_logistic(), n_jobs=-1)),
    ])

    classifier_name, parameters = ch.get_complex_classifier_information(
        str(custom_pipeline), 3, 1, 4, 0)
    model_name = '[each class predictions]' + baseline_name + classifier_name
    summary_name = '[all classes predictions]' + baseline_name + classifier_name

    accuracies = []
    for _class in classes:
        print('**Processing {} texts...**'.format(_class))

        y_pred = pmh.fit_predict_functions(custom_pipeline, X_train, y_train[_class], X_test)
        accuracies.append(mh.get_accuracy_score(y_test[_class], y_pred))

        list_metrics = mh.calculate_metrics(model_name, y_test[_class], y_pred)
        # Unpacking raises ValueError unless exactly four averages come back.
        none_average, binary_average, micro_average, macro_average = list_metrics

        ch.save_results(classifier_name, list_metrics, parameters, model_name,
                        classif_level, classif_type, source_path)

        print('Pipeline score on training {}'.format(
            custom_pipeline.score(X_train, y_train[_class])))

        true_positives, false_positives, tpfn = mh.get_predictions_distribution(
            y_test[_class], y_pred)
        precision, recall, f1 = _precision_recall_f1(true_positives, false_positives, tpfn)
        mh.display_directly_metrics(summary_name, precision, recall, f1, -1)
def apply_adapted_algorithm(X_train, y_train, X_test, y_test, baseline_name, source_path):
    """Fit the project's MLkNN classifier, then score and save its predictions.

    The training features, training targets and test features are converted
    with ``th.get_lil_matrices`` before fitting.

    NOTE(review): ``classif_level`` and ``classif_type`` are neither parameters
    nor locals here, so they are looked up at module scope when
    ``ch.save_results`` is called. Confirm they are defined there.

    Args:
        X_train: training features.
        y_train: training targets.
        X_test: features to predict on.
        y_test: targets the predictions are scored against (not converted).
        baseline_name: text inserted between the fixed prefix and the
            classifier name in the saved model name.
        source_path: passed through to ``ch.save_results``.
    """
    mlknn = pmh.get_MLkNN()
    # The four integers are forwarded as-is; their meaning is defined by the helper.
    classifier_name, parameters = ch.get_complex_classifier_information(str(mlknn), 0, 0, 1, 0)

    X_train, y_train, X_test = th.get_lil_matrices(X_train, y_train, X_test)
    predicted = pmh.fit_predict_functions(mlknn, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + baseline_name + classifier_name
    list_metrics = mh.calculate_metrics(model_name, y_test, predicted)
    # Unpacking raises ValueError unless exactly four averages come back.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = list_metrics

    ch.save_results(
        classifier_name, list_metrics, parameters, model_name,
        classif_level, classif_type, source_path,
    )
def fourth_attemp_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Train the "fourth attempt" CNN on a vectorized frame and save test metrics.

    Fix: ``classifier_name`` was passed to ``mh.display_convolutional_metrics``
    before it was assigned, which raises ``UnboundLocalError``. The classifier
    information is now fetched before the metrics are displayed.

    Args:
        data_frame: input handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: text vectorizer name; also part of the model name.
        class_vectorizer: class vectorizer name; also part of the model name.
        classif_level: passed through to ``ch.save_results``.
        classif_type: passed through to ``ch.save_results``.
        dataset_location: passed through to ``ch.save_results``.
    """
    model_name = text_vectorizer + '/' + class_vectorizer + '/CNN'
    standard_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    (train_data, test_data, train_labels, test_labels,
     classes, n_classes, vocab_processor, len_vocabulary) = standard_results

    # TODO (original author): is converting test_labels with
    # np_utils.to_categorical(test_labels, n_classes) useful here?

    model = pmh.get_fourth_attempt_model_from_web(train_data, n_classes)
    (y_train_predclass, y_test_predclass,
     train_metrics, test_metrics,
     train_predictions, test_predictions) = pmh.run_fourth_attempt_model(
        model, train_data, train_labels, test_data, test_labels)

    # Must come before the display call, which needs the classifier name.
    classifier_name, layers = ch.get_sequential_classifier_information(model)

    # Only the test-side metrics are shown; the train-side display was
    # already disabled by the author.
    mh.display_convolutional_metrics(
        classifier_name, test_metrics[0], test_metrics[1], test_metrics[2],
        test_labels, test_predictions)

    ch.save_results(classifier_name, test_metrics, layers, model_name,
                    classif_level, classif_type, dataset_location)
def apply_doc2vec_logistic_regression(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, source_path):
    """Score logistic regression on doc2vec vectors, then run the improved variant.

    ``ch.apply_doc2vec_separated_train_test`` provides the train/test vectors,
    the doc2vec model and the tagged documents. A logistic regression is fitted
    and its metrics saved, after which ``improved_logistic_regression`` is
    called with the same doc2vec model, estimator and tagged documents.

    Args:
        data_frame: input handed to ``ch.apply_doc2vec_separated_train_test``.
        text_vectorizer: part of the baseline name; forwarded to the improved run.
        class_vectorizer: part of the baseline name; forwarded to the improved run.
        classif_level: passed through to ``ch.save_results`` and the improved run.
        classif_type: passed through to ``ch.save_results`` and the improved run.
        source_path: passed through to ``ch.save_results`` and the improved run.
    """
    baseline_name = '[all classes predictions]' + text_vectorizer + '/' + class_vectorizer
    (X_train, y_train, X_test, y_test,
     model_dbow, train_tagged, test_tagged) = ch.apply_doc2vec_separated_train_test(
        data_frame, baseline_name)

    logreg = pmh.get_logistic()

    doc2vec_name, _doc2vec_params = ch.get_classifier_information(str(model_dbow))
    logreg_name, logreg_params = ch.get_classifier_information(str(logreg))

    predicted = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    # NOTE(review): no '/' is inserted between baseline_name and the doc2vec
    # name; the disabled per-class variant of this code did insert one.
    # Confirm which form the stored results expect.
    model_name = baseline_name + doc2vec_name + '/' + logreg_name
    list_metrics = mh.calculate_metrics(model_name, y_test, predicted)
    # Unpacking raises ValueError unless exactly four averages come back.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = list_metrics

    ch.save_results(logreg_name, list_metrics, logreg_params, model_name,
                    classif_level, classif_type, source_path)

    # A per-class variant (one fit per label column, saved under an
    # '[each class predictions]' baseline) was disabled by the author.

    # Improvement step: repeat with a second doc2vec model merged in.
    improved_logistic_regression(
        train_tagged, test_tagged, model_dbow, logreg,
        text_vectorizer, class_vectorizer,
        classif_level, classif_type, source_path,
    )
def improved_logistic_regression(train_tagged, test_tagged, model_dbow, logreg, text_vectorizer, class_vectorizer, classif_level, classif_type, source_path):
    """Score logistic regression on vectors from two concatenated doc2vec models.

    A second doc2vec model is trained on ``train_tagged.values``, merged with
    ``model_dbow`` via ``wmh.get_concatenated_doc2vec``, and the estimator is
    fitted on vectors inferred from the merged model. Only that final run is
    scored and saved.

    Args:
        train_tagged: tagged training documents; ``.values`` is used to train
            the second doc2vec model.
        test_tagged: tagged test documents.
        model_dbow: already-trained doc2vec model to merge with the new one.
        logreg: estimator handed to ``pmh.fit_predict_functions``.
        text_vectorizer: not used by the body.
        class_vectorizer: not used by the body.
        classif_level: passed through to ``ch.save_results``.
        classif_type: passed through to ``ch.save_results``.
        source_path: passed through to ``ch.save_results``.
    """
    dbow_name, _dbow_params = ch.get_classifier_information(str(model_dbow))
    logreg_name, logreg_params = ch.get_classifier_information(str(logreg))

    model_dmm = wmh.train_doc2vec_with_tagged_data(train_tagged.values)
    dmm_name, _dmm_params = ch.get_classifier_information(str(model_dmm))

    # Intermediate run on the second model alone. Its predictions are
    # overwritten below and never reported; kept because the fit still
    # happens in the original.
    y_train, X_train = wmh.vec_for_learning(model_dmm, train_tagged)
    y_test, X_test = wmh.vec_for_learning(model_dmm, test_tagged)
    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    # Drop training-only state from both models before merging them.
    for doc2vec_model in (model_dbow, model_dmm):
        doc2vec_model.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)

    merged_model = wmh.get_concatenated_doc2vec(model_dbow, model_dmm)

    y_train, X_train = wmh.vec_for_learning(merged_model, train_tagged)
    y_test, X_test = wmh.vec_for_learning(merged_model, test_tagged)
    y_pred = pmh.fit_predict_functions(logreg, X_train, y_train, X_test)

    model_name = '[all classes predictions]' + '/'.join((dbow_name, dmm_name, logreg_name))
    list_metrics = mh.calculate_metrics(model_name, y_test, y_pred)
    # Unpacking raises ValueError unless exactly four averages come back.
    _none_avg, _binary_avg, _micro_avg, _macro_avg = list_metrics

    ch.save_results(logreg_name, list_metrics, logreg_params, model_name,
                    classif_level, classif_type, source_path)
def test_LSTM_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Evaluate the best stored LSTM weights on the test split.

    The frame is vectorized and split, document vectors are built with
    ``ch.get_df_data``, and the parameter-search results pickle is loaded to
    fetch the ``best_weights`` of the configuration returned by
    ``pmh.get_lstm_testing_parameters``. Test metrics are displayed, saved via
    ``ch.save_results`` and added to the test-metrics pickle.

    Changes over the original:
      * every pickle file is opened in a ``with`` block so handles are closed
        and the written metrics are flushed;
      * the two ``Exception`` raises carry the message that was only printed;
      * the metrics path is computed once instead of being rebuilt for loading.

    Args:
        data_frame: input handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: text vectorizer name; also part of the model name.
        class_vectorizer: class vectorizer name; also part of the model name.
        classif_level: used in the results file name and passed to ``ch.save_results``.
        classif_type: used in both file names and passed to ``ch.save_results``.
        dataset_location: passed through to ``ch.save_results``.

    Raises:
        Exception: if the configuration is missing from the parameter-search
            results, or if test metrics for it were already stored.
    """
    print('### LSTM Doing Testing ###')

    root_location = fh.get_root_location('data/lstm_outcome/')
    nn_parameter_search_location = fh.join_paths(root_location, "nn_fhv_parameter_search")
    doc2vec_model_location = fh.link_paths(
        fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")

    save_results = True
    sequence_size = 1
    EMBEDDING_SIZE = 150

    model_name = text_vectorizer + '/' + class_vectorizer + '/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = results
    X_train, X_val, y_train, y_val = ch.get_train_test_from_data(X_train, y_train)

    training_docs_list = X_train['patent_id']
    test_docs_list = X_test['patent_id']
    val_docs_list = X_val['patent_id']

    X_data, Xv_data, Xt_data = ch.get_df_data(
        3, training_docs_list, val_docs_list, test_docs_list,
        sequence_size, EMBEDDING_SIZE, doc2vec_model_location)
    GLOBAL_VARS.DOC2VEC_MODEL_NAME, GLOBAL_VARS.MODEL_NAME = wmh.set_parameters_lstm_doc2vec(
        nn_parameter_search_location, classif_level, classif_type)

    NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS = pmh.get_lstm_shapes(X_data, n_classes)
    NN_BATCH_SIZE, PARTS_LEVEL, NN_MAX_EPOCHS, QUEUE_SIZE = pmh.get_lstm_basic_parameters()
    params = pmh.get_lstm_testing_parameters()
    (lstm_output_size, w_dropout_do, u_dropout_do, stack_layers,
     conv_size, conv_filter_length, conv_max_pooling_length) = params
    EARLY_STOPPER_MIN_DELTA, EARLY_STOPPER_PATIENCE = pmh.get_early_stopping_parameters()

    TEST_METRICS_FILENAME = '{}_level_{}_standard_nn_test_metrics_dict.pkl'

    test_metrics_dict = dict()
    model_directory = fh.link_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME)
    test_metrics_path = fh.link_paths(
        model_directory, TEST_METRICS_FILENAME.format(classif_type, PARTS_LEVEL))
    param_results_path = fh.link_paths(
        model_directory,
        NN_PARAMETER_SEARCH_PREFIX.format(classif_type, classif_level, NN_BATCH_SIZE))

    # NOTE: pickle.load can execute arbitrary code; only load result files
    # written by this project's own training run.
    with open(param_results_path, 'rb') as results_file:
        param_results_dict = pickle.load(results_file)

    GLOBAL_VARS.NN_MODEL_NAME = 'lstm_size_{}_w-drop_{}_u-drop_{}_stack_{}_conv_{}'.format(
        lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, str(conv_size))
    if conv_size:
        GLOBAL_VARS.NN_MODEL_NAME += '_conv-filter-length_{}_max-pooling-size_{}'.format(
            conv_filter_length, conv_max_pooling_length)

    if GLOBAL_VARS.NN_MODEL_NAME not in param_results_dict:
        message = "Can't find model: {}".format(GLOBAL_VARS.NN_MODEL_NAME)
        print(message)
        raise Exception(message)

    if fh.ensure_exists_path_location(test_metrics_path):
        with open(test_metrics_path, 'rb') as metrics_file:
            test_metrics_dict = pickle.load(metrics_file)
        if GLOBAL_VARS.NN_MODEL_NAME in test_metrics_dict:
            message = "Test metrics already exist for: {}".format(GLOBAL_VARS.NN_MODEL_NAME)
            print(message)
            test_metrics = test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME]

            print("** Test Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}".format(
                test_metrics['coverage_error'], test_metrics['average_num_of_labels'],
                test_metrics['top_1'], test_metrics['top_3'], test_metrics['top_5'],
                test_metrics['f1_micro'], test_metrics['f1_macro']))
            # Abort as before instead of re-evaluating a stored configuration.
            raise Exception(message)

    print('***************************************************************************************')
    print(GLOBAL_VARS.NN_MODEL_NAME)

    model = pmh.get_keras_rnn_model(
        NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS,
        lstm_output_size, w_dropout_do, u_dropout_do, stack_layers,
        conv_size, conv_filter_length, conv_max_pooling_length)

    # Restore the best weights recorded for this configuration.
    weights = param_results_dict[GLOBAL_VARS.NN_MODEL_NAME]['best_weights']
    model.set_weights(weights)

    print('Evaluating on Test Data using best weights')
    _, ytp, ytp_binary = pmh.predict_generator(
        None, model, Xt_data, y_test, NN_BATCH_SIZE, QUEUE_SIZE, test_docs_list)

    print('Generating Test Metrics')
    test_metrics = mh.get_sequential_metrics(y_test, ytp, ytp_binary)
    mh.display_sequential_metrics(test_metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        ch.save_results(classifier_name + '_LSTM', test_metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)

        test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME] = test_metrics
        with open(test_metrics_path, 'wb') as metrics_file:
            pickle.dump(test_metrics_dict, metrics_file)
def train_LSTM_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Run the LSTM parameter search and persist per-configuration results.

    For every parameter set yielded by ``pmh.get_lstm_training_parameters`` a
    Keras model is built, trained through ``pmh.fit_predict_generator`` and
    scored on the validation split. Its metrics are saved via
    ``ch.save_results``, and its epochs, best weights, losses and duration are
    stored in a dict that is pickled at the end. Configurations already in a
    previously saved results file are skipped.

    Changes over the original:
      * the results pickle is written inside a ``with`` block, so the file is
        closed and flushed (the handle was never closed before);
      * the previous-results pickle is also read inside a ``with`` block;
      * the results path is computed once instead of three times.

    Args:
        data_frame: input handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: text vectorizer name; also part of the model name.
        class_vectorizer: class vectorizer name; also part of the model name.
        classif_level: used in the results file name and passed to ``ch.save_results``.
        classif_type: used in the results file name, the metrics callback and
            ``ch.save_results``.
        dataset_location: passed through to ``ch.save_results``.
    """
    print('### LSTM Doing Training ###')

    root_location = fh.get_root_location('data/lstm_outcome/')

    # exports_location is not used below; the helper call is kept as-is.
    exports_location = fh.join_paths(root_location, "exported_data/")
    matrices_save_location = fh.join_paths(root_location, "fhv_matrices/")
    nn_parameter_search_location = fh.join_paths(root_location, "nn_fhv_parameter_search")
    doc2vec_model_location = fh.link_paths(
        fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")

    load_existing_results = True
    save_results = True

    sequence_size = 1
    EMBEDDING_SIZE = 150

    model_name = text_vectorizer + '/' + class_vectorizer + '/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    # The second split returned by the vectorizer is used as validation data here.
    X_train, X_val, y_train, y_val, classes, n_classes, vocab_processor, len_vocabulary = results

    training_docs_list = X_train['patent_id']
    val_docs_list = X_val['patent_id']

    X_data, Xv_data, _ = ch.get_df_data(
        2, training_docs_list, val_docs_list, None,
        sequence_size, EMBEDDING_SIZE, doc2vec_model_location)
    GLOBAL_VARS.DOC2VEC_MODEL_NAME, GLOBAL_VARS.MODEL_NAME = wmh.set_parameters_lstm_doc2vec(
        nn_parameter_search_location, classif_level, classif_type)

    NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS = pmh.get_lstm_shapes(X_data, n_classes)
    NN_BATCH_SIZE, PARTS_LEVEL, NN_MAX_EPOCHS, QUEUE_SIZE = pmh.get_lstm_basic_parameters()
    param_sampler = pmh.get_lstm_training_parameters()
    EARLY_STOPPER_MIN_DELTA, EARLY_STOPPER_PATIENCE = pmh.get_early_stopping_parameters()

    param_results_dict = dict()
    param_results_path = fh.link_paths(
        fh.join_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME),
        NN_PARAMETER_SEARCH_PREFIX.format(classif_type, classif_level, NN_BATCH_SIZE))
    # Create the folder part of the path (everything before the last '/').
    index = param_results_path.rfind('/')
    fh.create_folder(param_results_path[:index])

    # A hand-built single-configuration Keras model used to live here as
    # disabled code; pmh.get_keras_rnn_model below replaces it.

    # Reload earlier results so configurations that were already tested are skipped.
    if load_existing_results:
        if fh.ensure_exists_path_location(param_results_path):
            print('Loading Previous results in {}'.format(param_results_path))
            # NOTE: pickle.load can execute arbitrary code; only load files
            # written by this project.
            with open(param_results_path, 'rb') as results_file:
                param_results_dict = pickle.load(results_file)
        else:
            print('No Previous results exist in {}'.format(param_results_path))

    for params in param_sampler:
        start_time = time.time()
        lstm_output_size = params['lstm_output_size']
        w_dropout_do = params['w_dropout']
        u_dropout_do = params['u_dropout']
        stack_layers = params['stack_layers']
        conv_size = params['conv_size']
        conv_filter_length = params['conv_filter_length']
        conv_max_pooling_length = params['conv_max_pooling_length']

        GLOBAL_VARS.NN_MODEL_NAME = 'lstm_size_{}_w-drop_{}_u-drop_{}_stack_{}_conv_{}'.format(
            lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, str(conv_size))
        if conv_size:
            GLOBAL_VARS.NN_MODEL_NAME += '_conv-filter-length_{}_max-pooling-size_{}'.format(
                conv_filter_length, conv_max_pooling_length)

        if GLOBAL_VARS.NN_MODEL_NAME in param_results_dict:
            print("skipping: {}".format(GLOBAL_VARS.NN_MODEL_NAME))
            continue

        # Build the actual Keras model for this configuration.
        model = pmh.get_keras_rnn_model(
            NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS,
            lstm_output_size, w_dropout_do, u_dropout_do, stack_layers,
            conv_size, conv_filter_length, conv_max_pooling_length)

        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        model_name = text_vectorizer + '/' + class_vectorizer + '/' + classifier_name

        input_matrix = fh.join_paths(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
        early_stopper = keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=EARLY_STOPPER_MIN_DELTA,
            patience=EARLY_STOPPER_PATIENCE, verbose=1, mode='auto')
        metrics_callback = mh.MetricsCallback(
            input_matrix, classif_type, PARTS_LEVEL, NN_BATCH_SIZE, is_mlp=False)

        history, yvp, yvp_binary = pmh.fit_predict_generator(
            model, X_data, y_train, Xv_data, y_val,
            training_docs_list, val_docs_list,
            early_stopper, metrics_callback,
            NN_BATCH_SIZE, NN_MAX_EPOCHS, QUEUE_SIZE)

        print('\nGenerating Validation Metrics')
        validation_metrics = mh.get_sequential_metrics(y_val, yvp, yvp_binary)
        mh.display_sequential_metrics(classifier_name, validation_metrics)

        run_record = dict()
        param_results_dict[GLOBAL_VARS.NN_MODEL_NAME] = run_record
        run_record['epochs'] = len(history.history['val_loss'])
        run_record['best_weights'] = metrics_callback.best_weights
        run_record['best_val_loss'] = metrics_callback.best_val_loss
        run_record['training_loss'] = metrics_callback.losses
        run_record['validation_loss'] = metrics_callback.val_losses
        run_record['duration'] = time.time() - start_time

        ch.delete_variables(history, metrics_callback)

        ch.save_results(classifier_name + '_LSTM', validation_metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)

    if save_results:
        with open(param_results_path, 'wb') as results_file:
            pickle.dump(param_results_dict, results_file)
def third_attempt_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Run the web-example image CNN on vectorized text, one label column at a time.

    The routine was adapted from a Fashion-MNIST image example; the image
    loading and 0-1 pixel normalisation of that example are not used here.
    Instead the data frame is vectorized with ``ch.apply_df_vectorizer`` and
    the train/test matrices are reshaped so that an axis of length 1 sits
    between their first and second dimensions.

    For every column of the training label matrix a new model is requested
    from ``pmh.get_image_convolutional_from_web``, run through
    ``pmh.run_image_cnn_model``, displayed and stored via ``ch.save_results``.

    Args:
        data_frame: input handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: text vectorizer name; also the first part of the model name.
        class_vectorizer: class vectorizer name; also the second part of the model name.
        classif_level: forwarded unchanged to ``ch.save_results``.
        classif_type: forwarded unchanged to ``ch.save_results``.
        dataset_location: forwarded unchanged to ``ch.save_results``.
    """

    def _insert_unit_axis(matrix):
        # (rows, cols) -> (rows, 1, cols)
        return np.reshape(matrix, [matrix.shape[0], 1, matrix.shape[1]])

    model_name = '/'.join((text_vectorizer, class_vectorizer, 'CNN'))

    (features_train, features_test, labels_train, labels_test,
     _classes, n_classes, _vocab_processor, _len_vocabulary) = ch.apply_df_vectorizer(
        data_frame, text_vectorizer, class_vectorizer, model_name)

    features_train = _insert_unit_axis(features_train)
    features_test = _insert_unit_axis(features_test)

    print(labels_train)
    print(labels_train.shape)

    label_column_count = labels_train.shape[1]
    for column in range(label_column_count):
        # Treat the current label column as its own target vector.
        column_train_labels = labels_train[:, column]
        column_test_labels = labels_test[:, column]
        print(column_train_labels)

        model = pmh.get_image_convolutional_from_web(features_train, n_classes)
        metrics, predictions = pmh.run_image_cnn_model(
            model,
            features_train, column_train_labels,
            features_test, column_test_labels)
        print("predictions: ", predictions)

        classifier_name, layers = ch.get_sequential_classifier_information(model)
        mh.display_convolutional_metrics(
            classifier_name,
            metrics[0], metrics[1], metrics[2],
            column_test_labels, predictions)
        ch.save_results(
            classifier_name, metrics, layers, model_name,
            classif_level, classif_type, dataset_location)
def train_basic_LSTM(data_frame, text_vectorizer, classif_level, classif_type, dataset_location):
    """Vectorize the data frame, build an LSTM model, train it and report validation metrics.

    Steps, as written below:
      1. derive the output locations under ``data/lstm_outcome/``;
      2. vectorize ``data_frame`` and persist the train/validation sets;
      3. load per-document data through ``ch.get_df_data``;
      4. build the model with ``get_lstm`` and optionally save it;
      5. train through ``run_lstm`` with early stopping and a metrics callback;
      6. compute and display validation metrics, optionally saving them.

    Args:
        data_frame: input handed to ``apply_df_vectorizer``.
        text_vectorizer: text vectorizer name; also used in the model name.
        classif_level: forwarded to ``ch.save_results`` (only when ``save_results`` is True).
        classif_type: forwarded to ``ch.save_results`` (only when ``save_results`` is True).
        dataset_location: forwarded to ``ch.save_results`` (only when ``save_results`` is True).

    NOTE(review): several names used in this body are never bound inside it
    (see the inline notes). Unless they exist at module level, the function
    raises ``NameError`` before any training happens - confirm and fix.
    """
    print('### LSTM - Training ###')

    # Output locations for this experiment.
    root_location = get_root_location('data/lstm_outcome/')
    doc2vec_model_location = link_paths(join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")
    model_location = link_paths(join_paths(root_location, "lstm_model"), "lstm_model")
    sets_location = join_paths(root_location, "model_sets")

    # Hard-coded switches: the model is saved, the metrics are not.
    save_results = False
    save_model = True

    # NOTE(review): `class_vectorizer` is neither a parameter nor assigned in
    # this function, unlike the sibling functions that receive it as an argument.
    model_name = text_vectorizer+'/'+class_vectorizer+'/LSTM'
    results = apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    X_train, X_val, y_train, y_val, classes, n_classes, vocab_processor, len_vocabulary = results

    # The two `None` arguments sit between the training and validation entries;
    # presumably they are the (absent) test data and test labels - TODO confirm.
    save_sets(sets_location, X_train, None, X_val, y_train, None, y_val,
              [classes, n_classes, vocab_processor, len_vocabulary])  # val_path problem

    training_docs_list = X_train['patent_id']
    val_docs_list = X_val['patent_id']

    sequence_size, embedding_size = 1, 150
    X_data, Xv_data, _ = ch.get_df_data(2, training_docs_list, val_docs_list, None,
                                        sequence_size, embedding_size, doc2vec_model_location)
    # sequence size is the (, number, ) in the tuple data
    print(X_data.shape)
    # `output_size` is not used again below.
    input_size, output_size = X_data.shape[2], n_classes

    # NOTE(review): this dict is never passed to `get_lstm` or `run_lstm`; it is
    # only rebound inside the `if save_results:` branch at the end. Its keys
    # mirror most of the bare names passed to `get_lstm` below, but with an
    # `estimator__` prefix (and `estimator__filters` has no `_1` suffix).
    parameters = {
        "estimator__epochs": 200,
        "estimator__batch_size": 64,
        "estimator__optimizer_1": 'Adam',
        "estimator__optimizer_2": 'Adam',
        "estimator__optimizer_3": 'Adam',
        "estimator__optimizer_4": 'Adam',
        "estimator__optimizer_5": 'Adam',
        "estimator__optimizer_6": 'Adam',
        "estimator__init_mode_1": 'uniform',
        "estimator__init_mode_2": 'uniform',
        "estimator__init_mode_3": 'uniform',
        "estimator__init_mode_4": 'uniform',
        "estimator__init_mode_5": 'uniform',
        "estimator__init_mode_6": 'uniform',
        "estimator__activation_1": 'softmax',
        "estimator__activation_2": 'softmax',
        "estimator__activation_3": 'softmax',
        "estimator__activation_4": 'softmax',
        "estimator__activation_5": 'softmax',
        "estimator__activation_6": 'softmax',
        "estimator__dropout_rate_1": .0,
        "estimator__dropout_rate_2": .0,
        "estimator__dropout_rate_3": .0,
        "estimator__dropout_rate_4": .0,
        "estimator__weight_constraint_1": 1,
        "estimator__weight_constraint_2": 1,
        "estimator__weight_constraint_3": 1,
        "estimator__weight_constraint_4": 1,
        "estimator__weight_constraint_5": 1,
        "estimator__weight_constraint_6": 1,
        "estimator__neurons_1": n_classes*20,
        "estimator__neurons_2": n_classes*5,
        "estimator__neurons_3": n_classes*2,
        "estimator__neurons_4": n_classes*1.5,
        "estimator__filters": 16,
        "estimator__filters_2": 16,
        "estimator__filters_3": 16,
        "estimator__kernel_size_1": 8,
        "estimator__kernel_size_2": 8,
        "estimator__kernel_size_3": 8,
        "estimator__strides_1": 8,
        "estimator__strides_2": 8,
        "estimator__strides_3": 8,
        "estimator__activation_lstm_1": 'tanh',
        "estimator__activation_lstm_2": 'tanh',
        "estimator__activation_lstm_3": 'tanh',
        "estimator__recurrent_activation_1": 'hard_sigmoid',
        "estimator__recurrent_activation_2": 'hard_sigmoid',
        "estimator__recurrent_activation_3": 'hard_sigmoid',
        "estimator__w_dropout_do_1": .2,
        "estimator__w_dropout_do_2": .2,
        "estimator__w_dropout_do_3": .2,
        "estimator__u_dropout_do_1": .2,
        "estimator__u_dropout_do_2": .2,
        "estimator__u_dropout_do_3": .2,
        "estimator__backwards_1": False,
        "estimator__backwards_2": False,
        "estimator__backwards_3": False,
        "estimator__unroll_1": False,
        "estimator__unroll_2": False,
        "estimator__unroll_3": False
    }

    # NOTE(review): of the arguments below only `input_size` is bound in this
    # function. `optimizer`, `filters_1` and `lstm_output_size_1..3` do not even
    # have a same-named entry in `parameters` above.
    model = get_lstm(optimizer,
                     init_mode_1, activation_1, init_mode_2, activation_2, init_mode_3, activation_3,
                     init_mode_4, activation_4, init_mode_5, activation_5, init_mode_6, activation_6,
                     weight_constraint_1, weight_constraint_2, weight_constraint_3,
                     weight_constraint_4, weight_constraint_5, weight_constraint_6,
                     dropout_rate_1, dropout_rate_2, dropout_rate_3, dropout_rate_4,
                     neurons_1, neurons_2, neurons_3,
                     filters_1, filters_2, filters_3,
                     kernel_size_1, kernel_size_2, kernel_size_3,
                     strides_1, strides_2, strides_3,
                     activation_lstm_1, activation_lstm_2, activation_lstm_3,
                     recurrent_activation_1, recurrent_activation_2, recurrent_activation_3,
                     w_dropout_do_1, w_dropout_do_2, w_dropout_do_3,
                     u_dropout_do_1, u_dropout_do_2, u_dropout_do_3,
                     backwards_1, backwards_2, backwards_3,
                     unroll_1, unroll_2, unroll_3,
                     lstm_output_size_1, lstm_output_size_2, lstm_output_size_3,
                     input_size)  # input size
    # check the X_data shape and vocabulary size
    # check the X_data shape and vocabulary size

    # The model is written to disk here, i.e. before `run_lstm` trains it.
    if save_model:
        model.save(model_location)

    # Stop when `val_loss` has not improved by at least `min_delta` for `patience` epochs.
    min_delta, patience = 0.00001, 15
    early_stopper = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=min_delta,
                                                  patience=patience, verbose=1, mode='auto')
    metrics_callback = MetricsCNNCallback(Xv_data, y_val, patience)

    # NOTE(review): `Xt_data`, `y_test`, `batch_size` and `queue_size` are not
    # bound anywhere in this function.
    metrics, val_predictions, val_metrics = run_lstm(model, X_data, y_train, Xt_data, y_test,
                                                     Xv_data, y_val, batch_size,
                                                     [early_stopper, metrics_callback], queue_size,
                                                     training_docs_list, val_docs_list)
    binary_val_predictions = mh.get_binary_0_5(val_predictions)

    print('\nGenerating Validation Metrics')
    validation_metrics = mh.get_sequential_metrics(y_val, val_predictions, binary_val_predictions)
    # NOTE(review): other call sites in this file pass a display name as the
    # first argument to `mh.display_sequential_metrics`; here only the metrics
    # are passed - confirm the expected signature.
    mh.display_sequential_metrics(validation_metrics)

    # Skipped with the current hard-coded `save_results = False`.
    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        model_name = text_vectorizer+'/'+class_vectorizer+'/'+classifier_name
        ch.save_results(classifier_name+'_LSTM', validation_metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)

    print('end training step')
def _f1_from_precision_recall(precision, recall):
    """Return the F1 score (harmonic mean) of *precision* and *recall*.

    Returns 0.0 when ``precision + recall`` is zero instead of raising
    ``ZeroDivisionError``.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator


def apply_fasttext(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Train a fastText model, evaluate it on the testing set and store the results.

    The model is obtained from ``pmh.get_fasttext`` using ``training set.csv``
    under ``data/fasttext_outcome/``, saved with a timestamped file name, and
    evaluated on ``testing set.csv`` both through ``pmh.predict_test_fasttext``
    and through ``model.test`` for k in (15, 8, 5, 3, 1). Precision, recall and
    the derived F1 are displayed for each evaluation; the unrestricted result
    is saved with ``ch.save_results`` and the model labels are printed.

    Args:
        data_frame: not used by this function; kept for a uniform signature.
        text_vectorizer: first component of the stored model name.
        class_vectorizer: second component of the stored model name.
        classif_level: forwarded unchanged to ``ch.save_results``.
        classif_type: forwarded unchanged to ``ch.save_results``.
        dataset_location: forwarded unchanged to ``ch.save_results``.
    """
    root_location = fh.get_root_location('data/fasttext_outcome/')
    fasttext_location = fh.join_paths(root_location, 'fasttext_models/')
    # The result is not used below; the call is kept in case `join_paths`
    # has side effects such as creating the directory - TODO confirm.
    fh.join_paths(root_location, 'word2vec_models/')

    timestamp = datetime.datetime.now().isoformat()

    training_set_path = fh.link_paths(root_location, 'training set.csv')
    testing_set_path = fh.link_paths(root_location, 'testing set.csv')

    # Alternatives tried earlier: fasttext.train_unsupervised, and
    # fasttext.train_supervised with autotuning against 'validating set.csv'.
    model, parameters = pmh.get_fasttext(training_set_path)
    pmh.save_fasttext_model(
        model, fh.link_paths(fasttext_location, 'model ' + timestamp + '.bin'))

    test_labels, predictions, results = pmh.predict_test_fasttext(model, testing_set_path)

    # Evaluate every cut-off before anything is displayed, as before.
    top_k_results = [(k, model.test(testing_set_path, k=k)) for k in (15, 8, 5, 3, 1)]

    classifier_name = ch.get_fasttext_classifier_information(str(model))
    model_name = text_vectorizer + '/' + class_vectorizer + '/' + classifier_name

    # Each result is indexed as (count, precision, recall); `results` from
    # `predict_test_fasttext` is assumed to share that layout - TODO confirm.
    labelled_results = [('k=-1 ', results)]
    labelled_results.extend(('k= {} '.format(k), result) for k, result in top_k_results)
    for prefix, result in labelled_results:
        precision, recall = result[1], result[2]
        mh.display_directly_metrics(
            prefix + classifier_name, -1, precision, recall,
            _f1_from_precision_recall(precision, recall))

    ch.save_results(classifier_name, results, parameters, model_name,
                    classif_level, classif_type, dataset_location)

    # The return value was never used; the call itself is kept in case it
    # reports the metrics as a side effect - TODO confirm.
    mh.calculate_manual_metrics(model_name, test_labels, predictions)

    print(model.labels)
def apply_multi_label_classification_without_pipeline(
        data_frame, text_vectorizer, class_vectorizer, classif_level,
        classif_type, source_path):
    """Fit a one-vs-rest classifier on the vectorized data frame and store its metrics.

    The data frame is vectorized with ``ch.apply_df_vectorizer``; a
    ``OneVsRestClassifier`` around ``pmh.get_logistic()`` is then fitted and
    used for prediction once per class column. The per-class predictions are
    transposed, scored with ``mh.calculate_manual_metrics`` and
    ``mh.calculate_metrics``, and saved through ``ch.save_results``.

    Args:
        data_frame: input handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: text vectorizer name; also part of the model name.
        class_vectorizer: class vectorizer name; also part of the model name.
        classif_level: forwarded unchanged to ``ch.save_results``.
        classif_type: forwarded unchanged to ``ch.save_results``.
        source_path: forwarded unchanged to ``ch.save_results``.
    """
    baseline_name = text_vectorizer + '/' + class_vectorizer + '/onevsrest/'

    vectorizer_results = ch.apply_df_vectorizer(data_frame, text_vectorizer,
                                                class_vectorizer,
                                                '[both]' + baseline_name)
    X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = vectorizer_results

    # len_vocabulary = 57335
    # len_vocabulary = 34736
    print(X_train[0])
    print('len_vocabulary: ', len_vocabulary, ' num_classes: ', n_classes)

    # Run classifier
    # Exactly one base estimator is active; the others are kept as alternatives.
    classifier = OneVsRestClassifier(pmh.get_logistic())  # here
    # classifier = OneVsRestClassifier(pmh.get_SVC())  # here
    # classifier = OneVsRestClassifier(pmh.get_multinomialNB())  # ERROR WORD2VEC/DOC2VEC (X has negative value)
    # classifier = OneVsRestClassifier(pmh.get_decision_tree())  # here
    # classifier = OneVsRestClassifier(pmh.get_kneighbors())  # here
    # classifier = OneVsRestClassifier(pmh.get_linear_SVC())
    # classifier = OneVsRestClassifier(pmh.get_random_forest_classifier())  # here
    # classifier = OneVsRestClassifier(pmh.get_SGD_classifier())

    # One row per class; the test predictions are transposed after the loop.
    # `np.ndarray(shape=...)` allocates without initialising the values.
    train_predictions = np.ndarray(shape=(n_classes, y_train.shape[0]),
                                   dtype=int)
    predictions = np.ndarray(shape=(n_classes, y_test.shape[0]), dtype=int)

    ###
    precision = dict()
    recall = dict()
    average_precision = dict()

    classifier_name, parameters = ch.get_complex_classifier_information(
        str(classifier), 1, 1, 2, 0)

    # Hard-coded control flags for the loop below.
    second_training = False
    just_once = True
    another_try = False

    # single train and estimation instead of multi-train-estimation steps (it performs better with svm and logistic)
    if not another_try:
        # NOTE(review): with `range(1)` and the `break` after the inner loop,
        # the outer loop body runs once with `second_training` still False, so
        # the `if second_training:` branch is not entered and `just_once`
        # stays True - confirm this is intended.
        for _ in range(1):
            for i in range(n_classes):
                if second_training:
                    if classifier_name in [
                            'DecisionTreeClassifier', 'KNeighborsClassifier',
                            'MultinomialNB', 'RandomForestClassifier'
                    ]:  # do not provide the second metrics
                        break
                    elif just_once:
                        # fit again with the whole set of classes - should be better
                        classifier.fit(X_train, y_train)
                        y_score = classifier.decision_function(X_test)
                    precision[i], recall[i], average_precision[
                        i] = mh.calculate_recall_curve_precision_score(
                            y_test[:, i], y_score[:, i], None, y_test[:, i],
                            y_score[:, i])
                    just_once = False
                else:
                    # Fit on the i-th label column only and keep its predictions.
                    predictions[i] = pmh.fit_predict_functions(
                        classifier, X_train, y_train[:, i], X_test)
                    train_predictions[i] = classifier.predict(X_train)
                print('**Processing classes {0:0.2f} % ...**'.format(
                    ((i + 1) / n_classes) * 100))
            second_training = True
            break
        # Only `predictions` is transposed here; `train_predictions` keeps one row per class.
        predictions = predictions.transpose()
        print('transposed')
    else:
        # Single fit/predict over all label columns at once.
        predictions = pmh.fit_predict_functions(classifier, X_train, y_train,
                                                X_test)
        train_predictions = classifier.predict(X_train)

    # metrics
    model_name = '[each class predictions]' + baseline_name + classifier_name
    manual_metrics = mh.calculate_manual_metrics(model_name, y_test,
                                                 predictions)
    none_average, binary_average, micro_average, macro_average = manual_metrics

    # metrics
    # The four averages unpacked above are rebound here.
    list_metrics = mh.calculate_metrics(model_name, y_test, predictions)
    none_average, binary_average, micro_average, macro_average = list_metrics

    ch.save_results(classifier_name, list_metrics, parameters, model_name,
                    classif_level, classif_type, source_path)

    # Runs only if the loop above cleared `just_once`; `y_score` is bound in
    # that same branch.
    if not just_once:
        print('not just once')
        train_predictions = classifier.predict(X_train)
        predictions = classifier.predict(X_test)

        # metrics
        mh.calculate_metrics_with_recall_curve(y_score, y_train, y_test,
                                               train_predictions, predictions)

        # metrics
        model_name = '[all classes predictions]' + baseline_name + classifier_name
        list_metrics = mh.calculate_metrics(model_name, y_test, predictions)
        none_average, binary_average, micro_average, macro_average = list_metrics

        ch.save_results(classifier_name, list_metrics, parameters, model_name,
                        classif_level, classif_type, source_path)