def test_testing_convolution(text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Run the convolutional model against the stored test split and save the metrics.

    The train/test/validation sets are read back with ``ch.load_sets`` (using
    its default date), ``pmh.run_cnn_test`` produces the predictions for the
    test split, and the resulting metrics are displayed and then persisted
    through ``ch.save_results`` under the name ``<classifier> Test``.

    Args:
        text_vectorizer: Text vectorizer name; first component of the model name.
        class_vectorizer: Class vectorizer name; second component of the model name.
        classif_level: Forwarded unchanged to ``ch.save_results``.
        classif_type: Forwarded unchanged to ``ch.save_results``.
        dataset_location: Forwarded unchanged to ``ch.save_results``.
    """
    outcome_root = fh.get_root_location('data/convolutional_outcome/')
    sets_location = fh.join_paths(outcome_root, "model_sets")
    checkpoint_dir = fh.join_paths(outcome_root, "model_checkpoints")
    model_path = fh.link_paths(checkpoint_dir, 'convolution_model')
    weights_path = fh.link_paths(checkpoint_dir, 'convolution_weights')

    model_name = '/'.join((text_vectorizer, class_vectorizer, 'NN'))

    # Only for test: the sets stored under load_sets' default date are used.
    (train_data, test_data, val_data,
     train_labels, test_labels, val_labels,
     stored_settings) = ch.load_sets(sets_location)

    # It could be that a label only occurs in the test data, which might be a problem.
    # The value itself is not read again below.
    sequence_length = train_data.shape[1]

    # Calculate metrics with the testing data.  The stored settings object is
    # passed in the first slot and the trailing flag is False.
    # NOTE(review): presumably False makes run_cnn_test load the saved model
    # instead of training - confirm against pmh.
    model, predictions = pmh.run_cnn_test(
        stored_settings,
        train_data, train_labels,
        test_data, test_labels,
        val_data, val_labels,
        model_path, weights_path,
        False,
    )
    binary_predictions = mh.get_binary_0_5(predictions)

    # Display and store the testing metrics.
    metrics = mh.get_sequential_metrics(test_labels, predictions, binary_predictions)
    mh.display_sequential_metrics('testing convolution sequence', metrics)

    classifier_name, layers = ch.get_sequential_classifier_information(model)
    ch.save_results(classifier_name + ' Test', metrics, layers, model_name,
                    classif_level, classif_type, dataset_location)
def get_csv_path(script_key):
    """Return the CSV path(s) configured for ``script_key``.

    The base directory is the directory of this file cut at the second ``'/'``
    from the right (found with two ``rfind`` calls), i.e. normally two levels
    above the directory containing this file.

    Args:
        script_key: Key into the module-level ``settings`` mapping.

    Returns:
        For ``"clean"`` and ``"clean_mix"`` a tuple ``(eu_csv_path, us_csv_path)``
        built from ``settings["eu_extract"]`` and ``settings["us_extract"]``;
        for every other key the single path built from
        ``settings[script_key]["csv_name"]``.

    NOTE(review): a second ``get_csv_path`` taking ``model_key`` appears later
    in this source; if both live in the same module the later one wins.
    """
    current_path = os.path.dirname(os.path.realpath(__file__))
    index = current_path.rfind('/', 0, -1)
    index = current_path.rfind('/', 0, index - 1)
    base_path = current_path[:index]

    # A membership test is required here: `script_key == ("clean" or "clean_mix")`
    # evaluates the parenthesised `or` first and therefore only matches "clean".
    if script_key in ("clean", "clean_mix"):
        return (
            fh.link_paths(base_path, settings["eu_extract"]["csv_name"]),
            fh.link_paths(base_path, settings["us_extract"]["csv_name"]),
        )
    return fh.link_paths(base_path, settings[script_key]["csv_name"])
def save_data_frame(script_key, data_frame, csvfile):
    """Write ``data_frame`` as a headerless, unquoted CSV into the configured folder.

    The target folder is the part of
    ``settings[script_key]["data_frame_file_name"]`` before its last ``'/'``,
    resolved through ``fh.get_root_location``.  The file name is ``csvfile``
    when that is truthy, otherwise the part of the configured name after the
    last ``'/'``.

    Args:
        script_key: Key into the module-level ``settings`` mapping.
        data_frame: Object exposing ``to_csv`` (a pandas DataFrame).
        csvfile: Optional file name overriding the configured one.
    """
    configured_name = settings[script_key]["data_frame_file_name"]
    cut = configured_name.rfind('/')
    target_dir = fh.get_root_location(configured_name[:cut])

    file_name = csvfile or configured_name[cut + 1:]
    output_path = fh.link_paths(target_dir, file_name)

    # No quoting at all; a blank is used as the escape character.
    data_frame.to_csv(
        output_path,
        index=False,
        sep=',',
        header=False,
        quoting=csv.QUOTE_NONE,
        quotechar="",
        escapechar=" ",
    )
def save_sets(sets_location, train_data, test_data, val_data, train_labels, test_labels, val_labels, settings):
    """Pickle the three data splits and the settings into a timestamped folder.

    Four files are written below ``fh.join_paths(sets_location, <timestamp>)``:
    ``training_set <ts>.pkl``, ``testing_set <ts>.pkl`` and
    ``validation_set <ts>.pkl`` (each a ``[data, labels]`` list) and
    ``settings <ts>.pkl``, where ``<ts>`` is the current ISO timestamp.

    Saving is best effort: any failure is reported on stdout and the function
    returns normally.

    Args:
        sets_location: Base folder for the stored sets.
        train_data, test_data, val_data: Feature data of the three splits.
        train_labels, test_labels, val_labels: Labels of the three splits.
        settings: Arbitrary picklable settings object stored alongside the sets.
    """
    try:
        date = datetime.datetime.now().isoformat()
        actual_sets_location = fh.join_paths(sets_location, date)
        payloads = (
            ('training_set ', [train_data, train_labels]),
            ('testing_set ', [test_data, test_labels]),
            ('validation_set ', [val_data, val_labels]),
            ('settings ', settings),
        )
        for prefix, payload in payloads:
            target = fh.link_paths(actual_sets_location, prefix + date + '.pkl')
            with open(target, "wb") as f:
                pkl.dump(payload, f)
    except Exception as err:
        # `except Exception` (instead of a bare except) lets KeyboardInterrupt
        # and SystemExit propagate; the cause is printed instead of discarded.
        print('A problem occurred while saving the sets!')
        print(repr(err))
def load_sets(sets_location, date='01-01-2020'):
    """Load the pickled data splits and settings stored for ``date``.

    Reads ``training_set <date>.pkl``, ``testing_set <date>.pkl``,
    ``validation_set <date>.pkl`` and ``settings <date>.pkl`` from
    ``fh.join_paths(sets_location, date)``.

    Args:
        sets_location: Base folder of the stored sets.
        date: Name of the sub-folder and suffix of the file names.

    Returns:
        ``(train_data, test_data, val_data, train_labels, test_labels,
        val_labels, settings)``.  When anything goes wrong a message is printed
        and a tuple of seven ``None`` values is returned instead.

    NOTE: the files are read with pickle, which can execute arbitrary code;
    only point this at files this project wrote itself.
    """
    try:
        actual_sets_location = fh.join_paths(sets_location, date)
        training_path = fh.link_paths(actual_sets_location, 'training_set '+date+'.pkl')
        print(actual_sets_location, date, training_path)

        loaded = []
        for prefix in ('training_set ', 'testing_set ', 'validation_set ', 'settings '):
            with open(fh.link_paths(actual_sets_location, prefix + date + '.pkl'), "rb") as f:
                loaded.append(pkl.load(f))

        # The first three pickles hold [data, labels]; the fourth holds the settings.
        (train_data, train_labels), (test_data, test_labels), (val_data, val_labels), stored_settings = loaded
        return train_data, test_data, val_data, train_labels, test_labels, val_labels, stored_settings
    except Exception as err:
        # `except Exception` (instead of a bare except) lets KeyboardInterrupt
        # and SystemExit propagate; the cause is printed instead of discarded.
        print('A problem occurred while loading the sets!')
        print(repr(err))
        return None, None, None, None, None, None, None
def load_data_frame(script_key, csvfile):
    """Read the configured CSV into a three-column data frame.

    The folder is the part of ``settings[script_key]["data_frame_file_name"]``
    before its last ``'/'`` (resolved through ``fh.get_root_location``); the
    file name is ``csvfile`` when truthy, otherwise the remainder of the
    configured name.  The file is parsed without header and without quoting,
    its columns are named ``patent_id``, ``text`` and ``classification`` and
    the frame is passed through ``check_out_empty_texts_and_wrong_classcodes``.

    Args:
        script_key: Key into the module-level ``settings`` mapping.
        csvfile: Optional file name overriding the configured one.

    Returns:
        ``(data_frame, classification_df)`` where ``classification_df`` is an
        empty frame with the columns ``class`` and ``count``.
    """
    configured_name = settings[script_key]["data_frame_file_name"]
    cut = configured_name.rfind('/')
    source_dir = fh.get_root_location(configured_name[:cut])

    input_path = fh.link_paths(source_dir, csvfile or configured_name[cut + 1:])
    print('input_path: ', input_path)

    data_frame = pd.read_csv(
        input_path,
        sep=',',
        quoting=csv.QUOTE_NONE,
        quotechar="",
        escapechar=" ",
        header=None,
        engine='python',
    )
    data_frame.columns = ['patent_id', 'text', 'classification']
    data_frame = check_out_empty_texts_and_wrong_classcodes(data_frame)

    # The class distribution is currently not filled in; an empty frame is returned.
    classification_df = pd.DataFrame(columns=['class', 'count'])
    return data_frame, classification_df
def handle_complete_args(source_path, folder_level):
    """Expand ``source_path`` into its entries and validate them.

    Every entry returned by ``fh.get_list_files`` for ``<source_path>/*`` gets a
    trailing ``'/'``; ``folder_level`` is converted with ``int``.

    Args:
        source_path: Folder whose direct entries are listed.
        folder_level: Destination folder level; anything ``int()`` accepts.

    Returns:
        ``(entries, folder_level)``; when nothing was listed or the last entry
        ends in ``.xml`` the result of ``source_path_warnings()`` is returned
        instead.
    """
    listed = fh.get_list_files(fh.link_paths(source_path, '*'), None)
    source_path = [entry + '/' for entry in listed]
    print("source path: %s" % source_path)

    folder_level = int(folder_level)
    print("folder destination level: %s" % folder_level)

    # [-5:-1] skips the '/' appended above and looks at the four characters before it.
    if not source_path or source_path[-1][-5:-1] == '.xml':
        return source_path_warnings()
    return source_path, folder_level
def apply_tfidf_vectorizer_fit(data_frame, pth):
    """Fit a character-bigram TF-IDF vectorizer on the ``text`` column and pickle it.

    The fitted vectorizer is written to ``tfidf_model <iso timestamp>.pkl``
    inside ``pth``.

    Args:
        data_frame: Frame with a ``text`` column of preprocessed texts (lists of
            words already cleaned: no stopwords, stemming applied, ...).
        pth: Folder that receives the pickled vectorizer.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    print('### tfidf_vectorizer_with_lambda ###')

    # Earlier results file name for reference:
    # results_tfidf_strip_accents_ascii_analyzer_char_wb_ngram_range_(2, 2)_norm_l1_max_df_0.9_min_df_0.1_max_features_150
    tfidf_options = dict(
        strip_accents='ascii',
        analyzer='char',
        ngram_range=(2, 2),
        norm='l1',
        max_df=.9,
        min_df=.1,
        max_features=200,
    )
    fitted = TfidfVectorizer(**tfidf_options)

    texts = data_frame['text'].apply(lambda value: np.str_(value))
    fitted.fit(texts)

    stamp = datetime.now().isoformat()
    model_file = fh.link_paths(pth, 'tfidf_model ' + stamp + '.pkl')
    with open(model_file, "wb") as out:
        pkl.dump(fitted, out)
    return fitted
def handle_partial_args(source_path):
    """Expand ``source_path`` into its entries and derive the folder level.

    Every entry returned by ``fh.get_list_files`` for ``<source_path>/*`` gets a
    trailing ``'/'``.  The folder level is the number of ``'/'`` characters in
    the first entry minus one.

    Args:
        source_path: Folder whose direct entries are listed.

    Returns:
        ``(entries, folder_level)``; when nothing was listed or the last entry
        ends in ``.xml`` the result of ``th.source_path_warnings()`` is returned
        instead.
    """
    pattern = fh.link_paths(source_path, '*')
    listed = fh.get_list_files(pattern, None)
    source_path = [item + '/' for item in listed]
    print("source path: %s" % source_path)

    # [-5:-1] skips the '/' appended above and looks at the four characters before it.
    if not source_path or source_path[-1][-5:-1] == '.xml':
        return th.source_path_warnings()

    folder_level = source_path[0].count('/') - 1
    print("folder destination level: %s" % folder_level)
    return source_path, folder_level
def test_basic_LSTM(data_frame, text_vectorizer, classif_level, classif_type, dataset_location):
    """Load the stored LSTM model, run it through ``run_lstm`` and save the metrics.

    The data splits come from ``load_sets`` (default date), the document
    vectors from ``ch.get_df_data`` and the model from
    ``tf.keras.models.load_model``.

    Args:
        data_frame: Not used by this function.
        text_vectorizer: Text vectorizer name; first component of the model name.
        classif_level: Forwarded unchanged to ``ch.save_results``.
        classif_type: Forwarded unchanged to ``ch.save_results``.
        dataset_location: Forwarded unchanged to ``ch.save_results``.

    NOTE(review): ``class_vectorizer``, ``batch_size`` and ``queue_size`` are
    neither parameters nor locals here, so they must exist at module level or
    the function raises NameError - confirm.
    NOTE(review): ``mh.display_sequential_metrics`` is called with a single
    argument here while other callers in this source pass ``(name, metrics)``
    - confirm the signature.
    NOTE(review): the "Testing Metrics" are computed against ``y_val`` although
    ``y_test`` is what is handed to ``run_lstm`` next to the test data -
    confirm which labels belong to ``predictions``.
    """
    print('### LSTM Doing Testing ###')
    root_location = fh.get_root_location('data/lstm_outcome/')
    doc2vec_model_location = fh.link_paths(
        fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")
    # The path helpers are qualified with `fh.` like everywhere else in this function.
    model_location = fh.link_paths(fh.join_paths(root_location, "lstm_model"), "lstm_model")
    sets_location = fh.join_paths(root_location, "model_sets")
    save_results = True
    load_model = True

    # Date problem: the sets stored under load_sets' default date are used.
    X_train, X_test, X_val, y_train, y_test, y_val, settings = load_sets(sets_location)
    classes, n_classes, vocab_processor, len_vocabulary = settings

    training_docs_list = X_train['patent_id']
    test_docs_list = X_test['patent_id']
    val_docs_list = X_val['patent_id']

    sequence_size, embedding_size = 1, 150
    X_data, Xv_data, Xt_data = ch.get_df_data(
        3, training_docs_list, val_docs_list, test_docs_list,
        sequence_size, embedding_size, doc2vec_model_location)
    print(X_data.shape)

    if load_model:
        model = tf.keras.models.load_model(model_location)

    min_delta, patience = 0.00001, 15
    early_stopper = keras.callbacks.EarlyStopping(
        monitor='val_loss', min_delta=min_delta, patience=patience, verbose=1, mode='auto')
    metrics_callback = MetricsCNNCallback(Xv_data, y_val, patience)

    metrics, predictions, val_metrics = run_lstm(
        model, X_data, y_train, Xt_data, y_test, Xv_data, y_val,
        batch_size, [early_stopper, metrics_callback], queue_size,
        training_docs_list, val_docs_list)
    binary_predictions = mh.get_binary_0_5(predictions)

    print('\nGenerating Testing Metrics')
    metrics = mh.get_sequential_metrics(y_val, predictions, binary_predictions)
    mh.display_sequential_metrics(metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        model_name = text_vectorizer+'/'+class_vectorizer+'/'+classifier_name
        ch.save_results(classifier_name+'_LSTM', metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)
    print('end testing step')
def train_testing_convolution(data_frame, text_vectorizer, class_vectorizer):
    """Train the convolutional test model and display its validation metrics.

    The data frame is vectorized with ``ch.apply_df_vectorizer`` and split with
    ``ch.get_train_test_from_data``; afterwards all splits are replaced by the
    stored ones from ``ch.load_sets`` (default date).  The model from
    ``pmh.get_cnn_test`` is run through ``pmh.run_cnn_test`` with the
    validation split in both evaluation slots and the trailing flag ``True``.

    Args:
        data_frame: Input frame handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: Text vectorizer name; first component of the model name.
        class_vectorizer: Class vectorizer name; second component of the model name.
    """
    root_location = fh.get_root_location('data/convolutional_outcome/')
    sets_location = fh.join_paths(root_location, "model_sets")
    checkpoint_path = fh.join_paths(root_location, "model_checkpoints")
    model_path = fh.link_paths(checkpoint_path, 'convolution_model')
    weights_path = fh.link_paths(checkpoint_path, 'convolution_weights')

    # Get the sets.
    model_name = text_vectorizer+'/'+class_vectorizer+'/NN'
    standard_results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    (train_data, test_data, train_labels, test_labels,
     classes, n_classes, vocab_processor, len_vocabulary) = standard_results
    train_data, val_data, train_labels, val_labels = ch.get_train_test_from_data(train_data, train_labels)

    # Saving the freshly built sets is currently disabled:
    # ch.save_sets(sets_location, train_data, test_data, val_data, train_labels, test_labels, val_labels,
    #              [classes, n_classes, vocab_processor, len_vocabulary])

    # This is for test: the splits computed above are replaced by the stored ones.
    train_data, test_data, val_data, train_labels, test_labels, val_labels, _ = ch.load_sets(sets_location)

    # It could be that a label only occurs in the test data, which might be a problem.
    sequence_length = train_data.shape[1]

    # Define the model.
    model = pmh.get_cnn_test(len_vocabulary, n_classes, sequence_length)

    # Calculate metrics with the validating data.
    model, val_predictions = pmh.run_cnn_test(
        model, train_data, train_labels, val_data, val_labels, val_data, val_labels,
        model_path, weights_path, True)
    binary_val_predictions = mh.get_binary_0_5(val_predictions)
    print(val_labels.shape)
    print(val_predictions.shape)

    # Display the validation metrics.  The binarized predictions computed above
    # are used; the name `binary_predictions` does not exist in this function.
    metrics = mh.get_sequential_metrics(val_labels, val_predictions, binary_val_predictions)
    mh.display_sequential_metrics('validation convolution sequence', metrics)
def save_training_set(training_set, model_name):
    """Store ``training_set`` as a ``.npy`` file below ``data/saved_training_set/``.

    The file is named ``training <model name> <YYYY-MM-DD HH:MM>.npy`` where
    every ``'/'`` of the model name is replaced by ``'-'``.

    Args:
        training_set: Array-like object handed to ``np.save``.
        model_name: Model name used in the file name.
    """
    print('### saving_training_set ###')
    root_location = fh.get_root_location('data/saved_training_set/')
    model_name = model_name.replace("/", "-")

    # strftime gives the minute-resolution stamp directly.  Slicing ten
    # characters off str(datetime.now()) only works while the microsecond part
    # is printed; when it is exactly 0 the fraction is omitted and the slice
    # would cut into the date itself.
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
    path = fh.link_paths(root_location, 'training '+model_name+' '+stamp+'.npy')
    np.save(path, training_set)
def train_doc2vec(data_frame, patent_ids, classif_level, classif_type):
    """Train and store two Doc2Vec models on the documents of ``data_frame``.

    Phase one builds a ``Doc2Vec`` model with fixed hyper-parameters, trains it
    for 30 single-epoch rounds with a manually decreased learning rate and
    saves it as ``doc2vec_vocab_30_epochs``.  Phase two obtains a model from
    ``wmh.get_lstm_doc2vec``, saves its vocabulary as ``doc2vec_vocab`` and then
    trains and saves it once per epoch up to ``params[11]``.

    Resuming from a previously saved epoch is not active: training always
    starts at epoch 1.

    Args:
        data_frame: Source documents, handed to ``create_tuple_array``.
        patent_ids: Document identifiers, handed to ``create_tuple_array``.
        classif_level: Forwarded to ``wmh.get_lstm_doc2vec``.
        classif_type: Forwarded to ``wmh.get_lstm_doc2vec``.

    Side effects:
        Sets ``GLOBAL_VARS.DOC2VEC_MODEL_NAME`` and ``GLOBAL_VARS.MODEL_NAME``;
        calls ``sys.exit(1)`` when the last epoch run differs from ``params[11]``.
    """
    lstm_root = fh.get_root_location("data/lstm_outcome/")
    model_save_dir = fh.join_paths(lstm_root, "doc2vec_model/")
    preprocessed_dir = fh.join_paths(lstm_root, "preprocessed_data/separated_datasets/")
    # The three prefixes are not read again below; the join_paths calls are retained.
    training_prefix = fh.join_paths(preprocessed_dir, "training_docs_data_preprocessed/")
    validation_prefix = fh.join_paths(preprocessed_dir, "validation_docs_data_preprocessed/")
    test_prefix = fh.join_paths(preprocessed_dir, "test_docs_data_preprocessed/")
    vocab_dir = fh.join_paths(model_save_dir, "vocab_model")

    documents = create_tuple_array(data_frame, patent_ids, text_batch_size=10000)

    # ---- phase one: fixed-parameter model, 30 manual epochs ----
    worker_count = multiprocessing.cpu_count()
    first_model = Doc2Vec(
        dm=1, vector_size=200, window=2, negative=10, sample=1e-8, hs=0,
        min_count=50, alpha=0.25, min_alpha=0.05, dbow_words=0, seed=1234,
        concat=0, workers=worker_count,
    )
    first_model.build_vocab([doc for doc in tqdm(documents)])
    for epoch in range(30):
        shuffled = utils_shuffle_rows([doc for doc in tqdm(documents)])
        first_model.train(shuffled, total_examples=len(documents), epochs=1)
        first_model.alpha -= 0.002
        first_model.min_alpha = first_model.alpha
    date = datetime.datetime.now().isoformat()  # not used afterwards
    first_model.save(fh.link_paths(vocab_dir, 'doc2vec_vocab_30_epochs'))

    # ---- phase two: model configured through wmh ----
    params = wmh.get_parameters_lstm_doc2vec()
    doc2vec_name, placeholder_model_name, doc2vec_model = wmh.get_lstm_doc2vec(
        params, classif_level, classif_type)
    GLOBAL_VARS.DOC2VEC_MODEL_NAME = doc2vec_name

    # The iterator yields the sentences as (id, text) tuples.
    doc2vec_model.build_vocab(documents=documents, progress_per=params[13])
    doc2vec_model.save(fh.link_paths(vocab_dir, "doc2vec_vocab"))

    alpha_step = wmh.set_alpha_parameters_lstm_doc2vec(doc2vec_model)
    start_epoch = 1

    # The actual training.  The loop variable is deliberately named `epoch`
    # like in phase one: it is read again after the loop.
    for epoch in range(start_epoch, params[11] + 1):
        print("### epoch "+str(epoch)+" ###")
        # The folder name includes the epoch.
        GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
        epoch_dir = fh.join_paths(model_save_dir, GLOBAL_VARS.MODEL_NAME)

        doc2vec_model.train(
            documents=documents,
            total_examples=len(documents),
            report_delay=params[12],
            epochs=params[10],
        )
        doc2vec_model.alpha -= alpha_step             # decrease the learning rate
        doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
        doc2vec_model.save(fh.link_paths(epoch_dir, "doc2vec_model"))

    if epoch != params[11]:
        print("still training epochs missing: " + str(epoch))
        sys.exit(1)
def train_LSTM_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Run the LSTM parameter search and record validation results per configuration.

    For every parameter set yielded by ``pmh.get_lstm_training_parameters`` a
    model is built with ``pmh.get_keras_rnn_model``, fitted through
    ``pmh.fit_predict_generator`` and evaluated on the validation split.  The
    per-configuration results (epochs, best weights, losses, duration) are kept
    in a dict that is pickled to the parameter-search results file; the
    configurations already present in that file are skipped.

    Args:
        data_frame: Input frame handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: Text vectorizer name; first component of the model name.
        class_vectorizer: Class vectorizer name; second component of the model name.
        classif_level: Classification level; used in the results file name and
            forwarded to ``ch.save_results``.
        classif_type: Classification type; used in the results file name and
            forwarded to ``ch.save_results``.
        dataset_location: Forwarded unchanged to ``ch.save_results``.

    Side effects:
        Sets ``GLOBAL_VARS.DOC2VEC_MODEL_NAME``, ``GLOBAL_VARS.MODEL_NAME`` and
        ``GLOBAL_VARS.NN_MODEL_NAME``; writes the results pickle.

    NOTE: the previous results are read with pickle, which can execute
    arbitrary code; only use files this project wrote itself.
    """
    print('### LSTM Doing Training ###')
    root_location = fh.get_root_location('data/lstm_outcome/')
    # Not read again below; the call is retained in case join_paths has
    # side effects on the file system - TODO confirm.
    exports_location = fh.join_paths(root_location, "exported_data/")
    matrices_save_location = fh.join_paths(root_location, "fhv_matrices/")
    nn_parameter_search_location = fh.join_paths(root_location, "nn_fhv_parameter_search")
    doc2vec_model_location = fh.link_paths(
        fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")

    load_existing_results = True
    save_results = True
    sequence_size = 1
    EMBEDDING_SIZE = 150

    model_name = text_vectorizer+'/'+class_vectorizer+'/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    X_train, X_val, y_train, y_val, classes, n_classes, vocab_processor, len_vocabulary = results

    training_docs_list = X_train['patent_id']
    val_docs_list = X_val['patent_id']
    X_data, Xv_data, _ = ch.get_df_data(
        2, training_docs_list, val_docs_list, None,
        sequence_size, EMBEDDING_SIZE, doc2vec_model_location)

    GLOBAL_VARS.DOC2VEC_MODEL_NAME, GLOBAL_VARS.MODEL_NAME = wmh.set_parameters_lstm_doc2vec(
        nn_parameter_search_location, classif_level, classif_type)

    NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS = pmh.get_lstm_shapes(X_data, n_classes)
    NN_BATCH_SIZE, PARTS_LEVEL, NN_MAX_EPOCHS, QUEUE_SIZE = pmh.get_lstm_basic_parameters()
    param_sampler = pmh.get_lstm_training_parameters()
    EARLY_STOPPER_MIN_DELTA, EARLY_STOPPER_PATIENCE = pmh.get_early_stopping_parameters()

    # The results file path is computed once and reused for folder creation,
    # loading and saving.
    param_results_dict = dict()
    param_results_path = fh.link_paths(
        fh.join_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME),
        NN_PARAMETER_SEARCH_PREFIX.format(classif_type, classif_level, NN_BATCH_SIZE))
    index = param_results_path.rfind('/')
    fh.create_folder(param_results_path[:index])

    # Useful to skip all the already tested models.
    if load_existing_results:
        if fh.ensure_exists_path_location(param_results_path):
            print('Loading Previous results in {}'.format(param_results_path))
            with open(param_results_path, 'rb') as results_file:
                param_results_dict = pickle.load(results_file)
        else:
            print('No Previous results exist in {}'.format(param_results_path))

    for params in param_sampler:
        start_time = time.time()
        lstm_output_size = params['lstm_output_size']
        w_dropout_do = params['w_dropout']
        u_dropout_do = params['u_dropout']
        stack_layers = params['stack_layers']
        conv_size = params['conv_size']
        conv_filter_length = params['conv_filter_length']
        conv_max_pooling_length = params['conv_max_pooling_length']

        GLOBAL_VARS.NN_MODEL_NAME = 'lstm_size_{}_w-drop_{}_u-drop_{}_stack_{}_conv_{}'.format(
            lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, str(conv_size))
        if conv_size:
            GLOBAL_VARS.NN_MODEL_NAME += '_conv-filter-length_{}_max-pooling-size_{}'.format(
                conv_filter_length, conv_max_pooling_length)

        if GLOBAL_VARS.NN_MODEL_NAME in param_results_dict:
            print("skipping: {}".format(GLOBAL_VARS.NN_MODEL_NAME))
            continue

        # Creating the actual keras model.
        model = pmh.get_keras_rnn_model(
            NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS,
            lstm_output_size, w_dropout_do, u_dropout_do, stack_layers,
            conv_size, conv_filter_length, conv_max_pooling_length)
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        model_name = text_vectorizer+'/'+class_vectorizer+'/'+classifier_name

        input_matrix = fh.join_paths(matrices_save_location, GLOBAL_VARS.MODEL_NAME)
        early_stopper = keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=EARLY_STOPPER_MIN_DELTA,
            patience=EARLY_STOPPER_PATIENCE, verbose=1, mode='auto')
        metrics_callback = mh.MetricsCallback(
            input_matrix, classif_type, PARTS_LEVEL, NN_BATCH_SIZE, is_mlp=False)

        history, yvp, yvp_binary = pmh.fit_predict_generator(
            model, X_data, y_train, Xv_data, y_val, training_docs_list, val_docs_list,
            early_stopper, metrics_callback, NN_BATCH_SIZE, NN_MAX_EPOCHS, QUEUE_SIZE)

        print('\nGenerating Validation Metrics')
        validation_metrics = mh.get_sequential_metrics(y_val, yvp, yvp_binary)
        mh.display_sequential_metrics(classifier_name, validation_metrics)

        model_results = dict()
        model_results['epochs'] = len(history.history['val_loss'])
        model_results['best_weights'] = metrics_callback.best_weights
        model_results['best_val_loss'] = metrics_callback.best_val_loss
        model_results['training_loss'] = metrics_callback.losses
        model_results['validation_loss'] = metrics_callback.val_losses
        model_results['duration'] = time.time() - start_time
        param_results_dict[GLOBAL_VARS.NN_MODEL_NAME] = model_results

        ch.delete_variables(history, metrics_callback)
        ch.save_results(classifier_name+'_LSTM', validation_metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)

        if save_results:
            # Written after every configuration so that an interrupted search
            # can be resumed through the skip logic above; the context manager
            # makes sure the handle is flushed and closed.
            with open(param_results_path, 'wb') as results_file:
                pickle.dump(param_results_dict, results_file)
def test_LSTM_from_web(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Evaluate the configured LSTM on the test split using its stored best weights.

    The model described by ``pmh.get_lstm_testing_parameters`` is rebuilt, the
    ``best_weights`` recorded for it in the parameter-search results file are
    applied, and the test predictions from ``pmh.predict_generator`` are turned
    into metrics which are displayed, saved through ``ch.save_results`` and
    added to the pickled test-metrics dict.

    Args:
        data_frame: Input frame handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: Text vectorizer name; first component of the model name.
        class_vectorizer: Class vectorizer name; second component of the model name.
        classif_level: Classification level; used in the results file name and
            forwarded to ``ch.save_results``.
        classif_type: Classification type; used in both file names and
            forwarded to ``ch.save_results``.
        dataset_location: Forwarded unchanged to ``ch.save_results``.

    Raises:
        Exception: When the model has no entry in the parameter-search results,
            or when test metrics for it already exist.

    NOTE: both result files are read with pickle, which can execute arbitrary
    code; only use files this project wrote itself.
    NOTE(review): ``mh.display_sequential_metrics`` is called with a single
    argument here while other callers in this source pass ``(name, metrics)``
    - confirm the signature.
    """
    print('### LSTM Doing Testing ###')
    root_location = fh.get_root_location('data/lstm_outcome/')
    nn_parameter_search_location = fh.join_paths(root_location, "nn_fhv_parameter_search")
    doc2vec_model_location = fh.link_paths(
        fh.join_paths(root_location, "doc2vec_model/vocab_model/"), "doc2vec_model")

    save_results = True
    sequence_size = 1
    EMBEDDING_SIZE = 150

    model_name = text_vectorizer+'/'+class_vectorizer+'/LSTM'
    results = ch.apply_df_vectorizer(data_frame, text_vectorizer, class_vectorizer, model_name)
    X_train, X_test, y_train, y_test, classes, n_classes, vocab_processor, len_vocabulary = results
    X_train, X_val, y_train, y_val = ch.get_train_test_from_data(X_train, y_train)

    training_docs_list = X_train['patent_id']
    test_docs_list = X_test['patent_id']
    val_docs_list = X_val['patent_id']
    X_data, Xv_data, Xt_data = ch.get_df_data(
        3, training_docs_list, val_docs_list, test_docs_list,
        sequence_size, EMBEDDING_SIZE, doc2vec_model_location)

    GLOBAL_VARS.DOC2VEC_MODEL_NAME, GLOBAL_VARS.MODEL_NAME = wmh.set_parameters_lstm_doc2vec(
        nn_parameter_search_location, classif_level, classif_type)

    NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS = pmh.get_lstm_shapes(X_data, n_classes)
    NN_BATCH_SIZE, PARTS_LEVEL, NN_MAX_EPOCHS, QUEUE_SIZE = pmh.get_lstm_basic_parameters()
    params = pmh.get_lstm_testing_parameters()
    (lstm_output_size, w_dropout_do, u_dropout_do, stack_layers,
     conv_size, conv_filter_length, conv_max_pooling_length) = params
    EARLY_STOPPER_MIN_DELTA, EARLY_STOPPER_PATIENCE = pmh.get_early_stopping_parameters()

    TEST_METRICS_FILENAME = '{}_level_{}_standard_nn_test_metrics_dict.pkl'
    test_metrics_dict = dict()
    model_folder = fh.link_paths(nn_parameter_search_location, GLOBAL_VARS.MODEL_NAME)
    test_metrics_path = fh.link_paths(
        model_folder, TEST_METRICS_FILENAME.format(classif_type, PARTS_LEVEL))
    param_results_path = fh.link_paths(
        model_folder, NN_PARAMETER_SEARCH_PREFIX.format(classif_type, classif_level, NN_BATCH_SIZE))

    with open(param_results_path, 'rb') as results_file:
        param_results_dict = pickle.load(results_file)

    GLOBAL_VARS.NN_MODEL_NAME = 'lstm_size_{}_w-drop_{}_u-drop_{}_stack_{}_conv_{}'.format(
        lstm_output_size, w_dropout_do, u_dropout_do, stack_layers, str(conv_size))
    if conv_size:
        GLOBAL_VARS.NN_MODEL_NAME += '_conv-filter-length_{}_max-pooling-size_{}'.format(
            conv_filter_length, conv_max_pooling_length)

    if GLOBAL_VARS.NN_MODEL_NAME not in param_results_dict:
        message = "Can't find model: {}".format(GLOBAL_VARS.NN_MODEL_NAME)
        print(message)
        # Same exception type as before, now carrying the reason.
        raise Exception(message)

    if fh.ensure_exists_path_location(test_metrics_path):
        with open(test_metrics_path, 'rb') as metrics_file:
            test_metrics_dict = pickle.load(metrics_file)

    if GLOBAL_VARS.NN_MODEL_NAME in test_metrics_dict:
        message = "Test metrics already exist for: {}".format(GLOBAL_VARS.NN_MODEL_NAME)
        print(message)
        test_metrics = test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME]
        print("** Test Metrics: Cov Err: {:.3f}, Avg Labels: {:.3f}, \n\t\t Top 1: {:.3f}, Top 3: {:.3f}, Top 5: {:.3f}, \n\t\t F1 Micro: {:.3f}, F1 Macro: {:.3f}".format(
            test_metrics['coverage_error'], test_metrics['average_num_of_labels'],
            test_metrics['top_1'], test_metrics['top_3'], test_metrics['top_5'],
            test_metrics['f1_micro'], test_metrics['f1_macro']))
        raise Exception(message)

    print('***************************************************************************************')
    print(GLOBAL_VARS.NN_MODEL_NAME)
    model = pmh.get_keras_rnn_model(
        NN_INPUT_NEURONS, NN_SEQUENCE_SIZE, NN_OUTPUT_NEURONS,
        lstm_output_size, w_dropout_do, u_dropout_do, stack_layers,
        conv_size, conv_filter_length, conv_max_pooling_length)

    # Get the model's best weights from the parameter search.
    weights = param_results_dict[GLOBAL_VARS.NN_MODEL_NAME]['best_weights']
    model.set_weights(weights)

    print('Evaluating on Test Data using best weights')
    _, ytp, ytp_binary = pmh.predict_generator(
        None, model, Xt_data, y_test, NN_BATCH_SIZE, QUEUE_SIZE, test_docs_list)

    print('Generating Test Metrics')
    test_metrics = mh.get_sequential_metrics(y_test, ytp, ytp_binary)
    mh.display_sequential_metrics(test_metrics)

    if save_results:
        classifier_name, parameters = ch.get_sequential_classifier_information(model)
        ch.save_results(classifier_name+'_LSTM', test_metrics, parameters, model_name,
                        classif_level, classif_type, dataset_location)
        test_metrics_dict[GLOBAL_VARS.NN_MODEL_NAME] = test_metrics
        # The context manager makes sure the pickle is flushed and closed.
        with open(test_metrics_path, 'wb') as metrics_file:
            pickle.dump(test_metrics_dict, metrics_file)
def get_csv_path(model_key):
    """Return the path of the results file configured for ``model_key``.

    ``settings[model_key]["results_file_name"]`` is split at its last ``'/'``:
    the leading part is resolved through ``fh.get_root_location`` and the
    trailing part is appended with ``fh.link_paths``.

    NOTE(review): a function of the same name taking ``script_key`` appears
    earlier in this source; if both live in the same module this later
    definition replaces it.
    """
    results_file = settings[model_key]["results_file_name"]
    cut = results_file.rfind('/')
    folder_part = results_file[:cut]
    file_part = results_file[cut + 1:]
    return fh.link_paths(fh.get_root_location(folder_part), file_part)
def _write_fasttext_csv(frame, path, sep):
    """Write ``frame`` to ``path`` without header, index or quoting, using ``sep``."""
    frame.to_csv(path, index=False, sep=sep, header=False,
                 quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")


def preprocessing_data_for_fasttext(data_frame, text_vectorizer, class_vectorizer):
    """Split ``data_frame`` and write the CSV files used by the fastText steps.

    Newlines and tabs in the ``text`` column are replaced by blanks (the column
    of the passed frame is overwritten).  The frame is vectorized and split via
    ``ch.apply_df_vectorizer`` / ``ch.get_train_test_from_data`` and written to
    ``data/fasttext_outcome/`` as ``dataframe.csv``, ``validating set.csv``,
    ``training set.csv`` (blank-separated) and ``testing set.csv``
    (comma-separated).

    The primary path puts the labels into a column named ``1`` and drops the
    ``patent_id`` column.  If that path fails for any reason, a message is
    printed and a fallback rebuilds the splits as two-column frames
    (``text``, ``classification``) and writes the same files.

    Args:
        data_frame: Frame with at least a ``text`` column.
        text_vectorizer: Text vectorizer name; first component of the model name.
        class_vectorizer: Class vectorizer name; second component of the model name.
    """
    root_location = fh.get_root_location('data/fasttext_outcome/')
    data_frame['text'] = data_frame['text'].replace(
        '\n', ' ', regex=True).replace('\t', ' ', regex=True)
    model_name = text_vectorizer + '/' + class_vectorizer + '/FastText'

    dataframe_path = fh.link_paths(root_location, 'dataframe.csv')
    val_path = fh.link_paths(root_location, 'validating set.csv')
    train_path = fh.link_paths(root_location, 'training set.csv')
    test_path = fh.link_paths(root_location, 'testing set.csv')

    try:
        X_train, X_test, Y_train, Y_test, _, _, _, _ = ch.apply_df_vectorizer(
            data_frame, text_vectorizer, class_vectorizer, model_name)
        X_train, X_val, Y_train, Y_val = ch.get_train_test_from_data(X_train, Y_train)

        if not isinstance(X_train, pd.DataFrame):
            train = pd.DataFrame(data=X_train)
            test = pd.DataFrame(data=X_test)
            val = pd.DataFrame(data=X_val)
        else:
            train, test, val = X_train, X_test, X_val

        train.loc[:, 1] = Y_train
        test.loc[:, 1] = Y_test
        val.loc[:, 1] = Y_val
        train.drop(columns=['patent_id'], inplace=True)
        test.drop(columns=['patent_id'], inplace=True)
        val.drop(columns=['patent_id'], inplace=True)

        _write_fasttext_csv(data_frame, dataframe_path, ' ')
        # The validation split is prepared above and written by the fallback
        # path, so it is written here as well to keep both paths in step.
        _write_fasttext_csv(val, val_path, ' ')
        _write_fasttext_csv(train, train_path, ' ')
        _write_fasttext_csv(test, test_path, ',')
    except Exception as err:
        # `except Exception` (instead of a bare except) lets KeyboardInterrupt
        # and SystemExit propagate; the cause is printed instead of discarded.
        print('a problem occurred while trying to store the dataframes')
        print(repr(err))

        X_train, X_test, Y_train, Y_test, _, _, _, _ = ch.apply_df_vectorizer(
            data_frame, text_vectorizer, class_vectorizer, model_name)
        X_train, X_val, Y_train, Y_val = ch.get_train_test_from_data(X_train, Y_train)

        val = pd.DataFrame({'text': X_val, 'classification': Y_val})
        train = pd.DataFrame({'text': X_train, 'classification': Y_train})
        test = pd.DataFrame({'text': X_test, 'classification': Y_test})

        _write_fasttext_csv(data_frame, dataframe_path, ' ')
        _write_fasttext_csv(val, val_path, ' ')
        _write_fasttext_csv(train, train_path, ' ')
        _write_fasttext_csv(test, test_path, ',')
def _f1_from_precision_recall(precision, recall):
    """Return the harmonic mean of *precision* and *recall*, or 0.0 when their sum is zero."""
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total


def apply_fasttext(data_frame, text_vectorizer, class_vectorizer, classif_level, classif_type, dataset_location):
    """Train a fastText model on the stored training set, evaluate and save it.

    The model is obtained from ``pmh.get_fasttext`` using 'training set.csv'
    below the 'data/fasttext_outcome/' root location, saved with a timestamped
    file name and evaluated on 'testing set.csv' for k in 15, 8, 5, 3 and 1.
    Entries 1 and 2 of every evaluation result are displayed as precision and
    recall together with the F1 score derived from them.

    Args:
        data_frame: not used by this function; kept for interface compatibility.
        text_vectorizer: name of the text vectorizer, used in the model name.
        class_vectorizer: name of the class vectorizer, used in the model name.
        classif_level: forwarded to ``ch.save_results``.
        classif_type: forwarded to ``ch.save_results``.
        dataset_location: forwarded to ``ch.save_results``.

    Returns:
        None. Metrics are displayed, results are stored through
        ``ch.save_results`` and the model labels are printed.
    """
    root_location = fh.get_root_location('data/fasttext_outcome/')
    fasttext_location = fh.join_paths(root_location, 'fasttext_models/')
    # The result is unused; the call is kept in case join_paths creates the
    # directory as a side effect - TODO confirm and remove if it does not.
    fh.join_paths(root_location, 'word2vec_models/')
    timestamp = datetime.datetime.now().isoformat()

    training_file = fh.link_paths(root_location, 'training set.csv')
    testing_file = fh.link_paths(root_location, 'testing set.csv')

    model, parameters = pmh.get_fasttext(training_file)
    pmh.save_fasttext_model(
        model, fh.link_paths(fasttext_location, 'model ' + timestamp + '.bin'))

    test_labels, predictions, results = pmh.predict_test_fasttext(model, testing_file)

    # Evaluate all cut-offs before anything is displayed, as before.
    top_k_results = [(k, model.test(testing_file, k=k)) for k in (15, 8, 5, 3, 1)]

    classifier_name = ch.get_fasttext_classifier_information(str(model))
    model_name = text_vectorizer + '/' + class_vectorizer + '/' + classifier_name

    mh.display_directly_metrics(
        'k=-1 ' + classifier_name, -1, results[1], results[2],
        _f1_from_precision_recall(results[1], results[2]))
    for k, result in top_k_results:
        # The helper avoids a ZeroDivisionError when precision and recall
        # are both zero.
        mh.display_directly_metrics(
            'k= {} '.format(k) + classifier_name, -1, result[1], result[2],
            _f1_from_precision_recall(result[1], result[2]))

    ch.save_results(classifier_name, results, parameters, model_name, classif_level, classif_type, dataset_location)

    # The returned averages are not used here; the call is kept for whatever
    # reporting calculate_manual_metrics performs itself.
    mh.calculate_manual_metrics(model_name, test_labels, predictions)

    print(model.labels)
def first_attempt_based_on_text_classification_paper(data_frame, text_vectorizer, class_vectorizer):
    """Train ``pmh.TextCNN`` on *data_frame* with periodic evaluation and checkpoints.

    The data is always vectorized with the 'standard' text vectorizer and the
    'multi_label' class vectorizer; *text_vectorizer* and *class_vectorizer*
    only contribute to the model name. Summaries, checkpoints and the
    vocabulary are written to a timestamped directory below
    'data/convolutional_runs/'.

    NOTE(review): the flags are defined on every call, so a second call in the
    same process presumably fails with a duplicate flag definition - TODO confirm.

    Args:
        data_frame: input frame handed to ``ch.apply_df_vectorizer``.
        text_vectorizer: name used as first part of the model name.
        class_vectorizer: name used as second part of the model name.

    Returns:
        None.
    """
    # Parameters - can be placed at the beginning of the script
    # ==================================================

    # Data loading params
    tf.flags.DEFINE_float("dev_sample_percentage", .2, "Percentage of the training data to use for validation")
    tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
    tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

    # Model Hyperparameters
    tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
    tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
    tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
    tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
    tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

    # Training parameters
    tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
    tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
    tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

    FLAGS = tf.flags.FLAGS

    model_name = text_vectorizer+'/'+class_vectorizer+'/TextCNN'
    standard_results = ch.apply_df_vectorizer(data_frame, 'standard', 'multi_label', model_name)
    x_train, x_dev, y_train, y_dev, classes, n_classes, vocab_processor, len_vocabulary = standard_results

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        # Using the session as a context manager installs it as the default
        # session and closes it when the block is left.
        with tf.Session(config=session_conf) as sess:
            cnn = pmh.TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=n_classes,
                vocab_size=(len_vocabulary),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters)

            # define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-4)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # output directory for models and summaries
            root_location = fh.get_root_location('data/convolutional_runs/')
            timestamp = datetime.datetime.now().isoformat()
            out_dir = fh.link_paths(root_location, timestamp)
            print("Writing to {}\n".format(out_dir))

            # summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # train summaries; the graph itself is passed because handing a
            # GraphDef to the writer is deprecated
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = fh.join_paths(fh.link_paths(out_dir, 'summaries'), 'train')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = fh.join_paths(fh.link_paths(out_dir, 'summaries'), 'dev')
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            try:
                # checkpointing
                checkpoint_dir = fh.link_paths(out_dir, 'checkpoints')
                checkpoint_prefix = fh.join_paths(checkpoint_dir, 'model')
                saver = tf.train.Saver(tf.global_variables())

                # write vocabulary; still best effort, but a failure is
                # reported instead of being silently ignored
                try:
                    vocab_processor.save(os.path.join(out_dir, "vocab"))
                except Exception as error:
                    print("Could not save the vocabulary: {}".format(error))

                # initialize all variables
                sess.run(tf.global_variables_initializer())

                def train_step(x_batch, y_batch):
                    """
                    A single training step
                    """
                    feed_dict = {
                        cnn.input_x: x_batch,
                        cnn.input_y: y_batch,
                        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, step, summaries, loss, accuracy = sess.run(
                        [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                    train_summary_writer.add_summary(summaries, step)

                def dev_step(x_batch, y_batch, writer=None):
                    """
                    Evaluates model on a dev set
                    """
                    feed_dict = {
                        cnn.input_x: x_batch,
                        cnn.input_y: y_batch,
                        cnn.dropout_keep_prob: 1.0  # no dropout during evaluation
                    }
                    step, summaries, loss, accuracy = sess.run(
                        [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                    if writer:
                        writer.add_summary(summaries, step)

                # generate batches
                batches = batch_iter(
                    list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

                # training loop: one optimisation step per batch, with
                # periodic evaluation and checkpointing
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
            finally:
                # Flush and release the event files even if training fails.
                train_summary_writer.close()
                dev_summary_writer.close()
def apply_doc2vec(data_frame, type_, doc2vec_path):
    """Split *data_frame* into train/test sets and embed the texts with doc2vec.

    Args:
        data_frame: frame providing the 'text' and 'patent_id' columns read here.
        type_: classification vectorizer type; when it contains '/', only the
            part after the last '/' is passed to the classification vectorizer.
        doc2vec_path: location handed to the doc2vec training helpers.

    Returns:
        Tuple ``(train_vectors, test_vectors, y_train, y_test, classes,
        n_classes, patent_ids)`` where ``patent_ids`` is the 'patent_id'
        column of the training split.
    """
    vectorizer_type = type_.rsplit('/', 1)[-1]

    label_matrix, classes, n_classes = apply_classification_vectorizer(vectorizer_type, data_frame)
    print('finisched class vect')

    # A sequential 80/20 split for types containing '/' was tried before and
    # is disabled; every type goes through get_train_test_from_data.
    train_frame, test_frame, y_train, y_test = get_train_test_from_data(data_frame, label_matrix)
    print('finisched get train test')

    patent_ids = train_frame['patent_id']
    print('finisched ids')

    doc2vec_helper = wmh.Dov2VecHelper()

    # Hard-coded switch: with False the model is always trained from scratch
    # and the loading branch below is never taken.
    load_doc2vec = False
    # Row count up to which the list-based helpers are used; the original
    # comment relates it to the experiment with 5 training years.
    max_rows_for_list_variant = 475052+1

    if load_doc2vec:
        sequence_size, embedding_size = 1, 200
        training_docs_list = train_frame['patent_id']
        test_docs_list = test_frame['patent_id']
        doc2vec_path = fh.link_paths(doc2vec_path, 'doc2vec_model_reference')
        train_vectors, test_vectors, _ = get_df_data(
            2, training_docs_list, test_docs_list, None, sequence_size, embedding_size, doc2vec_path)
    elif data_frame.shape[0] <= max_rows_for_list_variant:
        print('finisched doc2vec helpers - if -> array')
        train_docs = doc2vec_helper.label_sentences(train_frame['text'], 'Train')
        print('finisched label sent')
        test_docs = doc2vec_helper.label_sentences(test_frame['text'], 'Test')
        print('finisched label sent')
        corpus = train_docs + test_docs
        print('finisched all data')
        dbow_model = wmh.train_doc2vec(corpus, doc2vec_path)
        print('finisched train doc2vec')
        train_vectors = doc2vec_helper.get_vectors(dbow_model, len(train_docs), 200, 'Train')
        print('finisched get vect: ', len(train_docs))
        test_vectors = doc2vec_helper.get_vectors(dbow_model, len(test_docs), 200, 'Test')
        print('finisched get vect: ', len(test_docs))
    else:
        print('finisched doc2vec helpers - if -> dataframe')
        train_docs = doc2vec_helper.alternative_label_sentences(train_frame['text'], 'Train')
        print('finisched label sent')
        test_docs = doc2vec_helper.alternative_label_sentences(test_frame['text'], 'Test')
        print('finisched label sent')
        corpus = train_docs.append(test_docs)
        print('finisched all data')
        dbow_model = wmh.train_alternative_doc2vec(corpus, doc2vec_path)
        print('finisched train doc2vec')
        train_vectors = doc2vec_helper.alternative_get_vectors(dbow_model, len(train_docs), 200, 'Train')
        print('finisched get vect: ', len(train_docs))
        test_vectors = doc2vec_helper.alternative_get_vectors(dbow_model, len(test_docs), 200, 'Test')
        print('finisched get vect: ', len(test_docs))

    return train_vectors, test_vectors, y_train, y_test, classes, n_classes, patent_ids