def test_model(docs, labels, model, log_writer: LogWriter, test_name):
    """
    Tests the provided model instance and logs the results under the provided test_name.
    :param docs: documents to be analysed
    :param labels: true topic labels of the documents
    :param model: model to be tested
    :param log_writer: LogWriter instance used for output
    :param test_name: name which will be used for output
    :return: accuracy in range (0 - 1)
    """
    stats = []
    topic_indexes, topics_of_index = connect_topic_id_to_topics(
        model, prep_docs_for_assesment(docs, labels), log_writer)
    distribution = []
    for index, article in enumerate(docs):
        analysis_res = model.analyse_text(article)
        if len(analysis_res) == 0:
            print("nothing found")
            continue
        res = max(analysis_res, key=lambda item: item[1])
        if res[0] not in topics_of_index:
            topics_of_index[res[0]] = [labels[index]]
            topic_indexes[labels[index]] = res[0]
            print("continuing")
            continue
        distribution.append(res[0])
        stats.append(1 if labels[index] in topics_of_index[res[0]] else 0)
        # self.log_writer.add_log("Article with topic {} was assigned {} with {} certainty.".format(article[0], "correctly" if res[0] == self.topic_positions[article[0]] else "wrong", res[1]))
    accuracy = sum(stats) / len(stats)
    log_writer.add_log("{} got accuracy {}".format(test_name, accuracy))
    log_writer.add_log("Real distribution was {}".format(dict(Counter(labels))))
    log_writer.add_log("Predicted distribution was {}".format(dict(Counter(distribution))))
    return accuracy
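# Illustrative usage sketch (assumption, not part of the original file): test_model
# accepts any model object whose analyse_text(text) returns a list of
# (topic_id, score) pairs, e.g. the Lda wrapper trained in the LDA script of this
# section. The helper calls below mirror that script; 'LDA_eval' is a hypothetical
# output label.
# dataset_helper = Dataset_Helper(True)
# dataset_helper.next_dataset()
# documents = dataset_helper.get_texts_as_list()
# labels = dataset_helper.get_labels(dataset_helper.get_train_file_path())
# lda = Lda(dataset_helper.get_num_of_topics(), 20, passes=25, iterations=25)
# lda.train(documents)
# test_model(documents, labels, lda, LogWriter(log_file_desc='LDA_eval'), 'LDA_eval')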
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
import os
import sys
from neural_networks.aliaser import *
import tkinter as tk
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='CONV_GRU_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False, split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
        patience=param[2], verbose=0, mode='auto', baseline=None,
        restore_best_weights=False)
])

weight_in = autoencoder.get_weights()[0]
weight_out = autoencoder.get_weights()[2]
#tst = autoencoder.get_weights()
blob = np.array([])
weight_in = weight_in.transpose()
#combined_weight = np.dot(weight_in.transpose(), weight_out)
num_of_important_words = 20
log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, ""),
                       result_desc="NeuralTopicModel")
log_writer.write_any('model', autoencoder.to_json(), 'w+', True)

loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'g', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss {}'.format(
    dataset_helper.get_dataset_name()))
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig(
    log_writer.get_plot_path(dataset_helper.get_dataset_name(), "loss"))
plt.clf()
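# Note (assumption about the architecture, not stated in this excerpt): for a Keras
# model built from Dense layers, model.get_weights() returns the variables in layer
# order as [kernel, bias, kernel, bias, ...]. Indices 0 and 2 above would then be
# the encoder and decoder kernels of a two-Dense-layer autoencoder; transposing the
# encoder kernel yields a (topics x vocabulary) matrix whose rows can later be
# ranked for their most important words.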
from training_text_generator_RNN import Training_Text_Generator_RNN
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
import os
import sys
from neural_networks.aliaser import *

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
"""config = tf.ConfigProto(
    device_count = {'GPU': 1, 'CPU': 4}
)
sess = tf.Session(config=config)
keras.backend.set_session(sess)"""

datasets_helper = Dataset_Helper(preprocess=True)
datasets_helper.set_wanted_datasets([0])
results_saver = LogWriter(log_file_desc="Bidirectional-no-relu")
results = []
num_of_words = 15000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = 200  #datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words)  #,
    #filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
    #lower=False, split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter, finish_dataset
import os
import sys
from neural_networks.aliaser import *
import tkinter as tk
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
datasets_helper.set_wanted_datasets([3])
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='Dense_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    tokenizer = Tokenizer(num_words=num_of_words)
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
    epochs = 1
    batch_size = 256
    val_split = 0.2
    ModelType.RF: {
        'n_estimators': 20,
        'max_features': max_feauters
    },
    ModelType.DT: {
        'max_features': max_feauters
    }
}

start_time = get_time_in_millis()
preprocess = True
models_for_test = test_model.keys()
for model in models_for_test:
    if not test_model[model]:
        continue
    log_writer = LogWriter(log_file_desc='_{}_{}'.format(
        'prep' if preprocess else 'no-prep', model.name),
        result_desc='Classic')
    tester = GeneralTester(log_writer, start_time)
    datasets_helper = Dataset_Helper(preprocess=preprocess)
    datasets_helper.set_wanted_datasets([0, 2, 3])
    while datasets_helper.next_dataset():
        if 'topic_count' in models_params[model]:
            models_params[model]['topic_count'] = datasets_helper.get_num_of_topics()
        topic_names = [(index, item) for index, item in enumerate(
            datasets_helper.get_dataset_topic_names())]
        tester.set_new_dataset(datasets_helper.get_num_of_topics(), topic_names)
        output_csv = []
        """for key,value in test_model.items():
            if not value:
num_of_words = 10000
dataset_helper = Dataset_Helper(True)
dataset_helper.set_wanted_datasets([param[1]])
dataset_helper.next_dataset()
num_of_topics = dataset_helper.get_num_of_topics()
documents = dataset_helper.get_texts_as_list()
labels = dataset_helper.get_labels(dataset_helper.get_train_file_path())

tokenizer = Tokenizer(num_words=num_of_words)
tokenizer.fit_on_texts(documents)
#items= tokenizer.word_index
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
matrix = tokenizer.texts_to_matrix(documents, mode='binary')

num_of_important_words = 20
log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, ""),
                       result_desc="NeuralTopicModel")

model = Lda(num_of_topics, num_of_important_words, passes=25, iterations=25)
"""gensim.models.LdaModel(
    doc_term_matrix,
    num_topics=num_of_topics,
    id2word=dictionary,
    passes=2,
    iterations=2)"""

#LDA section
model.train(documents)
topic_words_lda = extract_important_words(model.get_topics(), True)
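# Illustrative follow-up (assumption, not from the original file): the trained Lda
# instance can be scored with the test_model helper defined earlier in this section,
# reusing the documents, labels and log_writer created above; 'LDA' is just a label
# for the log output.
# accuracy = test_model(documents, labels, model, log_writer, 'LDA')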
        min_delta=0, patience=1000, verbose=0, mode='auto',
        baseline=None, restore_best_weights=False)
])

weight_in = autoencoder.get_weights()[2]
weight_out = autoencoder.get_weights()[4]
blob = np.array([])
weight_in = weight_in.transpose()
#tst = autoencoder.get_weights()
num_of_important_words = 20

from results_saver import LogWriter
log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, regularization))

loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'g', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss {}'.format(
    dataset_helper.get_dataset_name()))
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig(log_writer.get_plot_path(dataset_helper.get_dataset_name(), "loss"))
plt.clf()
"""topic_words_in = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_in]
topic_words_out = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_out]
        matrix[b_ind * batch_size:(b_ind + 1) * batch_size])
#autoencoder.fit(matrix,matrix,batch_size=32,epochs=1,validation_split=0.1, verbose=2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0, patience=50, verbose=0, mode='auto', baseline=None, restore_best_weights=False)])

weights = autoencoder.get_weights()
weights[0] = normalize(weights[0], norm_to_use, 0)
weights[2] = normalize(weights[2], norm_to_use, 1)
autoencoder.set_weights(weights)

weight_in = autoencoder.get_weights()[0]
weight_out = autoencoder.get_weights()[2]
#tst = autoencoder.get_weights()
blob = np.array([])
weight_in = weight_in.transpose()
#combined_weight = np.dot(weight_in.transpose(), weight_out)
num_of_important_words = 20

log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, regularization))
log_writer.write_any('model', autoencoder.to_json(), 'w+', True)
"""topic_words_in = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_in]
topic_words_out = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_out]
log_writer = LogWriter(log_file_desc='LDATestsRegularize{}'.format(regularization))
log_writer.write_2D_list('topic_words_in', topic_words_in)
log_writer.write_2D_list('topic_words_out', topic_words_out)"""

topic_words_in_max = get_extremes(weight_in, num_of_topics, num_of_important_words,
                                  reverse_word_map, True, 'topic_words_in_max',
                                  log_writer, dataset_helper.get_dataset_name())
topic_words_in_min = get_extremes(weight_in, num_of_topics, num_of_important_words,
                                  reverse_word_map, False, 'topic_words_in_min',
                                  log_writer,
        'dropouts': hp.randint('dropouts', 3),
        'dropout_values': hp.uniform('dropout_values', 0.01, 0.2),
        'epochs': 20,  #hp.randint('epochs',20),
        'batch_size': batch_size,
        'learning_rate': hp.choice('learning_rate', [0.001, 0.01, 0.0005]),
        'optimizer': hp.choice('optimizer', ['adam', 'rmsprop']),
        'results_saver': results_saver
    }
    return space


file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

datasets_helper = Dataset_Helper(False)
results_saver = LogWriter(log_file_desc="hyperopt-best-param-search")
results = []
datasets_helper.set_wanted_datasets([1])
models_to_test = ['lstm', 'dense', 'embedding', 'bidi']
"""datasets_helper.next_dataset()
space = create_base_params('lstm',datasets_helper)
smpl = sample(space)
print(sample(space))"""

for model in models_to_test:
    while datasets_helper.next_dataset():
        space = create_base_params(model, datasets_helper, results_saver)
        best = fmin(optimize_model,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=30,
                    max_queue_len=1,
        return
    for item in all_vars[depth]:
        f = [a for a in field]
        f.append(item)
        create_variations(depth + 1, f, all_vars, possibilities)


def get_time_in_millis():
    """
    :return: system time in milliseconds
    """
    return int(round(time.time() * 1000))


log_writer = LogWriter("log.txt")
base_path = os.getcwd()
csv_folder = base_path + "\\csv_folder\\"
data_sets = [
    (csv_folder + "4" + "\\train.csv", csv_folder + "4" + "\\test.csv", 20, "-20newsgroups-"),
    (csv_folder + "1" + "\\train.csv", csv_folder + "1" + "\\test.csv", 10, "-reuters-")
]  #,(csv_folder+"2"+"\\train.csv",csv_folder+"2"+"\\test.csv",14)]
#data_sets = [(csv_folder+"2"+"\\train.csv",csv_folder+"2"+"\\test.csv",14)]

strip_nums_params = use_stemmer_params = use_lemmatizer_params = strip_short_params = [
    True, False
]
preproces_all_vals = [
    strip_nums_params, use_stemmer_params, use_lemmatizer_params,