def test_model(docs, labels, model, log_writer: LogWriter, test_name):
    """Test a topic model on labeled documents and log accuracy/distributions.

    For each document the model's highest-confidence topic is taken. A model
    topic id seen for the first time is bound to the document's true label
    (and that document is not scored); subsequently, a prediction counts as
    correct when the true label is among the labels bound to that topic id.

    :param docs: sequence of document texts to classify
    :param labels: true topic label per document, aligned with ``docs``
    :param model: model to be tested; must expose ``analyse_text(text)``
        returning (topic_id, score) pairs
    :param log_writer: LogWriter receiving result log lines
    :param test_name: name which will be used for output
    :return: accuracy in range (0 - 1); 0.0 when no document could be scored
    """
    stats = []
    topic_indexes, topics_of_index = connect_topic_id_to_topics(
        model, prep_docs_for_assesment(docs, labels), log_writer)
    distribution = []
    for index, article in enumerate(docs):
        analysis_res = model.analyse_text(article)
        if not analysis_res:
            # Model produced no topic estimate for this document; skip it.
            print("nothing found")
            continue
        # Highest-confidence (topic_id, score) pair.
        res = max(analysis_res, key=lambda item: item[1])
        if res[0] not in topics_of_index:
            # First occurrence of this model topic id: bind it to the true
            # label and do not score this document.
            topics_of_index[res[0]] = [labels[index]]
            topic_indexes[labels[index]] = res[0]
            print("continuing")
            continue
        distribution.append(res[0])
        stats.append(1 if labels[index] in topics_of_index[res[0]] else 0)
    if not stats:
        # Every document was skipped; report and avoid ZeroDivisionError.
        log_writer.add_log(
            "{} could not be scored: no document produced a usable topic".format(
                test_name))
        return 0.0
    accuracy = sum(stats) / len(stats)
    log_writer.add_log("{} got accuracy {}".format(test_name, accuracy))
    log_writer.add_log("Real distribution was {}".format(dict(
        Counter(labels))))
    log_writer.add_log("Predicted distribution was {}".format(
        dict(Counter(distribution))))
    return accuracy
# --- Script: CONV_GRU experiment setup (flattened source, reformatted). ---
# NOTE(review): this excerpt appears truncated — the while-loop body most
# likely continues past the last visible statement (batch_size, gauss_noise
# and epochs are set but never used here). Indentation below is reconstructed;
# confirm against the original file.
from tkinter import simpledialog

# Make sibling modules importable when run as a script.
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

# Hidden Tk root so simpledialog can prompt without showing a main window.
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
# Operator-supplied test name becomes part of the log file description.
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='CONV_GRU_'))
results = []
num_of_words = 10000  # tokenizer vocabulary cap

# Iterate over every available dataset.
while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    # 10% of training texts reserved for validation counting.
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    # NOTE(review): filter deliberately(?) keeps '!', '?', '.', ',' — confirm.
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False, split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
    # Training hyperparameters for this run.
    batch_size = 256
    gauss_noise = 0.5
    epochs = 1
    val_split = 0.2
    val_data_count = int(datasets_helper.get_num_of_train_texts() * val_split)
# --- Script: GRU experiment setup (flattened source, reformatted). ---
# NOTE(review): this excerpt appears truncated — the while-loop body most
# likely continues past the last visible statement (epochs, batch_size and
# enhanced_num_of_topics are set but never used here). Indentation below is
# reconstructed; confirm against the original file.
from tkinter import simpledialog

# Make sibling modules importable when run as a script.
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

# Hidden Tk root so simpledialog can prompt without showing a main window.
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
# Operator-supplied test name becomes part of the log file description.
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='GRU_'))
results = []
num_of_words = 10000  # tokenizer vocabulary cap

# Iterate over every available dataset.
while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    # 10% of training texts reserved for validation counting.
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    # NOTE(review): filter deliberately(?) keeps '!', '?', '.', ',' — confirm.
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False, split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
    # Training hyperparameters for this run.
    epochs = 1
    val_split = 0.2
    batch_size = 512
    val_data_count = int(datasets_helper.get_num_of_train_texts() * val_split)
    enhanced_num_of_topics = 128
# --- Script fragment (flattened source): per-dataset model test setup. ---
# NOTE(review): this fragment is cut at BOTH edges. The first assignment is
# missing its left-hand side (presumably something like
# ``models_params[model]['topic_count']`` or ``test_params['topic_count']``),
# and the trailing ``test_params`` dict literal is missing its closing brace.
# Tokens are preserved exactly; only formatting/comments added.
'topic_count'] = datasets_helper.get_num_of_topics()
# Pair each topic index with its human-readable name.
topic_names = [(index, item) for index, item in enumerate(
    datasets_helper.get_dataset_topic_names())]
tester.set_new_dataset(datasets_helper.get_num_of_topics(), topic_names)
output_csv = []
# Disabled code kept from the original (pruning models not selected for test).
"""for key,value in test_model.items():
    if not value:
        models_params.pop(key)"""
# Persist the model settings used for this run alongside the results.
log_writer.write_any("model-settings", json.dumps(models_params[model]), 'w+',
                     True)
# Fixed seed for reproducibility.
# NOTE(review): ``seed`` is assigned but ``random.seed(5)`` uses the literal.
seed = 5
random.seed(5)
log_writer.add_log(
    "Starting preprocessing texts of {} for training".format(
        datasets_helper.get_dataset_name()))
texts_for_train = datasets_helper.get_dataset(DatasetType.TRAIN)
log_writer.add_log("Preprocessing finished")
log_writer.add_log(
    "Starting preprocessing texts of {} for testing".format(
        datasets_helper.get_dataset_name()))
texts_for_testing = datasets_helper.get_dataset(DatasetType.TEST)
log_writer.add_log("Preprocessing finished")
statistics = []
tester.set_new_preprocess_docs(texts_for_train, texts_for_testing)
# Parameters handed to the tester for this dataset (dict continues past the
# end of this excerpt).
test_params = {
    "dataset_name": datasets_helper.get_dataset_name(),
    'dataset_helper': datasets_helper
# --- Script: hyperopt parameter search over model types x datasets. ---
# (Flattened source, reformatted; indentation reconstructed.)
models_to_test = ['lstm', 'dense', 'embedding', 'bidi']
# Disabled scratch code kept from the original.
"""datasets_helper.next_dataset()
space = create_base_params('lstm',datasets_helper)
smpl = sample(space)
print(sample(space))"""
for model in models_to_test:
    # Run the TPE search on every dataset for this model type.
    while datasets_helper.next_dataset():
        space = create_base_params(model, datasets_helper, results_saver)
        best = fmin(optimize_model,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=30,
                    max_queue_len=1,
                    verbose=False)
        results_saver.add_log(
            'Best params for network type {} and dataset {} are: {}\n{}'.
            format(model, datasets_helper.get_dataset_name(), best,
                   space_eval(space, best)))
        results_saver.write_any('best_params', [
            model,
            datasets_helper.get_dataset_name(),
            space_eval(space, best)
        ], 'a')
        #results_saver.write_2D_list([[model,datasets_helper.get_dataset_name(),best]],'best_params','a')
    # NOTE(review): placement reconstructed — the reset must sit inside the
    # for-loop (after the while) so the next model iterates all datasets
    # again; confirm against the original file.
    datasets_helper.reset_dataset_counter()
# Disabled hyperas-style code kept from the original.
"""best_run, best_model = optim.minimize(model=test,
                                          data=[],
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials())"""
# --- Script fragment: LSA variations + preprocessing-settings sweep. ---
# NOTE(review): flattened source, reformatted; indentation reconstructed.
# References ``preproces_variations``, ``log_writer``, ``data_sets`` and a
# loop index ``i`` defined outside this excerpt; the for-loop body likely
# continues past the last visible statement.
lsa_all_vals = [
    lsa_one_pass, lsa_power_iter, lsa_use_tfidf, lsa_topic_nums
]
lsa_variations = []
# Enumerate the cartesian product of LSA parameter values into lsa_variations.
create_variations(0, [], lsa_all_vals, lsa_variations)
statistics_to_merge = []
for index, preproces_settings in enumerate(preproces_variations):
    # NOTE(review): ``seed`` is assigned but not used in this excerpt.
    seed = 5
    # Unpack the 4-tuple of preprocessing flags into named settings.
    settings = {
        'strip_nums': preproces_settings[0],
        'use_stemmer': preproces_settings[1],
        'use_lemmatizer': preproces_settings[2],
        'strip_short': preproces_settings[3]
    }
    log_writer.add_log(
        "Initializing text preprocessor with strip_nums: {}, use_stemmer: {}, use_lemmatizer {}, strip_short: {}."
        .format(preproces_settings[0], preproces_settings[1],
                preproces_settings[2], preproces_settings[3]))
    text_preprocessor = TextPreprocessor(settings)
    log_writer.add_log(
        "Starting preprocessing texts of {} for training".format(
            data_sets[i][0]))
    # Training texts: no topic assessment flag (False).
    texts_for_train = text_preprocessor.load_and_prep_csv(
        [data_sets[i][0]], "eng", False, 1, ';')
    log_writer.add_log("Preprocessing finished")
    # NOTE(review): log message says "for training" again although these
    # texts are used for topic assessment — possible copy-paste slip.
    log_writer.add_log(
        "Starting preprocessing texts of {} for training".format(
            data_sets[i][0]))
    texts_for_topic_asses = text_preprocessor.load_and_prep_csv(
        [data_sets[i][0]], "eng", True, 1, ';')