# Code example #1
# 0
def test_model(docs, labels, model, log_writer: LogWriter, test_name):
    """
    Tests provided instance of a model and outputs results using provided test_name.

    :param docs: sequence of documents to classify
    :param labels: true topic label for each document (aligned with docs)
    :param model: model to be tested; must expose analyse_text(doc) returning
                  (topic_id, certainty) pairs
    :param log_writer: LogWriter used to record accuracy and distributions
    :param test_name: name which will be used for output
    :return: accuracy in range (0 - 1); 0.0 if no document could be scored
    """
    stats = []
    # Map the model's internal topic ids onto known labels via an assessment
    # pass (helpers are defined elsewhere in this project).
    topic_indexes, topics_of_index = connect_topic_id_to_topics(
        model, prep_docs_for_assesment(docs, labels), log_writer)
    distribution = []
    for index, article in enumerate(docs):
        analysis_res = model.analyse_text(article)
        if not analysis_res:
            print("nothing found")
            continue
        # Keep only the topic the model is most certain about.
        res = max(analysis_res, key=lambda item: item[1])
        if res[0] not in topics_of_index:
            # Unseen topic id: remember the association but do not score it.
            topics_of_index[res[0]] = [labels[index]]
            topic_indexes[labels[index]] = res[0]
            print("continuing")
            continue
        distribution.append(res[0])
        stats.append(1 if labels[index] in topics_of_index[res[0]] else 0)
        # self.log_writer.add_log("Article with topic {} was assigned {} with {} certainty.".format(article[0], "correctly" if res[0] == self.topic_positions[article[0]] else "wrong", res[1]))
    # Guard against ZeroDivisionError when every document was skipped above.
    accuracy = sum(stats) / len(stats) if stats else 0.0
    log_writer.add_log("{} got accuracy {}".format(test_name, accuracy))
    log_writer.add_log("Real distribution was {}".format(dict(
        Counter(labels))))
    log_writer.add_log("Predicted distribution was {}".format(
        dict(Counter(distribution))))
    return accuracy
# Code example #2
# 0
from tkinter import simpledialog

# Make sibling modules importable when the script is run directly.
# NOTE(review): assumes `os`, `sys` and `tkinter as tk` are imported earlier
# in the full file — confirm.
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
# Hidden Tk root window so only the test-name prompt dialog is shown.
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
# Tag this run's log output with a user-supplied test name.
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='CONV_GRU_'))
results = []
num_of_words = 10000  # vocabulary size handed to the tokenizer

# Process every dataset the helper can advance to.
while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    # 10% of the training texts reserved for validation.
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")

    # Hyper-parameters for the CONV_GRU model; model construction presumably
    # continues beyond this excerpt — the values below are unused in view.
    batch_size = 256
    gauss_noise = 0.5
    epochs = 1
    val_split = 0.2
    val_data_count = int(datasets_helper.get_num_of_train_texts() * val_split)
# Code example #3
# 0
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='GRU_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")

    epochs = 1
    val_split = 0.2
    batch_size = 512
    val_data_count = int(datasets_helper.get_num_of_train_texts() * val_split)
    enhanced_num_of_topics = 128
# Code example #4
# 0
                'topic_count'] = datasets_helper.get_num_of_topics()
        # Register the current dataset's (index, name) topic pairs with the tester.
        topic_names = [(index, item) for index, item in enumerate(
            datasets_helper.get_dataset_topic_names())]
        tester.set_new_dataset(datasets_helper.get_num_of_topics(),
                               topic_names)
        output_csv = []
        """for key,value in test_model.items():
            if not value:
                models_params.pop(key)"""
        # Persist the model configuration so the run can be reproduced.
        log_writer.write_any("model-settings",
                             json.dumps(models_params[model]), 'w+', True)
        seed = 5
        # NOTE(review): random.seed is called with the literal 5, not `seed` —
        # the variable appears unused in this excerpt.
        random.seed(5)

        log_writer.add_log(
            "Starting preprocessing texts of {} for training".format(
                datasets_helper.get_dataset_name()))
        texts_for_train = datasets_helper.get_dataset(DatasetType.TRAIN)
        log_writer.add_log("Preprocessing finished")

        log_writer.add_log(
            "Starting preprocessing texts of {} for testing".format(
                datasets_helper.get_dataset_name()))
        texts_for_testing = datasets_helper.get_dataset(DatasetType.TEST)
        log_writer.add_log("Preprocessing finished")

        statistics = []
        tester.set_new_preprocess_docs(texts_for_train, texts_for_testing)
        test_params = {
            "dataset_name": datasets_helper.get_dataset_name(),
            'dataset_helper': datasets_helper
# Code example #5
# 0
# Network architectures to tune with hyperopt.
models_to_test = ['lstm', 'dense', 'embedding', 'bidi']
"""datasets_helper.next_dataset()
space = create_base_params('lstm',datasets_helper)
smpl = sample(space)
print(sample(space))"""
for model in models_to_test:
    # Run the hyper-parameter search once per available dataset.
    while datasets_helper.next_dataset():
        space = create_base_params(model, datasets_helper, results_saver)
        # Tree-structured Parzen Estimator search over the parameter space.
        best = fmin(optimize_model,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=30,
                    max_queue_len=1,
                    verbose=False)
        results_saver.add_log(
            'Best params for network type {} and dataset {} are: {}\n{}'.
            format(model, datasets_helper.get_dataset_name(), best,
                   space_eval(space, best)))
        # space_eval converts hyperopt's index-based `best` back to real values.
        results_saver.write_any('best_params', [
            model,
            datasets_helper.get_dataset_name(),
            space_eval(space, best)
        ], 'a')
        #results_saver.write_2D_list([[model,datasets_helper.get_dataset_name(),best]],'best_params','a')
    # Rewind so the next model type iterates over all datasets again.
    datasets_helper.reset_dataset_counter()
"""best_run, best_model = optim.minimize(model=test,
                                          data=[],
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials())"""
# Code example #6
# 0
    # All LSA hyper-parameter axes to be expanded into concrete configurations.
    lsa_all_vals = [
        lsa_one_pass, lsa_power_iter, lsa_use_tfidf, lsa_topic_nums
    ]
    lsa_variations = []
    # Fills lsa_variations with combinations of the value lists above
    # (create_variations is defined elsewhere in this project).
    create_variations(0, [], lsa_all_vals, lsa_variations)
    statistics_to_merge = []
    for index, preproces_settings in enumerate(preproces_variations):
        seed = 5  # NOTE(review): assigned but not visibly used in this excerpt
        # Unpack the 4-tuple of preprocessing flags into a named settings dict.
        settings = {
            'strip_nums': preproces_settings[0],
            'use_stemmer': preproces_settings[1],
            'use_lemmatizer': preproces_settings[2],
            'strip_short': preproces_settings[3]
        }
        log_writer.add_log(
            "Initializing text preprocessor with strip_nums: {}, use_stemmer: {}, use_lemmatizer {}, strip_short: {}."
            .format(preproces_settings[0], preproces_settings[1],
                    preproces_settings[2], preproces_settings[3]))
        text_preprocessor = TextPreprocessor(settings)

        log_writer.add_log(
            "Starting preprocessing texts of {} for training".format(
                data_sets[i][0]))
        # Training texts: third argument False — TODO confirm its meaning
        # against load_and_prep_csv's signature.
        texts_for_train = text_preprocessor.load_and_prep_csv(
            [data_sets[i][0]], "eng", False, 1, ';')
        log_writer.add_log("Preprocessing finished")

        log_writer.add_log(
            "Starting preprocessing texts of {} for training".format(
                data_sets[i][0]))
        # Topic-assessment texts: same CSV, third argument True this time.
        texts_for_topic_asses = text_preprocessor.load_and_prep_csv(
            [data_sets[i][0]], "eng", True, 1, ';')