Example #1
# LogWriter, connect_topic_id_to_topics and prep_docs_for_assesment come from the
# project's own modules; their imports are not shown in this snippet.
from collections import Counter

def test_model(docs, labels, model, log_writer: LogWriter, test_name):
    """
    Tests provided instance of a model and outputs results using provided test_name
    :param model: model to be tested
    :param test_name: name which will be used for output
    :return: accuracy in range (0 - 1)
    """
    stats = []
    topic_indexes, topics_of_index = connect_topic_id_to_topics(
        model, prep_docs_for_assesment(docs, labels), log_writer)
    distribution = []
    for index, article in enumerate(docs):
        analysis_res = model.analyse_text(article)
        if len(analysis_res) == 0:
            print("No topics returned for document {}; skipping.".format(index))
            continue
        res = max(analysis_res, key=lambda item: item[1])
        if res[0] not in topics_of_index:
            # First occurrence of this model topic: remember which label it maps to and
            # skip the document, since there was no existing mapping to score against.
            topics_of_index[res[0]] = [labels[index]]
            topic_indexes[labels[index]] = res[0]
            print("Topic {} seen for the first time; mapping it to label {}.".format(
                res[0], labels[index]))
            continue
        distribution.append(res[0])
        stats.append(1 if labels[index] in topics_of_index[res[0]] else 0)
        # self.log_writer.add_log("Article with topic {} was assigned {} with {} certainty.".format(article[0], "correctly" if res[0] == self.topic_positions[article[0]] else "wrong", res[1]))
    accuracy = sum(stats) / len(stats) if stats else 0.0  # avoid division by zero when nothing was scored
    log_writer.add_log("{} got accuracy {}".format(test_name, accuracy))
    log_writer.add_log("Real distribution was {}".format(dict(
        Counter(labels))))
    log_writer.add_log("Predicted distribution was {}".format(
        dict(Counter(distribution))))
    return accuracy
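
# Hedged usage sketch (not part of the original example): it wires test_model to the
# Dataset_Helper, Lda and LogWriter classes shown in the other examples; their exact
# constructors and import paths may differ in the real project.
if __name__ == '__main__':
    dataset_helper = Dataset_Helper(preprocess=True)
    dataset_helper.next_dataset()
    docs = dataset_helper.get_texts_as_list()
    labels = dataset_helper.get_labels(dataset_helper.get_train_file_path())
    lda_model = Lda(dataset_helper.get_num_of_topics(), 20, passes=25, iterations=25)
    lda_model.train(docs)
    accuracy = test_model(docs, labels, lda_model,
                          LogWriter(log_file_desc='lda-eval'), 'lda-eval')
    print('Accuracy: {:.3f}'.format(accuracy))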
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
import os
import sys
from neural_networks.aliaser import *
import tkinter as tk
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='CONV_GRU_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words,
                          filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
                                                patience=param[2],
                                                verbose=0,
                                                mode='auto',
                                                baseline=None,
                                                restore_best_weights=False)
                              ])
    weight_in = autoencoder.get_weights()[0]
    weight_out = autoencoder.get_weights()[2]
    #tst = autoencoder.get_weights()
    blob = np.array([])

    weight_in = weight_in.transpose()
    #combined_weight = np.dot(weight_in.transpose(), weight_out)
    num_of_important_words = 20

    log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, ""),
                           result_desc="NeuralTopicModel")

    log_writer.write_any('model', autoencoder.to_json(), 'w+', True)
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'g', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss {}'.format(
        dataset_helper.get_dataset_name()))
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(
        log_writer.get_plot_path(dataset_helper.get_dataset_name(), "loss"))
    plt.clf()
from training_text_generator_RNN import Training_Text_Generator_RNN
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter
import os
import sys
from neural_networks.aliaser import *

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
"""config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 4} )
sess = tf.Session(config=config)
keras.backend.set_session(sess)"""

datasets_helper = Dataset_Helper(preprocess=True)
datasets_helper.set_wanted_datasets([0])
results_saver = LogWriter(log_file_desc="Bidirectional-no-relu")
results = []
num_of_words = 15000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    validation_count = 200  #datasets_helper.get_num_of_train_texts() // 10
    tokenizer = Tokenizer(num_words=num_of_words)  #,
    #filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
    #lower=False, split=' ')
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")
Example #5
from dataset_loader.dataset_helper import Dataset_Helper
from results_saver import LogWriter, finish_dataset
import os
import sys
from neural_networks.aliaser import *
import tkinter as tk
from tkinter import simpledialog

file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
root = tk.Tk()
root.withdraw()

preprocess = True
datasets_helper = Dataset_Helper(preprocess)
datasets_helper.set_wanted_datasets([3])
results_saver = LogWriter(log_file_desc=simpledialog.askstring(
    title="Test Name", prompt="Insert test name:", initialvalue='Dense_'))
results = []
num_of_words = 10000

while datasets_helper.next_dataset():
    results_saver.add_log("Starting testing dataset {}".format(
        datasets_helper.get_dataset_name()))
    tokenizer = Tokenizer(num_words=num_of_words)
    generator = datasets_helper.text_generator()
    results_saver.add_log("Starting preprocessing and tokenization.")
    tokenizer.fit_on_texts(generator)
    results_saver.add_log("Done. Building model now.")

    epochs = 1
    batch_size = 256
    val_split = 0.2
Example #6
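# What follows is the tail of what appears to be the models_params dictionary used
# further down: per-model hyperparameters keyed by ModelType.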
    ModelType.RF: {
        'n_estimators': 20,
        'max_features': max_feauters
    },
    ModelType.DT: {
        'max_features': max_feauters
    }
}
start_time = get_time_in_millis()
preprocess = True
models_for_test = test_model.keys()
for model in models_for_test:
    if not test_model[model]:
        continue
    log_writer = LogWriter(log_file_desc='_{}_{}'.format(
        'prep' if preprocess else 'no-prep', model.name),
                           result_desc='Classic')
    tester = GeneralTester(log_writer, start_time)
    datasets_helper = Dataset_Helper(preprocess=preprocess)
    datasets_helper.set_wanted_datasets([0, 2, 3])
    while datasets_helper.next_dataset():
        if 'topic_count' in models_params[model]:
            models_params[model][
                'topic_count'] = datasets_helper.get_num_of_topics()
        topic_names = [(index, item) for index, item in enumerate(
            datasets_helper.get_dataset_topic_names())]
        tester.set_new_dataset(datasets_helper.get_num_of_topics(),
                               topic_names)
        output_csv = []
        """for key,value in test_model.items():
            if not value:
    num_of_words = 10000

    dataset_helper = Dataset_Helper(True)
    dataset_helper.set_wanted_datasets([param[1]])
    dataset_helper.next_dataset()
    num_of_topics = dataset_helper.get_num_of_topics()
    documents = dataset_helper.get_texts_as_list()
    labels = dataset_helper.get_labels(dataset_helper.get_train_file_path())
    tokenizer = Tokenizer(num_words=num_of_words)
    tokenizer.fit_on_texts(documents)
    #items= tokenizer.word_index
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    matrix = tokenizer.texts_to_matrix(documents, mode='binary')

    num_of_important_words = 20
    log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, ""),
                           result_desc="NeuralTopicModel")

    model = Lda(num_of_topics,
                num_of_important_words,
                passes=25,
                iterations=25)
    """gensim.models.LdaModel(
    doc_term_matrix,
    num_topics=num_of_topics,
    id2word=dictionary,
    passes=2,
    iterations=2)"""

    #LDA section
    model.train(documents)
    topic_words_lda = extract_important_words(model.get_topics(), True)
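
    # Hedged sketch of the raw gensim pipeline that the commented-out LdaModel block
    # above refers to: plain gensim (corpora.Dictionary + doc2bow) with a naive
    # whitespace tokenization, shown here instead of the project's Lda wrapper.
    import gensim
    tokenized_docs = [doc.split() for doc in documents]
    dictionary = gensim.corpora.Dictionary(tokenized_docs)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_docs]
    lda_raw = gensim.models.LdaModel(doc_term_matrix,
                                     num_topics=num_of_topics,
                                     id2word=dictionary,
                                     passes=25,
                                     iterations=25)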
Example #8
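# Tail of an EarlyStopping callback in the callbacks list of the autoencoder training
# call; the call's earlier arguments are not shown in this example.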
                                            min_delta=0,
                                            patience=1000,
                                            verbose=0,
                                            mode='auto',
                                            baseline=None,
                                            restore_best_weights=False)
                          ])
weight_in = autoencoder.get_weights()[2]
weight_out = autoencoder.get_weights()[4]
blob = np.array([])
weight_in = weight_in.transpose()
#tst = autoencoder.get_weights()
num_of_important_words = 20
from results_saver import LogWriter

log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, regularization))
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'g', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss {}'.format(
    dataset_helper.get_dataset_name()))
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig(log_writer.get_plot_path(dataset_helper.get_dataset_name(),
                                     "loss"))
plt.clf()
"""topic_words_in = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_in]
topic_words_out = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_out]
Example #9
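# The loop below (entered mid-call) trains the autoencoder on one batch-sized slice of
# `matrix` at a time, then re-normalizes the encoder (weights[0]) and decoder
# (weights[2]) weight matrices with the chosen norm after every step.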
            matrix[b_ind * batch_size:(b_ind + 1) * batch_size])
        #autoencoder.fit(matrix,matrix,batch_size=32,epochs=1,validation_split=0.1, verbose=2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0, patience=50, verbose=0, mode='auto', baseline=None, restore_best_weights=False)])
        weights = autoencoder.get_weights()
        weights[0] = normalize(weights[0], norm_to_use, 0)
        weights[2] = normalize(weights[2], norm_to_use, 1)
        autoencoder.set_weights(weights)
weight_in = autoencoder.get_weights()[0]
weight_out = autoencoder.get_weights()[2]
#tst = autoencoder.get_weights()
blob = np.array([])

weight_in = weight_in.transpose()
#combined_weight = np.dot(weight_in.transpose(), weight_out)
num_of_important_words = 20

log_writer = LogWriter(log_file_desc='{}{}'.format(test_name, regularization))

log_writer.write_any('model', autoencoder.to_json(), 'w+', True)
"""topic_words_in = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_in]
topic_words_out = [sorted(topic_words,key=lambda x: x[1],reverse=True) for topic_words in topic_words_out]
log_writer = LogWriter(log_file_desc='LDATestsRegularize{}'.format(regularization))
log_writer.write_2D_list('topic_words_in', topic_words_in)
log_writer.write_2D_list('topic_words_out', topic_words_out)"""

topic_words_in_max = get_extremes(weight_in, num_of_topics,
                                  num_of_important_words, reverse_word_map,
                                  True, 'topic_words_in_max', log_writer,
                                  dataset_helper.get_dataset_name())
topic_words_in_min = get_extremes(weight_in, num_of_topics,
                                  num_of_important_words, reverse_word_map,
                                  False, 'topic_words_in_min', log_writer,
Example #10
        'dropouts': hp.randint('dropouts', 3),
        'dropout_values': hp.uniform('dropout_values', 0.01, 0.2),
        'epochs': 20,  #hp.randint('epochs',20),
        'batch_size': batch_size,
        'learning_rate': hp.choice('learning_rate', [0.001, 0.01, 0.0005]),
        'optimizer': hp.choice('optimizer', ['adam', 'rmsprop']),
        'results_saver': results_saver
    }
    return space
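
# Hedged sketch (an assumption, not the project's actual optimize_model): the objective
# handed to fmin receives one point sampled from the space above and must return a dict
# with a 'loss' entry for hyperopt to minimise. A hypothetical stand-in:
from hyperopt import STATUS_OK

def optimize_model_sketch(params):
    # Build and train a network from `params` here; the dummy accuracy below only
    # illustrates the required return shape.
    val_acc = 0.0  # replace with the real validation accuracy
    params['results_saver'].add_log('tested params: {}'.format(params))
    return {'loss': -val_acc, 'status': STATUS_OK}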


file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)

datasets_helper = Dataset_Helper(False)
results_saver = LogWriter(log_file_desc="hyperopt-best-param-search")
results = []
datasets_helper.set_wanted_datasets([1])
models_to_test = ['lstm', 'dense', 'embedding', 'bidi']
"""datasets_helper.next_dataset()
space = create_base_params('lstm',datasets_helper)
smpl = sample(space)
print(sample(space))"""
for model in models_to_test:
    while datasets_helper.next_dataset():
        space = create_base_params(model, datasets_helper, results_saver)
        best = fmin(optimize_model,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=30,
                    max_queue_len=1,
Example #11
        return

    for item in all_vars[depth]:
        f = [a for a in field]
        f.append(item)
        create_variations(depth + 1, f, all_vars, possibilities)
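
# Hedged reconstruction: create_variations is shown above without its first lines; the
# standalone sketch below (with an assumed base case) only illustrates the recursion
# and is not the project's definition.
def create_variations_sketch(depth, field, all_vars, possibilities):
    # Once every variable has been assigned a value, record the finished combination.
    if depth == len(all_vars):
        possibilities.append(field)
        return
    # Otherwise branch on every value of the current variable and recurse.
    for item in all_vars[depth]:
        create_variations_sketch(depth + 1, field + [item], all_vars, possibilities)

# Example: two boolean switches yield all four on/off combinations.
# possibilities = []; create_variations_sketch(0, [], [[True, False], [True, False]], possibilities)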


def get_time_in_millis():
    """
    :return: system time in milliseconds
    """
    return int(round(time.time() * 1000))  # multiply before rounding so millisecond precision is kept


log_writer = LogWriter("log.txt")
base_path = os.getcwd()
csv_folder = base_path + "\\csv_folder\\"
data_sets = [
    (csv_folder + "4" + "\\train.csv", csv_folder + "4" + "\\test.csv", 20,
     "-20newsgroups-"),
    (csv_folder + "1" + "\\train.csv", csv_folder + "1" + "\\test.csv", 10,
     "-reuters-")
]  #,(csv_folder+"2"+"\\train.csv",csv_folder+"2"+"\\test.csv",14)]
#data_sets = [(csv_folder+"2"+"\\train.csv",csv_folder+"2"+"\\test.csv",14)]

strip_nums_params = use_stemmer_params = use_lemmatizer_params = strip_short_params = [
    True, False
]
preproces_all_vals = [
    strip_nums_params, use_stemmer_params, use_lemmatizer_params,