## Example #1
import pickle

import numpy as np
from sklearn.metrics import classification_report

import setup_model


def evaluate_test_set():
    ## Evaluate the trained global `model` on a freshly built test split.
    if MODEL_TYPE == "names":
        test_dataset, _ = setup_model.create_names_dataset(DATA_FILE, True)
    elif MODEL_TYPE == "comments":
        test_dataset, _ = setup_model.create_comments_dataset(DATA_FILE, True)
    elif MODEL_TYPE == "nc":
        test_dataset, _ = setup_model.create_nc_dataset(DATA_FILE, True)
        ### TODO: twin model tests
    else:
        raise ValueError("Got unexpected MODEL_TYPE of {}".format(MODEL_TYPE))

    test_xs, test_ys = setup_model.prepare_data(test_dataset, lang_tokenizer,
                                                label_to_idx, USE_OTHER_TYPE, True)
    y_preds = model.predict(test_xs)
    ## argmax turns each row of class probabilities into a predicted label index.
    y_pred_labels = np.argmax(y_preds, axis=1)
    print(classification_report(test_ys, y_pred_labels))
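
## Minimal illustration (not part of the original pipeline) of why argmax is
## applied before classification_report: predict() is assumed to return one
## row of class probabilities per sample, and argmax collapses each row to
## the index of its most likely class, matching the integer test labels.
_demo_probs = np.array([[0.1, 0.7, 0.2],
                        [0.8, 0.1, 0.1]])
assert list(np.argmax(_demo_probs, axis=1)) == [1, 0]
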
## Type of input to train the network on. Must be "names", "comments",
## or "nc" (a combination of both).
input_type = "nc"

DATA_SIZE = 100000

## DATA_FILE, DELIMITER, MODEL_TYPE and USE_OTHER_TYPE are assumed to be
## defined in earlier notebook cells; placeholder values are set here so
## the snippet runs standalone.
DATA_FILE = "data/dataset.csv"   # hypothetical path
DELIMITER = " "                  # hypothetical delimiter for "nc" inputs
MODEL_TYPE = input_type
USE_OTHER_TYPE = False

other_tag = "_OTHER_" if USE_OTHER_TYPE else ""

#dataset, prog_type_dict = setup_model.create_nc_dataset(DATA_FILE)
if input_type == "names":
    dataset, prog_type_dict = setup_model.create_names_dataset(DATA_FILE)
elif input_type == "comments":
    dataset, prog_type_dict = setup_model.create_comments_dataset(DATA_FILE)
elif input_type == "nc":
    dataset, prog_type_dict = setup_model.create_nc_dataset(
        DATA_FILE, DELIMITER)
else:
    raise ValueError("Got unexpected input_type of {}".format(input_type))
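
## Design note: the if/elif chain above can also be written as a lookup
## table, keeping the valid input types in one place. A sketch using the
## same setup_model factories (commented out, illustrative only):
# _DATASET_FACTORIES = {
#     "names": setup_model.create_names_dataset,
#     "comments": setup_model.create_comments_dataset,
#     "nc": lambda f: setup_model.create_nc_dataset(f, DELIMITER),
# }
# dataset, prog_type_dict = _DATASET_FACTORIES[input_type](DATA_FILE)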

lang_tokenizer = setup_model.create_tokenizer(dataset)
## For Keras-style tokenizers, whose indices start at 1, the highest token
## index equals the vocabulary size.
vocab_size = max(lang_tokenizer.index_word.keys())
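
## Sanity-check sketch, assuming create_tokenizer returns a
## tf.keras.preprocessing.text.Tokenizer: index_word maps 1-based indices
## back to tokens, so its largest key equals len(word_index). An embedding
## layer would need vocab_size + 1 rows, since index 0 is reserved for
## padding. (Commented out, illustrative only.)
# from tensorflow.keras.preprocessing.text import Tokenizer
# _t = Tokenizer()
# _t.fit_on_texts(["get name", "set name"])
# assert max(_t.index_word.keys()) == len(_t.word_index)
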
## SAVE TOKENIZER
## NOTE: the filename is hardcoded to "names" even when input_type is
## "comments" or "nc".
with open('tokenizers/names_tokenizer.pickle', 'wb') as handle:
    pickle.dump(lang_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
## LOAD TOKENIZER
#with open('tokenizers/names_tokenizer.pickle', 'rb') as handle:
#    lang_tokenizer = pickle.load(handle)
label_to_idx, idx_to_label = setup_model.create_labels(dataset, prog_type_dict,