import pickle

import numpy as np
from sklearn.metrics import classification_report

import setup_model


def evaluate_test_set():
    """Run the trained model on the held-out test set and print a classification report."""
    if MODEL_TYPE == "names":
        test_dataset, _ = setup_model.create_names_dataset(DATA_FILE, True)
    elif MODEL_TYPE == "comments":
        test_dataset, _ = setup_model.create_comments_dataset(DATA_FILE, True)
    elif MODEL_TYPE == "nc":
        test_dataset, _ = setup_model.create_nc_dataset(DATA_FILE, True)
    ### TODO: twin model tests
    test_xs, test_ys = setup_model.prepare_data(test_dataset, lang_tokenizer,
                                                label_to_idx, USE_OTHER_TYPE, True)
    y_preds = model.predict(test_xs)
    # Collapse per-class probabilities into the index of the most likely class.
    y_preds_bool = np.argmax(y_preds, axis=1)
    print(classification_report(test_ys, y_preds_bool))
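## Illustrative note (an assumption, not part of the original script):
## evaluate_test_set() relies on the module-level globals built by the code
## below (model, lang_tokenizer, label_to_idx, ...), so it is expected to be
## called only after training has produced them. The argmax step collapses
## each row of per-class probabilities returned by model.predict into a
## single predicted class index, e.g.:
##
##   y_preds = np.array([[0.1, 0.7, 0.2],
##                       [0.8, 0.1, 0.1]])
##   np.argmax(y_preds, axis=1)  # -> array([1, 0])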
## Should be "names", "comments" or "nc" (which is a combination of both).
## Gives the type of input to train the network on.
input_type = "nc"
DATA_SIZE = 100000

if USE_OTHER_TYPE:
    other_tag = "_OTHER_"
else:
    other_tag = ""

#dataset, prog_type_dict = setup_model.create_nc_dataset(DATA_FILE)
if input_type == "names":
    dataset, prog_type_dict = setup_model.create_names_dataset(DATA_FILE)
elif input_type == "comments":
    dataset, prog_type_dict = setup_model.create_comments_dataset(DATA_FILE)
elif input_type == "nc":
    dataset, prog_type_dict = setup_model.create_nc_dataset(DATA_FILE, DELIMITER)
else:
    raise Exception("Got unexpected input_type of {}".format(input_type))

lang_tokenizer = setup_model.create_tokenizer(dataset)
# The vocabulary size is the largest token index assigned by the tokenizer.
vocab_size = max(lang_tokenizer.index_word.keys())

## SAVE TOKENIZER
with open('tokenizers/names_tokenizer.pickle', 'wb') as handle:
    pickle.dump(lang_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

## LOAD TOKENIZER
#with open('tokenizers/names_tokenizer.pickle', 'rb') as handle:
#    lang_tokenizer = pickle.load(handle)

label_to_idx, idx_to_label = setup_model.create_labels(dataset, prog_type_dict,