import json
import os
import sys
import time
import zipfile

import numpy as np
import tensorflow as tf

# Project-internal names used below (logger, prep, SAVE_DIR, check_all_unpacked,
# json_to_array, get_used_words, wordvec_to_dict, generate_dictionary_ids,
# generate_embedding_matrix, input_data_to_matrices, the PRECOMPUTED_* paths,
# unpacked_dataset_path, unpacked_glove_path) are assumed to be provided by the
# surrounding package.


def load_test_matrices():
    if not os.path.exists(prep.PRECOMPUTED_TEST_HYPOTHESES_PATH) \
            or not os.path.exists(prep.PRECOMPUTED_TEST_PREMISES_PATH) \
            or not os.path.exists(prep.PRECOMPUTED_TEST_LABELS_PATH):
        logger.error(
            "Testing matrices not found, please run preprocessor first.")
        sys.exit(1)
    # Premises and hypotheses are stored as padded token-id matrices; labels
    # are stored as integer class ids and converted to one-hot vectors here.
    premise = load_padded_matrix(prep.PRECOMPUTED_TEST_PREMISES_PATH)
    hypothesis = load_padded_matrix(prep.PRECOMPUTED_TEST_HYPOTHESES_PATH)
    with open(prep.PRECOMPUTED_TEST_LABELS_PATH, 'r') as labels_file:
        label_ids = json.load(labels_file)
    labels = labels_to_onehot(label_ids)
    return premise, hypothesis, labels
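# Neither helper used above is shown in this excerpt. Minimal sketches of the
# assumed behavior (not the project's actual implementations):
def load_padded_matrix(path):
    # The preprocessor stores each sentence as a padded list of token ids,
    # so the file is assumed to hold a rectangular list of lists.
    with open(path, 'r') as matrix_file:
        return np.array(json.load(matrix_file), np.int32)


def labels_to_onehot(label_ids, num_classes=3):
    # SNLI has three classes: entailment / neutral / contradiction.
    onehot = np.zeros((len(label_ids), num_classes), np.float32)
    onehot[np.arange(len(label_ids)), label_ids] = 1.0
    return onehot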
def unpack(archive_name, label=""):
    if is_unpacked(archive_name):
        logger.info(label.capitalize() + " already unpacked.")
        return
    if not os.path.exists(SAVE_DIR + "/" + archive_name + ".zip"):
        logger.error("No " + label + " zipfile to unpack")
        return
    logger.info("Unpacking " + label)
    os.makedirs(SAVE_DIR + "/" + archive_name)
    with zipfile.ZipFile(SAVE_DIR + "/" + archive_name + ".zip", "r") as file:
        file.extractall(SAVE_DIR + "/" + archive_name)
    logger.success("Unpacking complete.")
    os.remove(SAVE_DIR + "/" + archive_name + ".zip")
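# is_unpacked is not shown in this excerpt; since unpack() creates the target
# directory and deletes the zip afterwards, it is assumed to simply test for
# that directory. A minimal sketch:
def is_unpacked(archive_name):
    return os.path.isdir(SAVE_DIR + "/" + archive_name)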
def load_embedding_matrix():
    if not os.path.exists(prep.PRECOMPUTED_EMB_MATRIX_PATH):
        logger.error(
            "Embedding matrix not found, please run preprocessor first.")
        sys.exit(1)
    with open(prep.PRECOMPUTED_EMB_MATRIX_PATH, 'r') as matrix_file:
        matrix = json.load(matrix_file)
    np_matrix = np.array(matrix, np.float32)
    # Whether to keep this trainable or not is up to discussion.
    tf_matrix = tf.get_variable(name="tf_matrix",
                                shape=np_matrix.shape,
                                initializer=tf.constant_initializer(np_matrix),
                                trainable=False)
    return tf_matrix
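# Example use of the returned variable in a TF1-style graph: padded token-id
# matrices from the preprocessor are looked up against the embedding table.
# The placeholder shape is an assumption, matching [batch, seq_len] inputs.
def embed_tokens():
    token_ids = tf.placeholder(tf.int32, shape=[None, None], name="token_ids")
    embedding_matrix = load_embedding_matrix()
    # Yields float32 vectors of shape [batch, seq_len, 300] for GloVe 300d.
    return tf.nn.embedding_lookup(embedding_matrix, token_ids)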
def run(force_recompute=True):
    logger.header("Running preprocessor module.")
    if not check_all_unpacked():
        logger.error(
            "Unpacked datasets or word vectors are missing. "
            "Please run downloader prior to preprocessor.")
        return

    time_start = time.time()
    logger.info("Loading datasets into memory")
    try:
        train_dataset = json_to_array(unpacked_dataset_path() +
                                      "/snli_1.0_train.jsonl")
        test_dataset = json_to_array(unpacked_dataset_path() +
                                     "/snli_1.0_test.jsonl")
    except FileNotFoundError as error:
        logger.error("File: " + error.filename + " not found")
        return
    time_end = time.time()
    logger.success("Datasets loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    embeddings_changed = False
    time_start = time.time()
    if os.path.exists(PRECOMPUTED_GLOVE_PATH) and not force_recompute:
        logger.info("Precomputed word vectors found, loading into memory.")
        with open(PRECOMPUTED_GLOVE_PATH, 'r') as infile:
            word_vectors = json.load(infile)
    else:
        logger.info("Loading word vectors into memory")
        # Get the set of words used in the datasets, so we don't store
        # useless word vectors.
        vocabulary = set()
        vocabulary = get_used_words(train_dataset, vocabulary)
        vocabulary = get_used_words(test_dataset, vocabulary)
        # Load the needed part of the word vectors. Might incur large
        # memory costs.
        try:
            word_vectors = wordvec_to_dict(
                unpacked_glove_path() + "/glove.42B.300d.txt", vocabulary)
        except FileNotFoundError as error:
            logger.error("File: " + error.filename + " not found")
            return
        logger.info("Storing loaded vectors for future use.", level=2)
        with open(PRECOMPUTED_GLOVE_PATH, 'w') as outfile:
            json.dump(word_vectors, outfile)
        embeddings_changed = True
    time_end = time.time()
    logger.success("Word vectors loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    id_mapping = generate_dictionary_ids(word_vectors)
    if not os.path.exists(PRECOMPUTED_EMB_MATRIX_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Generating initial embedding matrix.")
        embedding_matrix = generate_embedding_matrix(word_vectors, id_mapping)
        logger.info("Storing embedding matrix for future use.", level=2)
        with open(PRECOMPUTED_EMB_MATRIX_PATH, 'w') as outfile:
            json.dump(embedding_matrix.tolist(), outfile)
        logger.success("Embedding matrix created.")
    else:
        logger.info("Embedding matrix found, skipping its computation.")

    label_dict = {}
    if not os.path.exists(PRECOMPUTED_TRAIN_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating train matrix and labels")
        train_premise_matrix, train_hypothesis_matrix, train_labels = \
            input_data_to_matrices(train_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TRAIN_PREMISES_PATH, 'w') as outfile:
            json.dump(train_premise_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(train_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_LABELS_PATH, 'w') as outfile:
            json.dump(train_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Train matrix found, skipping its computation.")

    if not os.path.exists(PRECOMPUTED_TEST_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating test matrix and labels")
        test_premise_matrix, test_hypothesis_matrix, test_labels = \
            input_data_to_matrices(test_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TEST_PREMISES_PATH, 'w') as outfile:
            json.dump(test_premise_matrix, outfile)
        with open(PRECOMPUTED_TEST_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(test_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TEST_LABELS_PATH, 'w') as outfile:
            json.dump(test_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Test matrix found, skipping its computation.")