Example #1
def load_test_matrices():
    if not os.path.exists(prep.PRECOMPUTED_TEST_HYPOTHESES_PATH) \
            or not os.path.exists(prep.PRECOMPUTED_TEST_PREMISES_PATH) \
            or not os.path.exists(prep.PRECOMPUTED_TEST_LABELS_PATH):
        logger.error(
            "Testing matrices not found, please run preprocessor first.")
        sys.exit(1)
    premise = load_padded_matrix(prep.PRECOMPUTED_TEST_PREMISES_PATH)
    hypothesis = load_padded_matrix(prep.PRECOMPUTED_TEST_HYPOTHESES_PATH)
    with open(prep.PRECOMPUTED_TEST_LABELS_PATH, 'r') as labels_file:
        label_ids = json.load(labels_file)
    labels = labels_to_onehot(label_ids)
    return premise, hypothesis, labels
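The helpers load_padded_matrix and labels_to_onehot are not shown in this example. A minimal sketch of what they might look like, assuming the matrices are stored as JSON lists of variable-length token-ID sequences and labels use SNLI's three classes:

import json

import numpy as np


def load_padded_matrix(path):
    # Hypothetical: assumes the file holds a JSON list of
    # variable-length token-ID sequences.
    with open(path, 'r') as infile:
        rows = json.load(infile)
    max_len = max(len(row) for row in rows)
    # Zero-pad every row to the length of the longest sequence.
    return np.array([row + [0] * (max_len - len(row)) for row in rows],
                    np.int32)


def labels_to_onehot(label_ids, num_classes=3):
    # Hypothetical: SNLI's three classes are entailment, neutral,
    # and contradiction.
    onehot = np.zeros((len(label_ids), num_classes), np.float32)
    onehot[np.arange(len(label_ids)), label_ids] = 1.0
    return onehot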
Example #2
def unpack(archive_name, label=""):
    if is_unpacked(archive_name):
        logger.info(label.capitalize() + " already unpacked.")
        return
    archive_path = os.path.join(SAVE_DIR, archive_name + ".zip")
    target_dir = os.path.join(SAVE_DIR, archive_name)
    if not os.path.exists(archive_path):
        logger.error("No " + label + " zipfile to unpack")
        return

    logger.info("Unpacking " + label)
    os.makedirs(target_dir)
    with zipfile.ZipFile(archive_path, "r") as zip_file:
        zip_file.extractall(target_dir)
    logger.success("Unpacking complete.")
    os.remove(archive_path)
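The is_unpacked check is not defined in this example. One plausible sketch, assuming an archive counts as unpacked once its target directory exists:

import os


def is_unpacked(archive_name):
    # Hypothetical helper: treat the archive as unpacked once the
    # directory it extracts into exists.
    return os.path.isdir(os.path.join(SAVE_DIR, archive_name))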
Example #3
def load_embedding_matrix():
    if not os.path.exists(prep.PRECOMPUTED_EMB_MATRIX_PATH):
        logger.error(
            "Embedding matrix not found, please run preprocessor first.")
        sys.exit(1)
    with open(prep.PRECOMPUTED_EMB_MATRIX_PATH, 'r') as matrix_file:
        matrix = json.load(matrix_file)
    np_matrix = np.array(matrix, np.float32)

    # Kept non-trainable for now; whether to fine-tune the embeddings
    # is an open design question.
    tf_matrix = tf.get_variable(name="tf_matrix",
                                shape=np_matrix.shape,
                                initializer=tf.constant_initializer(np_matrix),
                                trainable=False)
    return tf_matrix
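tf.get_variable with a constant_initializer is TensorFlow 1.x API. For reference, a roughly equivalent non-trainable variable in TensorFlow 2.x can be built directly from the NumPy array (np_matrix as in the function above):

import tensorflow as tf

# TF 2.x sketch: tf.Variable accepts a NumPy array directly, so no
# constant_initializer is needed; trainable=False freezes the embeddings.
tf_matrix = tf.Variable(np_matrix, name="tf_matrix", trainable=False)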
Example #4
def run(force_recompute=True):
    logger.header("Running preprocessor module.")

    if not check_all_unpacked():
        logger.error(
            "Unpacked datasets or word vectors are missing. Please run downloader prior to preprocessor."
        )
        return

    time_start = time.time()
    logger.info("Loading datasets into memory")
    try:
        train_dataset = json_to_array(unpacked_dataset_path() +
                                      "/snli_1.0_train.jsonl")
        test_dataset = json_to_array(unpacked_dataset_path() +
                                     "/snli_1.0_test.jsonl")
    except FileNotFoundError as error:
        logger.error("File: " + error.filename + " not found")
        return
    time_end = time.time()
    logger.success("Datasets loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    embeddings_changed = False
    time_start = time.time()
    if os.path.exists(PRECOMPUTED_GLOVE_PATH) and not force_recompute:
        logger.info("Precomputed word vectors found, loading into memory.")
        with open(PRECOMPUTED_GLOVE_PATH, 'r') as infile:
            word_vectors = json.load(infile)
    else:
        logger.info("Loading word vectors into memory")
        # Get a set of words used in datasets, so we don't store useless word vectors.
        vocabulary = set()
        vocabulary = get_used_words(train_dataset, vocabulary)
        vocabulary = get_used_words(test_dataset, vocabulary)
        # Load needed part of word vectors. Might induce large memory costs.
        try:
            word_vectors = wordvec_to_dict(
                unpacked_glove_path() + "/glove.42B.300d.txt", vocabulary)
        except FileNotFoundError as error:
            logger.error("File: " + error.filename + " not found")
            return
        logger.info("Storing loaded vectors for future use.", level=2)
        with open(PRECOMPUTED_GLOVE_PATH, 'w') as outfile:
            json.dump(word_vectors, outfile)
        embeddings_changed = True
    time_end = time.time()
    logger.success("Word vectors loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    id_mapping = generate_dictionary_ids(word_vectors)
    if force_recompute or embeddings_changed \
            or not os.path.exists(PRECOMPUTED_EMB_MATRIX_PATH):
        logger.info("Generating initial embedding matrix.")
        embedding_matrix = generate_embedding_matrix(word_vectors, id_mapping)
        logger.info("Storing embedding matrix for future use.", level=2)
        with open(PRECOMPUTED_EMB_MATRIX_PATH, 'w') as outfile:
            json.dump(embedding_matrix.tolist(), outfile)
        logger.success("Embedding matrix created.")
    else:
        logger.info("Embedding matrix found, skipping its computation.")

    # Shared between train and test so both use the same label ids.
    label_dict = {}
    if not os.path.exists(PRECOMPUTED_TRAIN_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating train matrix and labels")
        train_premise_matrix, train_hypothesis_matrix, train_labels = input_data_to_matrices(
            train_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TRAIN_PREMISES_PATH, 'w') as outfile:
            json.dump(train_premise_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(train_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_LABELS_PATH, 'w') as outfile:
            json.dump(train_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Train matrix found, skipping its computation.")

    if not os.path.exists(PRECOMPUTED_TEST_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating test matrix and labels")
        test_premise_matrix, test_hypothesis_matrix, test_labels = input_data_to_matrices(
            test_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TEST_PREMISES_PATH, 'w') as outfile:
            json.dump(test_premise_matrix, outfile)
        with open(PRECOMPUTED_TEST_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(test_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TEST_LABELS_PATH, 'w') as outfile:
            json.dump(test_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Test matrix found, skipping its computation.")