Example #1
def wordvec_to_dict(filename, word_filter):
    vec_dict = {}

    # progress bar overhead
    bar = logger.get_progress_bar("Reading GloVe vectors", level=2, limit=20)
    total_size = os.path.getsize(filename)
    found_counter = 0
    size_counter = 0
    last_milestone = 0
    with open(filename, "r", encoding="utf8") as file:
        for raw_line in file:
            # progress bar overhead
            size_counter += len(raw_line) + 1
            if int((size_counter / total_size) * 20) > last_milestone:
                bar.next()
                last_milestone += 1

            # process line
            line = raw_line.split()
            if line[0] not in word_filter:
                continue
            found_counter += 1
            num_line = [float(x) for x in line[1:]]
            vec_dict[line[0]] = num_line

    bar.finish()
    # Observation: most of the unmatched words are typos, compounds, or very uncommon words.
    logger.info("Found vectors for " + str(found_counter) + " words out of " +
                str(len(word_filter)) + "." + " Elapsed time: " +
                str(bar.elapsed) + " s",
                level=2)
    return vec_dict
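
A minimal usage sketch (the path and vocabulary below are placeholders, not taken from the original project): wordvec_to_dict keeps only the vectors whose word appears in the filter set.

# Hypothetical usage; the path and vocabulary are placeholders.
vocabulary = {"cat", "sits", "on", "the", "mat"}
vectors = wordvec_to_dict("data/glove/glove.42B.300d.txt", vocabulary)
print(len(vectors))                  # how many filter words had a GloVe entry
print(len(vectors.get("cat", [])))   # 300 dimensions for the 42B.300d vectors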
Example #2
def download(url, archive_name, label=""):
    if os.path.exists(SAVE_DIR + "/" + archive_name +
                      ".zip") or is_unpacked(archive_name):
        logger.info(label.capitalize() + " already downloaded.")
        return
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    logger.info("Downloading the " + label + " archive from " + url)
    wget.download(url, SAVE_DIR + "/" + archive_name + ".zip")
    logger.success("Download completed.")
Example #3
def unpack(archive_name, label=""):
    if is_unpacked(archive_name):
        logger.info(label.capitalize() + " already unpacked.")
        return
    if not os.path.exists(SAVE_DIR + "/" + archive_name + ".zip"):
        logger.error("No " + label + " zipfile to unpack")
        return

    logger.info("Unpacking " + label)
    os.makedirs(SAVE_DIR + "/" + archive_name)
    with zipfile.ZipFile(SAVE_DIR + "/" + archive_name + ".zip", "r") as file:
        file.extractall(SAVE_DIR + "/" + archive_name)
    logger.success("Unpacking complete.")
    os.remove(SAVE_DIR + "/" + archive_name + ".zip")
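
The two helpers above are meant to be chained: download fetches the zip into SAVE_DIR unless it is already present or unpacked, and unpack extracts it and then deletes the archive. A minimal sketch with placeholder values (the URL and archive name are hypothetical; SAVE_DIR and is_unpacked come from the surrounding module):

GLOVE_URL = "https://example.org/glove.42B.300d.zip"  # placeholder URL
download(GLOVE_URL, "glove", label="word vectors")
unpack("glove", label="word vectors")
# Afterwards SAVE_DIR + "/glove" holds the extracted files and the zip is removed.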
Example #4
def input_data_to_matrices(dataset, word_id_mapping, label_dict):
    premise_matrix = []
    hypothesis_matrix = []
    label_counter = len(label_dict)
    labels = []

    # sentence1 holds the premise, sentence2 the hypothesis
    for sentence_pair in dataset:
        label = sentence_pair["gold_label"]
        if label == '-':
            continue
        if label not in label_dict:
            label_dict[label] = label_counter
            label_counter += 1

        premise_row = []
        hypothesis_row = []
        scrap = False
        for item in sentence_to_words(sentence_pair["sentence1"]):
            if item in word_id_mapping:
                premise_row.append(word_id_mapping[item])
            else:
                scrap = True
                break
        for item in sentence_to_words(sentence_pair["sentence2"]):
            if item in word_id_mapping:
                hypothesis_row.append(word_id_mapping[item])
            else:
                scrap = True
                break
        # skip the pair if either sentence contains an unknown word
        if scrap:
            continue
        premise_matrix.append(premise_row)
        hypothesis_matrix.append(hypothesis_row)
        labels.append(label_dict[label])

    logger.info("Number of distinct labels: " + str(label_counter), level=2)
    logger.info("Length of cleaned dataset: " + str(len(labels)), level=2)
    return premise_matrix, hypothesis_matrix, labels
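
A toy sketch of the expected shapes, assuming sentence_to_words lowercases and splits on whitespace (an assumption; that helper is not shown here). Pairs labeled '-' are skipped, and label_dict is filled in place:

# Hypothetical toy input; real data comes from the SNLI .jsonl files.
word_id_mapping = {"a": 0, "dog": 1, "runs": 2, "sleeps": 3}
dataset = [
    {"gold_label": "entailment", "sentence1": "A dog runs", "sentence2": "A dog"},
    {"gold_label": "-", "sentence1": "A dog runs", "sentence2": "A dog sleeps"},
]
label_dict = {}
premises, hypotheses, labels = input_data_to_matrices(dataset, word_id_mapping, label_dict)
# premises   -> [[0, 1, 2]]   (the '-' pair is dropped)
# hypotheses -> [[0, 1]]
# labels     -> [0]
# label_dict -> {"entailment": 0}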
Example #5
def run(force_recompute=True):
    logger.header("Running preprocessor module.")

    if not check_all_unpacked():
        logger.error(
            "Unpacked datasets or word vectors are missing. Please run the downloader prior to the preprocessor."
        )
        return

    time_start = time.time()
    logger.info("Loading datasets into memory")
    try:
        train_dataset = json_to_array(unpacked_dataset_path() +
                                      "/snli_1.0_train.jsonl")
        test_dataset = json_to_array(unpacked_dataset_path() +
                                     "/snli_1.0_test.jsonl")
    except FileNotFoundError as error:
        logger.error("File: " + error.filename + " not found")
        return
    time_end = time.time()
    logger.success("Datasets loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    embeddings_changed = False
    time_start = time.time()
    if os.path.exists(PRECOMPUTED_GLOVE_PATH) and not force_recompute:
        logger.info("Precomputed word vectors found, loading into memory.")
        with open(PRECOMPUTED_GLOVE_PATH, 'r') as infile:
            word_vectors = json.load(infile)
    else:
        logger.info("Loading word vectors into memory")
        # Get a set of words used in datasets, so we don't store useless word vectors.
        vocabulary = set()
        vocabulary = get_used_words(train_dataset, vocabulary)
        vocabulary = get_used_words(test_dataset, vocabulary)
        # Load needed part of word vectors. Might induce large memory costs.
        try:
            word_vectors = wordvec_to_dict(
                unpacked_glove_path() + "/glove.42B.300d.txt", vocabulary)
        except FileNotFoundError as error:
            logger.error("File: " + error.filename + " not found")
            return
        logger.info("Storing loaded vectors for future use.", level=2)
        with open(PRECOMPUTED_GLOVE_PATH, 'w') as outfile:
            json.dump(word_vectors, outfile)
        embeddings_changed = True
    time_end = time.time()
    logger.success("Word vectors loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    id_mapping = generate_dictionary_ids(word_vectors)
    if not os.path.exists(PRECOMPUTED_EMB_MATRIX_PATH
                          ) or force_recompute or embeddings_changed:
        logger.info("Generating initial embedding matrix.")
        embedding_matrix = generate_embedding_matrix(word_vectors, id_mapping)
        logger.info("Storing embedding matrix for future use.", level=2)
        with open(PRECOMPUTED_EMB_MATRIX_PATH, 'w') as outfile:
            json.dump(embedding_matrix.tolist(), outfile)
        logger.success("Embedding matrix created.")
    else:
        logger.info("Embedding matrix found, skipping its computation.")

    label_dict = {}
    if not os.path.exists(PRECOMPUTED_TRAIN_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating train matrix and labels")
        train_premise_matrix, train_hypothesis_matrix, train_labels = input_data_to_matrices(
            train_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TRAIN_PREMISES_PATH, 'w') as outfile:
            json.dump(train_premise_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(train_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_LABELS_PATH, 'w') as outfile:
            json.dump(train_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Train matrix found, skipping its computation.")

    if not os.path.exists(PRECOMPUTED_TEST_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating test matrix and labels")
        test_premise_matrix, test_hypothesis_matrix, test_labels = input_data_to_matrices(
            test_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TEST_PREMISES_PATH, 'w') as outfile:
            json.dump(test_premise_matrix, outfile)
        with open(PRECOMPUTED_TEST_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(test_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TEST_LABELS_PATH, 'w') as outfile:
            json.dump(test_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Test matrix found, skipping its computation.")
Example #6
def run():
    logger.header("Running trainer module.")

    logger.info("Loading embedding matrix into tensorflow model.")
    embedding_matrix = load_embedding_matrix()
    logger.success("Matrix loaded.")

    logger.info("Loading training data matrices.")
    train_premise_matrix, train_hypothesis_matrix, train_labels = load_train_matrices()
    test_premise_matrix, test_hypothesis_matrix, test_labels = load_test_matrices()
    logger.success("Matrices loaded.")

    logger.info("Building Tensorflow model.")

    # Placeholders for the feed dict
    premise_ph = tf.placeholder(tf.int32, [None, None])
    hypothesis_ph = tf.placeholder(tf.int32, [None, None])
    labels_ph = tf.placeholder(tf.float32, [None, train_labels.shape[1]])
    keep_rate_ph = tf.placeholder(tf.float32)

    # Model is given as optimizer minimize operation
    model, loss, error = build_model(premise_ph, hypothesis_ph, labels_ph,
                                     embedding_matrix, keep_rate_ph)

    # create batch producers for both training and testing
    num_batches = min(BATCH_CEILING, train_labels.shape[0] // BATCH_SIZE)
    num_test_batches = min(BATCH_CEILING, test_labels.shape[0] // BATCH_SIZE)
    train_batch_queue = tf.train.range_input_producer(limit=num_batches,
                                                      shuffle=True)
    test_batch_queue = tf.train.range_input_producer(limit=num_test_batches,
                                                     shuffle=False)
    premise_tf, hypothesis_tf, label_tf = produce_batch(
        train_premise_matrix, train_hypothesis_matrix, train_labels,
        train_batch_queue)
    premise_ts, hypothesis_ts, label_ts = produce_batch(
        test_premise_matrix, test_hypothesis_matrix, test_labels,
        test_batch_queue)
    logger.success("Model built. Number of variables: " +
                   str(get_model_variable_count()))

    logger.info("Running Tensorflow session. Good luck.")
    with tf.Session() as session:
        # The range_input_producer queues are filled by queue runner threads;
        # without starting them, session.run on the batch tensors blocks forever.
        input_coord = tf.train.Coordinator()
        input_threads = tf.train.start_queue_runners(session,
                                                     coord=input_coord)

        session.run(tf.global_variables_initializer())

        train_stats = []
        test_stats = []
        for epoch in range(1, EPOCH_COUNT + 1):
            logger.info("Epoch " + str(epoch) + " startup...", level=2)

            # Run training on all batches (optimizer on)
            sum_loss = 0
            sum_err = 0
            for batch in range(1, num_batches + 1):
                premise_batch, hypothesis_batch, labels_batch = session.run(
                    [premise_tf, hypothesis_tf, label_tf])
                _, curr_loss, curr_err = session.run(
                    [model, loss, error], {
                        premise_ph: premise_batch,
                        hypothesis_ph: hypothesis_batch,
                        labels_ph: labels_batch,
                        keep_rate_ph: DROPOUT_RATE
                    })
                sum_loss += curr_loss
                sum_err += curr_err
                if batch % 100 == 0:
                    logger.info("Batch " + str(batch) + ", loss: " +
                                str(sum_loss / batch) + "    acc.: " + str(
                                    (1 - sum_err / batch) * 100),
                                level=3)
            train_stats.append(
                [sum_loss / num_batches, (1 - sum_err / num_batches) * 100])

            # Run testing on all batches (optimizer off)
            test_loss = 0
            test_err = 0
            for test_batch in range(1, num_test_batches + 1):
                premise_batch_t, hypothesis_batch_t, labels_batch_t = session.run(
                    [premise_ts, hypothesis_ts, label_ts])
                curr_loss, curr_err = session.run(
                    [loss, error], {
                        premise_ph: premise_batch_t,
                        hypothesis_ph: hypothesis_batch_t,
                        labels_ph: labels_batch_t,
                        keep_rate_ph: 1.0
                    })
                test_loss += curr_loss
                test_err += curr_err
            test_loss /= num_test_batches
            test_err /= num_test_batches
            logger.info("Epoch " + str(epoch) + " done. Test loss: " +
                        str(test_loss) + "    Test acc: " + str(
                            (1 - test_err) * 100),
                        level=2)
            test_stats.append([test_loss, (1 - test_err) * 100])
        print(train_stats)
        print(test_stats)

        input_coord.request_stop()
        input_coord.join(input_threads)
    logger.success("Session run complete")