Example 1
def create_corpus_file():
    model = load_google_w2v_model()

    # Every TREC line starts with "LABEL:sublabel question ..."; keep only the
    # question text and group it under its coarse label.
    with open(TREC_TEXT_FILE_PATH, 'r', encoding='utf-8',
              errors='ignore') as file:
        for line in file:
            label = line[:line.find(":")]
            if label in LABELS:
                line = line[line.find(" ") + 1:]
                LABELS[label].append(line)

    labels = []
    create_file_and_folders_if_not_exist(WORD_VECTORS_DIRECTORY)

    keys = list(LABELS.keys())
    counter = 0
    sublist_counter = 0
    document_available = True

    with open(CORPUS_FILE_NAME, 'w+') as corpus_file:
        # Walk the label lists round-robin so documents from different classes
        # alternate in the corpus file and in the saved batch files.
        while document_available:
            document_available = False
            for key in keys:
                if sublist_counter < len(LABELS[key]):
                    document_available = True

                    document = LABELS[key][sublist_counter]

                    corpus_file.write(document)

                    batch_file_name = get_batch_file_name_for_dataset(
                        counter, DATA_SET_TREC)
                    create_file_and_folders_if_not_exist(batch_file_name)

                    word_vector = document_to_batch(
                        document, model, DATA_SET_TREC['max_time_steps'])

                    np.save(batch_file_name, word_vector)
                    counter += 1

                    labels.append(_generate_label_vector(keys.index(key)))
            sublist_counter += 1

        create_file_and_folders_if_not_exist(LABELS_FILE_PATH)
        np.save(LABELS_FILE_PATH, labels)
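
The example relies on two helpers that are not shown on this page. As a rough sketch of what they plausibly do (the signatures, the 300-dimensional vectors, and the token handling are assumptions; LABELS is the module-level dict used above), document_to_batch turns one document into a fixed-size matrix of word vectors and _generate_label_vector one-hot encodes the class index:

import numpy as np

def document_to_batch(document, model, max_time_steps, vector_size=300):
    # Pad or truncate the document to exactly max_time_steps rows of word
    # vectors; tokens unknown to the model keep an all-zero row.
    tokens = document.split() if isinstance(document, str) else list(document)
    batch = np.zeros((max_time_steps, vector_size))
    for position, token in enumerate(tokens[:max_time_steps]):
        if token in model:
            batch[position] = model[token]
    return batch

def _generate_label_vector(class_index, num_classes=len(LABELS)):
    # One-hot vector with a 1 at the index of the document's class.
    label = np.zeros(num_classes)
    label[class_index] = 1.0
    return label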
Example 2
    def test_corpus_to_model(self):
        corpus, labels = create_corpus_and_labels()
        model = create_w2v_from_corpus(corpus)
        google_model = load_google_w2v_model()

        print(model.wv.similarity('old', 'yes'))
        print(model.wv.similarity('young', 'yes'))
        print(model.wv.similarity('man', 'woman'))

        print("woman - king + man = ...")

        google_most_similar = google_model.wv.most_similar(
            positive=['woman', 'king'], negative=['man'])
        print("google: ", google_most_similar)

        most_similar = model.wv.most_similar(positive=['woman', 'king'],
                                             negative=['man'])
        print("model: ", most_similar)
Example 3
    def test_google_model_word_occurrence_percentage(self):
        corpus, labels = create_corpus_and_labels()
        dictionary = _dictionary(corpus)

        google_model = load_google_w2v_model()
        model = create_w2v_from_corpus(corpus)

        counter = 0.0
        counter_2 = 0.0

        for word in dictionary.token2id.keys():
            if word in google_model:
                counter += 1
            if word in model.wv.vocab:
                counter_2 += 1

        print("[%] of words in google model",
              round((counter / len(dictionary.token2id.keys())) * 100, 2))
        print("[%] of words in generated model",
              round((counter_2 / len(dictionary.token2id.keys())) * 100, 2))
        print(len(dictionary))
        print(len(dictionary.token2id.keys()))
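
_dictionary appears to wrap gensim's Dictionary; a minimal sketch under that assumption (token2id is the mapping the loop above iterates over):

from gensim import corpora

def _dictionary(corpus):
    # Map every token in the corpus (an iterable of token lists) to an integer id.
    return corpora.Dictionary(corpus)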
Example 4
def review_your_review():
    print("Wait for the google w2v model to load...")
    w2v_model = load_google_w2v_model()
    net_model = keras.models.load_model(
        full_path(
            "lstm-net-backups/imdb_timestep150_drout0.4_rdrout0.4_batch64.h5"))
    print("done")

    while True:
        line = input(
            "Type in your review or \"quit\" to finish then press ENTER: ")

        if line == 'quit':
            break

        tokens_line = list(utils.tokenize(line, deacc=True, lower=True))
        document_review = list(
            filter(lambda x: x not in STOP_LIST, tokens_line))
        line_numeric = np.array(
            [document_to_batch(document_review, w2v_model, 150)])

        print('I think this review is: ' + evaluate(net_model, line_numeric))
    print('Good bye')
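
evaluate is not defined in this snippet either; one plausible reading, assuming the IMDB network ends in a single sigmoid (or two-way softmax) output, maps the prediction to a verdict string:

def evaluate(net_model, line_numeric):
    # Run the single-review batch through the network and read the last output
    # unit as the "positive" score.
    prediction = net_model.predict(line_numeric)[0]
    return 'positive' if prediction[-1] >= 0.5 else 'negative'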
Example 5
def ensure_word_numeric_representation_created():
    labels_file_name = get_vector_labels_file_name(DATA_SET['label'])

    try:
        np.load(get_batch_file_name(0))
    except IOError:
        logging.info("word vector files does not exist - creating...")
        corpus, labels = create_corpus_and_labels()

        w2v_model = load_google_w2v_model() if USE_GOOGLE_W2V else create_w2v_from_corpus(corpus)

        # each batch of time_steps word vectors corresponds to one document in the corpus
        time_steps = DATA_SET["time_steps"]

        for document_idx in range(len(corpus)):
            document_batch = document_to_batch(corpus[document_idx], w2v_model, time_steps)
            batch_file_name = get_batch_file_name(document_idx)
            create_file_and_folders_if_not_exist(batch_file_name)
            np.save(batch_file_name, document_batch)

        create_file_and_folders_if_not_exist(labels_file_name)

        np.save(labels_file_name, labels)
        logging.info("word vector files created")
Example 6
def corpus_to_vectors():
    corpus, labels = create_corpus_and_labels()

    model = create_w2v_from_corpus(corpus=corpus)
    google_model = load_google_w2v_model() if USE_GOOGLE_W2V else None

    dictionary = _dictionary(corpus)
    tfidf = _tfidf(corpus, dictionary)

    document_vectors = documents_to_vector_from_w2v(corpus, google_model,
                                                    model, tfidf)

    train_samples_count = round(
        BATCH_SIZE * (100 - TEST_DATA_PERCENTAGE) / 100, 0)

    x_train = []
    y_train = []
    x_test = []
    y_test = []

    # Within every BATCH_SIZE-long window, the first train_samples_count
    # documents (and their labels) go to the training set, the rest to the
    # test set.
    counter = 0
    for document_vector in document_vectors:
        if counter < train_samples_count:
            x_train.append([document_vector])
        else:
            x_test.append([document_vector])
        counter = (counter + 1) % BATCH_SIZE

    counter = 0
    for label in labels:
        if counter < train_samples_count:
            y_train.append(label)
        else:
            y_test.append(label)
        counter = (counter + 1) % BATCH_SIZE

    x_train = np.concatenate(tuple(x_train), axis=0)
    y_train = np.array(y_train)

    x_test = np.concatenate(tuple(x_test), axis=0)
    y_test = np.array(y_test)

    # Normalize: shift train and test by the same offset so no value stays negative.
    global_minimum = abs(
        x_train.min()) if abs(x_train.min()) > abs(x_test.min()) else abs(
            x_test.min())
    x_train += global_minimum
    x_test += global_minimum

    return (x_train, y_train), (x_test, y_test)
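
_tfidf is another helper the snippet leaves out; a minimal sketch with gensim, assuming it returns the fitted TF-IDF model built over the dictionary's bag-of-words representation:

from gensim import models

def _tfidf(corpus, dictionary):
    # Convert each document to a bag-of-words over the dictionary ids and fit
    # a TF-IDF model on it.
    bow_corpus = [dictionary.doc2bow(document) for document in corpus]
    return models.TfidfModel(bow_corpus)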