def test_skipgrams(self):
        # test with no window size and binary labels
        couples, labels = preprocessing_sequence.skipgrams(np.arange(3),
                                                           vocabulary_size=3)
        for couple in couples:
            self.assertIn(couple[0], [0, 1, 2])
            self.assertIn(couple[1], [0, 1, 2])

        # test window size and categorical labels
        couples, labels = preprocessing_sequence.skipgrams(np.arange(5),
                                                           vocabulary_size=5,
                                                           window_size=1,
                                                           categorical=True)
        for couple in couples:
            self.assertLessEqual(couple[0] - couple[1], 3)
        for l in labels:
            self.assertEqual(len(l), 2)
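For reference, a minimal sketch (assuming TensorFlow's bundled Keras preprocessing; the toy input is illustrative) of what skipgrams actually returns: a list of (target, context) couples and a parallel list of labels, where a label of 1 marks a true skip-gram pair and 0 a negative sample.

import numpy as np
from tensorflow.keras.preprocessing.sequence import skipgrams

couples, labels = skipgrams(np.arange(3), vocabulary_size=3,
                            window_size=1, negative_samples=1.0)
for (target, context), label in zip(couples, labels):
    # label is 1 for a real (target, context) pair, 0 for a negative sample
    print(target, context, label)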
Example #2
# Imports assumed for this snippet; preprocess, texts, _OOV_TOKEN and SAVE_DIR
# are defined elsewhere in the original project.
from collections import defaultdict

import pandas as pd
from keras.preprocessing.sequence import skipgrams

_OOV_TOKEN_ID = -1

dct, tokenized_docs = preprocess(texts, stem=False)
_OOV_TOKEN_ID = dct.token2id[_OOV_TOKEN]

frequency = defaultdict(int)

data = []

for idx, doc in enumerate(tokenized_docs):
    id_doc = dct.doc2idx(doc, _OOV_TOKEN_ID)
    for token_id in id_doc:
        frequency[token_id] += 1
    pairs, _ = skipgrams(id_doc,
                         vocabulary_size=len(dct),
                         window_size=5,
                         shuffle=True,
                         negative_samples=0)

    if len(pairs) > 2:
        for pair in pairs:
            ex = {}
            ex["target"], ex["context"] = pair
            ex["doc_id"] = idx
            data.append(ex)

df = pd.DataFrame(data)
df.to_csv("{}/train.csv".format(SAVE_DIR), index=False, header=True)

total_count = sum(frequency.values())
normalized_frequency = {k: v / total_count for k, v in frequency.items()}
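The dct object above exposes token2id and doc2idx, which matches gensim's Dictionary API. A small sketch of that pattern in isolation (toy documents, hypothetical "<OOV>" token), assuming dct is indeed a gensim Dictionary:

from gensim.corpora import Dictionary

tokenized_docs = [["the", "quick", "brown", "fox"],
                  ["the", "lazy", "dog"]]
dct = Dictionary(tokenized_docs)
dct.add_documents([["<OOV>"]])  # hypothetical out-of-vocabulary token

oov_id = dct.token2id["<OOV>"]
# unknown tokens map to oov_id instead of the default -1
print(dct.doc2idx(["the", "unseen", "fox"], unknown_word_index=oov_id))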
Example #3
# Imports assumed for this snippet; read_file, create_dirs and NlpPipeline
# are helpers from the original project.
import pickle

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import skipgrams

CORPUS = "datasets/twenty_newsgroups.txt"
EXPERIMENT_DIR = "experiments/twenty_newsgroups/"
create_dirs([EXPERIMENT_DIR])

texts = read_file(CORPUS)
pipeline = NlpPipeline(texts, max_length=1000)

pipeline.tokenize()
pipeline.compact_documents()

data = []

for idx, document in enumerate(pipeline.compact_docs):
    pairs, _ = skipgrams(
        document,
        vocabulary_size=len(pipeline.vocab),
        window_size=5,
        shuffle=True,
        negative_samples=0)

    if len(pairs) > 2:
        for pair in pairs:
            ex = {}
            ex["target"], ex["context"] = pair
            ex["doc_id"] = idx
            data.append(ex)

df = pd.DataFrame(data)
df.to_csv("{}/train_data.csv".format(EXPERIMENT_DIR), index=False, header=True)

np.save("{}/freqs".format(EXPERIMENT_DIR), pipeline.token_counts)
with open("{}/idx_to_word.pickle".format(EXPERIMENT_DIR), "wb") as fp:
    # The original snippet is cut off here; presumably the pipeline's
    # index-to-word mapping is pickled (attribute name assumed).
    pickle.dump(pipeline.idx_to_word, fp)
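A minimal sketch (file names taken from the snippet above, everything else assumed) of loading the saved artifacts back for a later training run:

import pickle

import numpy as np
import pandas as pd

freqs = np.load("experiments/twenty_newsgroups/freqs.npy")
with open("experiments/twenty_newsgroups/idx_to_word.pickle", "rb") as fp:
    idx_to_word = pickle.load(fp)

train = pd.read_csv("experiments/twenty_newsgroups/train_data.csv")
targets = train["target"].values    # skip-gram centre words
contexts = train["context"].values  # skip-gram context words
doc_ids = train["doc_id"].values    # document each pair came from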
Example #4
    val_model = Model([target_inputs, context_inputs], similarity)

    return model, val_model


if __name__ == '__main__':
    data, word2idx = build_dataset(max_words=VOCAB_SIZE)
    print('First 10 word indices in the corpus: ', data[:10])

    model, val_model = build_model()

    callback = SimilarityCallback(word2idx, val_model, vocab_size=VOCAB_SIZE)

    sampling_table = sequence.make_sampling_table(VOCAB_SIZE)
    couples, labels = sequence.skipgrams(data,
                                         VOCAB_SIZE,
                                         window_size=WINDOW_SIZE,
                                         sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    x1, x2, y = np.zeros((1, )), np.zeros((1, )), np.zeros((1, ))

    for c in range(EPOCHS):
        idx = np.random.randint(0, len(labels))  # randint's upper bound is exclusive
        x1[0, ] = word_target[idx]
        x2[0, ] = word_context[idx]
        y[0, ] = labels[idx]  # As an np.array
        loss = model.train_on_batch([x1, x2], y)
        if c % 100 == 0:
            print("Iteration: {}, Loss: {}".format(c, loss))