def test_skipgrams(self):
    # test with no window size and binary labels
    couples, labels = preprocessing_sequence.skipgrams(np.arange(3),
                                                       vocabulary_size=3)
    for couple in couples:
        self.assertIn(couple[0], [0, 1, 2])
        self.assertIn(couple[1], [0, 1, 2])

    # test window size and categorical labels
    couples, labels = preprocessing_sequence.skipgrams(np.arange(5),
                                                       vocabulary_size=5,
                                                       window_size=1,
                                                       categorical=True)
    for couple in couples:
        self.assertLessEqual(couple[0] - couple[1], 3)
    for l in labels:
        self.assertEqual(len(l), 2)
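# For reference, a quick look at what skipgrams() returns on a toy sequence.
# This is a minimal standalone sketch (the import path and variable names are
# illustrative, not part of the test suite above): each couple is a
# (target, context) pair drawn from a sliding window, and the matching label
# is 1 for a true co-occurrence or 0 for a randomly sampled negative pair.
from keras.preprocessing.sequence import skipgrams

toy_sequence = [1, 2, 3, 4, 5]            # already integer-encoded tokens; id 0 is reserved
couples, labels = skipgrams(toy_sequence,
                            vocabulary_size=6,
                            window_size=2,
                            negative_samples=1.0,
                            shuffle=False)
for (target, context), label in zip(couples, labels):
    print(target, context, label)          # e.g. "1 2 1" for a positive pair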
from collections import defaultdict

import pandas as pd
from keras.preprocessing.sequence import skipgrams

# preprocess(), _OOV_TOKEN, texts and SAVE_DIR are defined elsewhere in the script.
_OOV_TOKEN_ID = -1

dct, tokenized_docs = preprocess(texts, stem=False)
_OOV_TOKEN_ID = dct.token2id[_OOV_TOKEN]

frequency = defaultdict(int)
data = []
for idx, doc in enumerate(tokenized_docs):
    # Map tokens to ids, sending unknown tokens to the OOV id.
    id_doc = dct.doc2idx(doc, _OOV_TOKEN_ID)
    for token_id in id_doc:
        frequency[token_id] += 1
    # Generate (target, context) pairs only; no negatives are sampled here.
    pairs, _ = skipgrams(id_doc,
                         vocabulary_size=len(dct),
                         window_size=5,
                         shuffle=True,
                         negative_samples=0)
    if len(pairs) > 2:
        for pair in pairs:
            ex = {}
            ex["target"], ex["context"] = pair
            ex["doc_id"] = idx
            data.append(ex)

df = pd.DataFrame(data)
df.to_csv("{}/train.csv".format(SAVE_DIR), index=False, header=True)

# Normalize raw token counts into a relative frequency distribution.
total_count = sum(frequency.values())
normalized_frequency = {k: v / total_count for k, v in frequency.items()}
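# normalized_frequency is not used further in this snippet. One common use
# (assumed here, not shown in the source) is to build the unigram^0.75 noise
# distribution that word2vec-style models draw negative samples from, which
# would explain why skipgrams() above is called with negative_samples=0.
import numpy as np

token_ids = np.array(sorted(normalized_frequency.keys()))
probs = np.array([normalized_frequency[t] for t in token_ids]) ** 0.75
probs /= probs.sum()
negative_ids = np.random.choice(token_ids, size=5, p=probs)  # 5 sampled negative token ids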
import pickle

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import skipgrams

# create_dirs(), read_file() and NlpPipeline are project helpers defined elsewhere.
CORPUS = "datasets/twenty_newsgroups.txt"
EXPERIMENT_DIR = "experiments/twenty_newsgroups/"

create_dirs([EXPERIMENT_DIR])
texts = read_file(CORPUS)

pipeline = NlpPipeline(texts, max_length=1000)
pipeline.tokenize()
pipeline.compact_documents()

data = []
for idx, document in enumerate(pipeline.compact_docs):
    # Generate (target, context) pairs only; no negatives are sampled here.
    pairs, _ = skipgrams(document,
                         vocabulary_size=len(pipeline.vocab),
                         window_size=5,
                         shuffle=True,
                         negative_samples=0)
    if len(pairs) > 2:
        for pair in pairs:
            ex = {}
            ex["target"], ex["context"] = pair
            ex["doc_id"] = idx
            data.append(ex)

df = pd.DataFrame(data)
df.to_csv("{}/train_data.csv".format(EXPERIMENT_DIR), index=False, header=True)
np.save("{}/freqs".format(EXPERIMENT_DIR), pipeline.token_counts)
with open("{}/idx_to_word.pickle".format(EXPERIMENT_DIR), "wb") as fp:
    pickle.dump(pipeline.idx_to_word, fp)  # assumes the pipeline exposes an idx_to_word mapping
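# A minimal sketch of how the artifacts written above could be loaded back by a
# downstream training script; that script is not shown here, so the variable
# names below are illustrative assumptions rather than the author's code.
import pickle

import numpy as np
import pandas as pd

train_df = pd.read_csv("{}/train_data.csv".format(EXPERIMENT_DIR))
targets = train_df["target"].values
contexts = train_df["context"].values
doc_ids = train_df["doc_id"].values

freqs = np.load("{}/freqs.npy".format(EXPERIMENT_DIR))   # np.save appends ".npy"
with open("{}/idx_to_word.pickle".format(EXPERIMENT_DIR), "rb") as fp:
    idx_to_word = pickle.load(fp)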
    # Tail of build_model(); the full definition appears earlier in the script
    # (see the sketch after this snippet for one possible shape).
    val_model = Model([target_inputs, context_inputs], similarity)
    return model, val_model


if __name__ == '__main__':
    data, word2idx = build_dataset(max_words=VOCAB_SIZE)
    print('First 10 word ids of the encoded corpus: ', data[:10])

    model, val_model = build_model()
    callback = SimilarityCallback(word2idx, val_model, vocab_size=VOCAB_SIZE)

    # Down-weight very frequent words, then build skip-gram training pairs.
    sampling_table = sequence.make_sampling_table(VOCAB_SIZE)
    couples, labels = sequence.skipgrams(data, VOCAB_SIZE,
                                         window_size=WINDOW_SIZE,
                                         sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    # Train on one randomly chosen (target, context, label) triple per step.
    x1, x2, y = np.zeros((1, )), np.zeros((1, )), np.zeros((1, ))
    for c in range(EPOCHS):
        idx = np.random.randint(0, len(labels))  # upper bound is exclusive
        x1[0, ] = word_target[idx]
        x2[0, ] = word_context[idx]
        y[0, ] = labels[idx]  # as an np.array
        loss = model.train_on_batch([x1, x2], y)
        if c % 100 == 0:
            print("Iteration: {}, Loss: {}".format(c, loss))
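# One plausible shape for the rest of build_model(), shown only so the tail
# above (target_inputs, context_inputs, similarity, model) is self-explanatory.
# The script's real definition appears earlier and may differ; the function
# name and embed_dim default here are illustrative assumptions.
from keras.layers import Dense, Dot, Embedding, Flatten, Input
from keras.models import Model


def build_model_sketch(vocab_size, embed_dim=128):
    target_inputs = Input((1,))
    context_inputs = Input((1,))
    embedding = Embedding(vocab_size, embed_dim, input_length=1)

    target = Flatten()(embedding(target_inputs))
    context = Flatten()(embedding(context_inputs))

    # Training head: dot product + sigmoid, trained with binary crossentropy.
    dot = Dot(axes=1)([target, context])
    output = Dense(1, activation="sigmoid")(dot)
    model = Model([target_inputs, context_inputs], output)
    model.compile(loss="binary_crossentropy", optimizer="rmsprop")

    # Validation head: cosine similarity between the same embeddings,
    # used by the similarity callback rather than for training.
    similarity = Dot(axes=1, normalize=True)([target, context])
    val_model = Model([target_inputs, context_inputs], similarity)
    return model, val_model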