def find_most_similar_words(model=None, word_vec_path=None, word=None):
    # === Load dictionaries
    dict_dir = os.path.join(get_data_dir(), "book_dict")
    word_cnt_dict_path = os.path.join(dict_dir, "word_cnt_dict.pkl")
    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    word2id_dict_path = os.path.join(dict_dir, "word2id_dict.pkl")
    word2id_dict = load_dictionary(dict_path=word2id_dict_path)
    print(len(word2id_dict))

    id2word_dict_path = os.path.join(dict_dir, "id2word_dict.pkl")
    id2word_dict = load_dictionary(dict_path=id2word_dict_path)
    print(len(id2word_dict))

    if word not in word2id_dict:
        print('%s not in dict' % word)
        return None
    word_idx = word2id_dict[word]

    # === Load word_vec (one "id\tv1,v2,..." line per word)
    word_vecs = []
    with open(word_vec_path, 'r') as f:
        for line in f:
            buf = line[:-1].split('\t')
            vec = np.array(list(map(lambda x: float(x), buf[1].split(','))))
            word_vecs.append(vec)
    word_vecs = np.array(word_vecs)

    # === Find the top-20 neighbours by dot-product similarity
    word_vec = word_vecs[word_idx]
    print('word_idx', word_idx)
    sims = np.dot(word_vecs, word_vec)
    ranks = np.argsort(-sims)
    print('ranks', ranks[:20])
    print('scores', sims[ranks[:20]])
    sim_words = [id2word_dict[idx] for idx in ranks[:20]]
    print("Top sim words of '%s' are: " % word)
    print(sim_words)
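# Note: the ranking above uses a raw dot product, so vector length affects the
# scores. A minimal alternative sketch (not part of the original code) that
# ranks by cosine similarity instead would L2-normalize the vectors first:
#
#     norms = np.linalg.norm(word_vecs, axis=1, keepdims=True)
#     normed_vecs = word_vecs / np.clip(norms, 1e-8, None)
#     sims = np.dot(normed_vecs, normed_vecs[word_idx])
#
# The rest of the ranking logic (argsort, top-20 lookup) stays the same.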
    def sample(self, num_sample=None, method="random"):
        if method == "random":
            return self._random_sample(num_sample)
        elif method == "weighted":
            return self._weighted_sample(num_sample)

    def _random_sample(self, num_sample):
        return np.random.choice(self.word_idxes, num_sample)

    def _weighted_sample(self, num_sample):
        return np.random.choice(self.word_idxes, num_sample, p=self.probs)


if __name__ == '__main__':
    dict_dir = os.path.join(get_data_dir(), "book_dict")
    word_cnt_dict_path = os.path.join(dict_dir, "word_cnt_dict.pkl")
    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    word2id_dict_path = os.path.join(dict_dir, "word2id_dict.pkl")
    word2id_dict = load_dictionary(dict_path=word2id_dict_path)
    print(len(word2id_dict))

    id2word_dict_path = os.path.join(dict_dir, "id2word_dict.pkl")
    id2word_dict = load_dictionary(dict_path=id2word_dict_path)
    print(len(id2word_dict))

    #word_cnt_dict = {"word": 100, 2: 200, 3: 300}
    #word2id_dict = {"machine": 0, 2: 1, 3: 2}
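    # A possible continuation (assumption; the sampler construction is not shown
    # in this excerpt): build the negative-sampling distribution from the word
    # counts with the unigram^0.75 smoothing commonly used for word2vec, e.g.
    #
    #     counts = np.array([word_cnt_dict[id2word_dict[i]]
    #                        for i in range(len(id2word_dict))], dtype=np.float64)
    #     probs = counts ** 0.75
    #     probs /= probs.sum()
    #     sampler = Sampler(word_idxes=np.arange(len(id2word_dict)), probs=probs)
    #
    # "Sampler" is a hypothetical name for the class that owns the
    # sample()/_weighted_sample() methods above.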
if __name__ == '__main__':
    checkpoint_dir = os.path.join(get_model_dir(), "word2vec")
    word_vec_path = os.path.join(get_data_dir(), "word_vectors")

    vocab_size = 10001  # ptb, min_cnt = 5
    window_size = 5
    num_neg = 5
    embedding_dim = 64

    # === Load model
    checkpoint_dir = os.path.join(get_model_dir(), "word2vec")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    w2v = Word2vec(vocab_size=vocab_size, window_size=window_size,
                   num_neg=num_neg, embedding_dim=embedding_dim)
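    # A possible continuation (assumption; the rest of the script is not shown
    # in this excerpt): restore the latest checkpoint into the model and query
    # neighbours with the function defined above, e.g.
    #
    #     checkpoint = tf.train.Checkpoint(model=w2v)
    #     checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    #     find_most_similar_words(model=w2v, word_vec_path=word_vec_path, word="computer")
    #
    # "computer" is only an illustrative query; any in-vocabulary word works.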
            yield contexts, target, negatives

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_shapes=((window_size*2, ), (1, ), (num_neg, )),
        output_types=(tf.int32, tf.int32, tf.int32)
    )
    return dataset.repeat(count=epochs)\
        .shuffle(buffer_size=shuffle_buffer_size)\
        .batch(batch_size=batch_size)


if __name__ == "__main__":
    train_path = os.path.join(get_data_dir(), "ptb.train.txt")
    val_path = os.path.join(get_data_dir(), "ptb.valid.txt")
    dict_dir = os.path.join(get_data_dir(), "book_dict")

    """
    train_dataset = get_dataset(input_path=train_path, dict_dir=dict_dir)
    print("===Train===")
    for data in train_dataset.take(2):
        print(data)
    """

    model = Word2vec()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            """
            if len(line[:-1].split(' ')) < 7:  # window_size*2+1
                continue
            """
            ratio = np.random.random()
            if ratio < train_ratio:
                fw_train.write(line)
            else:
                fw_val.write(line)
    fw_train.close()
    fw_val.close()


if __name__ == '__main__':
    data_dir = get_data_dir()
    data_path = os.path.join(data_dir, "book_text.txt")
    train_path = os.path.join(data_dir, "book_text_train.txt")
    val_path = os.path.join(data_dir, "book_text_val.txt")
    train_ratio = 0.8

    train_val_split(data_path=data_path, train_path=train_path,
                    val_path=val_path, train_ratio=train_ratio)
    print("Split done! train_path: %s" % train_path)
def train_word2vec():
    vocab_size = 10001  # min_cnt=5, ptb
    total_num_train = 971657
    total_num_val = 77130
    shuffle_buffer_size = 2048 * 2
    epochs = 10
    batch_size = 128
    window_size = 5
    num_neg = 5
    embedding_dim = 64  # To tune

    train_path = os.path.join(get_data_dir(), "ptb.train.txt")
    val_path = os.path.join(get_data_dir(), "ptb.valid.txt")
    dict_dir = os.path.join(get_data_dir(), "book_dict")

    train_dataset = get_dataset(input_path=train_path, dict_dir=dict_dir,
                                shuffle_buffer_size=shuffle_buffer_size,
                                epochs=epochs, batch_size=batch_size,
                                window_size=window_size, num_neg=num_neg)
    # val_dataset is built here but not yet used in the loop below
    val_dataset = get_dataset(input_path=val_path, dict_dir=dict_dir,
                              shuffle_buffer_size=shuffle_buffer_size,
                              epochs=epochs, batch_size=batch_size,
                              window_size=window_size, num_neg=num_neg)

    optimizer = tf.keras.optimizers.Adam(0.001)
    model = Word2vec(vocab_size=vocab_size, window_size=window_size,
                     num_neg=num_neg, embedding_dim=embedding_dim)

    checkpoint_dir = os.path.join(get_model_dir(), "word2vec")
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

    total_train_batch = total_num_train // batch_size + 1

    # === Train
    start = time.time()
    for epoch in range(epochs):
        total_loss = 0
        batch_loss = 0
        #total_train_batch = 101  # just for debug
        epoch_start = time.time()
        i = 0
        for batch_idx, (contexts, target, negatives) in zip(range(total_train_batch), train_dataset):
            i += 1
            cur_loss = train_step(model, optimizer, contexts, target, negatives)
            batch_loss += cur_loss
            if i % 100 == 0:
                batch_end = time.time()
                batch_last = batch_end - start
                print("Epoch: %d/%d, batch: %d/%d, batch_loss: %.4f, cur_loss: %.4f, lasts: %.2fs"
                      % (epoch + 1, epochs, batch_idx + 1, total_train_batch,
                         batch_loss / (batch_idx + 1), cur_loss, batch_last))
        assert i > 0
        batch_loss /= i
        total_loss += batch_loss
        epoch_end = time.time()
        epoch_last = epoch_end - epoch_start
        # total_loss is reset at the top of each epoch, so report it directly
        # instead of dividing by the epoch index as before.
        print("Epoch: %d/%d, loss: %.4f, lasts: %.2fs"
              % (epoch + 1, epochs, total_loss, epoch_last))

        # === Test sim
        """
        # [vocab_size, embedding_dim]
        weights = model.output_embedding_layer.get_weights()
        weights = np.array(weights[0])
        print(weights.shape)
        # computer: 236
        #sample_word_idx = 236
        # [embedding_dim, ]
        sample = weights[236]
        scores = np.dot(weights, sample)
        rank = np.argsort(-scores)
        top = rank[:20]
        print("top", top)
        print("score", scores[top])
        """
        checkpoint.save(file_prefix=checkpoint_prefix)

    #print(model.output_embedding_layer.get_weights())
    #print(get_word_representation(model=model))
    end = time.time()
    last = end - start
    print("Lasts %.2fs" % last)
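# train_step is called above but its definition is not included in this excerpt.
# A minimal sketch of what it might look like, assuming the Word2vec model's
# forward call returns the negative-sampling loss for a batch of
# (contexts, target, negatives):
#
#     @tf.function
#     def train_step(model, optimizer, contexts, target, negatives):
#         with tf.GradientTape() as tape:
#             loss = model(contexts, target, negatives)
#         grads = tape.gradient(loss, model.trainable_variables)
#         optimizer.apply_gradients(zip(grads, model.trainable_variables))
#         return loss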
                continue
            for word in line.split(' '):
                if word not in word_cnt_dict:
                    word_cnt_dict[word] = 1
                else:
                    word_cnt_dict[word] += 1

    # === Save word_cnt_dict
    with open(dict_path, 'wb') as fw:
        pickle.dump(word_cnt_dict, fw)


if __name__ == "__main__":
    #text_path = os.path.join(get_data_dir(), "book_text.txt")
    text_path = os.path.join(get_data_dir(), "ptb_train_val.txt")
    dict_dir = os.path.join(get_data_dir(), "book_dict")
    word_cnt_dict_path = os.path.join(dict_dir, "word_cnt_dict.pkl")

    count_word(text_path=text_path, dict_path=word_cnt_dict_path)
    build_dictionary(dict_dir=dict_dir, min_word_count=5)

    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    word2id_dict_path = os.path.join(dict_dir, "word2id_dict.pkl")
    word2id_dict = load_dictionary(dict_path=word2id_dict_path)
    print(len(word2id_dict))

    id2word_dict_path = os.path.join(dict_dir, "id2word_dict.pkl")
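# build_dictionary is called above but its definition is not included in this
# excerpt. A hypothetical sketch, assuming it assigns contiguous ids to words
# whose count reaches min_word_count and pickles the forward and reverse
# mappings loaded elsewhere in this repo:
#
#     def build_dictionary(dict_dir, min_word_count=5):
#         with open(os.path.join(dict_dir, "word_cnt_dict.pkl"), 'rb') as f:
#             word_cnt_dict = pickle.load(f)
#         word2id_dict, id2word_dict = {}, {}
#         for word, cnt in word_cnt_dict.items():
#             if cnt < min_word_count:
#                 continue
#             idx = len(word2id_dict)
#             word2id_dict[word] = idx
#             id2word_dict[idx] = word
#         with open(os.path.join(dict_dir, "word2id_dict.pkl"), 'wb') as fw:
#             pickle.dump(word2id_dict, fw)
#         with open(os.path.join(dict_dir, "id2word_dict.pkl"), 'wb') as fw:
#             pickle.dump(id2word_dict, fw)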
        training_num = 0
        for line in f:
            for word in line[:-1].split(' '):
                if word not in word_cnt_dict:
                    continue
                cnt = word_cnt_dict[word]
                if cnt < min_word_cnt:
                    continue
                training_num += 1
    return training_num


if __name__ == '__main__':
    #train_path = os.path.join(get_data_dir(), "shuf_train.txt")
    train_path = os.path.join(get_data_dir(), "ptb.valid.txt")
    word_cnt_dict_path = os.path.join(get_data_dir(), "book_dict", "word_cnt_dict.pkl")
    min_word_cnt = 5

    word_cnt_dict = load_dictionary(dict_path=word_cnt_dict_path)
    print(len(word_cnt_dict))

    training_num = count_training_num(input_path=train_path,
                                      word_cnt_dict=word_cnt_dict,
                                      min_word_cnt=min_word_cnt)
    print(training_num)