class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        # self.model.load_state_dict(torch.load("../results/skipgram_nge.pkl"))
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(5 * batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            process_bar.set_postfix(loss=loss.data)
            process_bar.update()
        torch.save(self.model.state_dict(), "../results/skipgram_nge.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
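Most of the trainers in this collection decay the learning rate linearly with training progress, as in the loop above. A minimal standalone sketch of that schedule, assuming hypothetical `initial_lr` and `batch_count` values; it is not a helper any of these implementations define, just an illustration:

def linearly_decayed_lr(initial_lr, batch_index, batch_count, floor=0.0):
    """Illustrative linear decay: the rate shrinks toward `floor` as
    batch_index approaches batch_count (hypothetical helper)."""
    lr = initial_lr * (1.0 - 1.0 * batch_index / batch_count)
    return max(lr, floor)

# Example: with initial_lr=0.025 and 10000 batches, batch 5000 uses lr=0.0125.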
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr,
                                             self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.data, self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SparseAdam(self.model.parameters(), lr=self.lr)

    def train(self):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        max_accuracy = 0
        for epoch in range(5000):
            all_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(all_pairs)
            # pos: pairs whose Huffman code is 1
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]  # inner nodes for code 1
            # neg: pairs whose Huffman code is 0
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]  # inner nodes for code 0
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()  # gradient update
            # mid_end = time.perf_counter()
            # print('one time:%s seconds' % (mid_end - start))
            if epoch % 100 == 0:
                print("Epoch : %d, loss : %.02f" % (epoch, loss))
                ac = self.model.predict(all_pairs, self.data.huffman_tree)
                if ac > max_accuracy:
                    max_accuracy = ac
        end = time.perf_counter()
        print('time:%s seconds' % (end - start))
        print('accuracy:%.06f' % (max_accuracy))
        # self.model.save_embedding(self.data.id2word_dict, self.output_file_name)

        # t-SNE plot of the word vectors
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=500)
        embed_two = tsne.fit_transform(
            self.model.u_embeddings.weight.cpu().detach().numpy())
        labels = [self.data.id2word_dict[i] for i in range(200)]
        plt.figure(figsize=(15, 12))
        for i, label in enumerate(labels):
            x, y = embed_two[i, :]
            plt.scatter(x, y)
            plt.annotate(label, (x, y), ha='center', va='top')
        plt.savefig('HS.png')
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=50, window_size=5, iteration=5, initial_lr=0.025,
                 neg_num=5, min_count=5):
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(
                    self.data.id2word, self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name + 'final')
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("CBOW Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        loss = -1
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_w = [int(pair[1]) for pair in pos_pairs]
            neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
            self.optimizer.zero_grad()
            loss_now = self.model.forward(pos_u, pos_w, neg_w)
            if loss == -1:
                loss = loss_now.data.item()
            else:
                loss = 0.95 * loss + 0.05 * loss_now.data.item()
            loss_now.backward()
            self.optimizer.step()
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
            process_bar.set_postfix(loss=loss)
            process_bar.update()
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
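The running `loss` in this variant is an exponential moving average of the per-batch loss (95% old value, 5% new value), which smooths the progress-bar readout. A small hedged sketch of the same smoothing in isolation, with made-up batch losses:

def smooth_losses(batch_losses, momentum=0.95):
    """Exponential moving average as used for the progress bar above (illustrative)."""
    smoothed = None
    history = []
    for value in batch_losses:
        if smoothed is None:
            smoothed = value
        else:
            smoothed = momentum * smoothed + (1 - momentum) * value
        history.append(smoothed)
    return history

# e.g. smooth_losses([4.0, 3.0, 5.0]) -> [4.0, 3.95, 4.0025]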
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        for _ in range(1, EPOCH + 1):
            print("CBOW Training......")
            pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
            print("pairs_count", pairs_count)
            batch_count = int(np.ceil(pairs_count / BATCH_SIZE))
            print("batch_count", batch_count)
            process_bar = tqdm(range(int(batch_count)))
            # for _ in range(1, EPOCH + 1):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_u = [pair[0] for pair in pos_pairs]
                pos_w = [int(pair[1]) for pair in pos_pairs]
                neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_w, neg_w)
                loss.backward()
                self.optimizer.step()
                if i * BATCH_SIZE % 100000 == 0:
                    self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = self.lr
                process_bar.set_postfix(loss=loss.data)
                process_bar.update()
            print('\n')
        torch.save(self.model.state_dict(),
                   "../results/url_with_location_cbow_neg.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
            process_bar.set_postfix(loss=loss.data.cpu().numpy())
            process_bar.update()
        torch.save(self.model.state_dict(), "../results/skipgram_hs.pkl")
        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=50, window_size=5, iteration=1, initial_lr=0.025,
                 min_count=1):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data file. Each line is a sentence split with spaces.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency; words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(),  # loss.data[0] only works on PyTorch <= 0.3
                 self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
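A minimal sketch of how a trainer with this constructor signature might be driven from a script; the corpus and output paths are placeholders, not files shipped with any of these implementations:

if __name__ == '__main__':
    # Hypothetical paths; the input is one space-separated sentence per line.
    w2v = Word2Vec(input_file_name='corpus.txt',
                   output_file_name='embedding.txt',
                   emb_dimension=100,
                   batch_size=50,
                   window_size=5,
                   iteration=1,
                   initial_lr=0.025,
                   min_count=1)
    w2v.train()  # writes the word vectors to embedding.txt when training finishes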
class Word2Vec:
    def __init__(
            self,
            input_file_name,
            input_wvectors,
            input_cvectors,
            input_ps,
            input_ns,
            output_file_name,
            emb_dimension=100,
            batch_size=50,
            window_size=5,
            kn=20,
            iteration=1,
            initial_lr=0.001,
            clip=1.0,
            min_count=30,
            batch_num_to_valid=100000,
    ):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data file. Each line is a sentence split with spaces.
            input_wvectors / input_cvectors: Pretrained word / context vectors.
            input_ps / input_ns: Pretrained positive samples & negative samples.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            kn: k neighbors.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency; words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.pre_wvectors = InputVector(input_wvectors)
        self.pre_cvectors = InputVector(input_cvectors)
        self.ps_w = load_from_pkl(input_ps)
        self.ns_w = load_from_pkl(input_ns)
        self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
        self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.kn = kn
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.clip = clip
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.pre_wvectors, self.pre_cvectors)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        self.batch_num_to_valid = batch_num_to_valid

    def train(self, similarity_test_paths, synset_paths, analogy_paths):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        best_scores = dict()
        tmp_emb_dir = os.path.join(tempfile.gettempdir(), 'embedding')
        tmp_emb_path = os.path.join(
            tmp_emb_dir,
            ''.join(random.sample(string.ascii_letters + string.digits, 16)))
        for epoch in range(self.iteration):
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                      self.window_size)
                pos_u, mask_pos_u = self.data.get_ps_batch(pos_pairs, self.ps,
                                                           self.kn)
                neg_u, mask_neg_u = self.data.get_ns_batch(pos_pairs, self.ns,
                                                           self.kn)
                pair_u = [pair[0] for pair in pos_pairs]
                pair_v = [pair[1] for pair in pos_pairs]

                pair_u = Variable(torch.LongTensor(pair_u))
                pair_v = Variable(torch.LongTensor(pair_v))
                pos_u = Variable(torch.LongTensor(pos_u))
                mask_pos_u = Variable(torch.FloatTensor(mask_pos_u))
                neg_u = Variable(torch.LongTensor(neg_u))
                mask_neg_u = Variable(torch.FloatTensor(mask_neg_u))
                if self.use_cuda:
                    pair_u = pair_u.cuda()
                    pair_v = pair_v.cuda()
                    pos_u = pos_u.cuda()
                    mask_pos_u = mask_pos_u.cuda()
                    neg_u = neg_u.cuda()
                    mask_neg_u = mask_neg_u.cuda()

                self.optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pair_u, pair_v, pos_u,
                                                    mask_pos_u, neg_u,
                                                    mask_neg_u)
                loss.backward()
                # clip_grad_norm was renamed clip_grad_norm_ in newer PyTorch.
                torch.nn.utils.clip_grad_norm_(
                    self.skip_gram_model.parameters(), self.clip)
                self.optimizer.step()

                process_bar.set_description(
                    "Loss: %0.8f, lr: %0.6f" %
                    (loss.item(), self.optimizer.param_groups[0]['lr']))
                if i * self.batch_size % 100000 == 0:
                    lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr
                if i % self.batch_num_to_valid == 0:
                    logging.info('epoch%d_batch%d, evaluating...' % (epoch, i))
                    self.save_embedding(self.data.id2word, tmp_emb_path,
                                        self.use_cuda)
                    best_scores, save_flag = evaluation(
                        tmp_emb_path, similarity_test_paths, synset_paths,
                        analogy_paths, best_scores)
                    if save_flag:
                        emb_save_path = self.output_file_name + \
                            "_epoch%d_batch%d" % (epoch, i)
                        shutil.move(tmp_emb_path, emb_save_path)
                        logging.info('Save current embedding to %s' %
                                     emb_save_path)
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name,
                                            self.use_cuda)
        logging.info('final evaluating...')
        self.save_embedding(self.data.id2word, tmp_emb_path, self.use_cuda)
        best_scores, save_flag = evaluation(tmp_emb_path, similarity_test_paths,
                                            synset_paths, analogy_paths,
                                            best_scores)
        if save_flag:
            emb_save_path = self.output_file_name + "_epoch%d" % epoch
            shutil.move(tmp_emb_path, emb_save_path)
            logging.info('Save current embedding to %s' % emb_save_path)

    def save_embedding(self, id2word, file_name, use_cuda):
        """Save all embeddings to file.

        As this class only records word ids, the map from id to word has to be
        passed in from outside.

        Args:
            id2word: Map from word id to word.
            file_name: Output file name.

        Returns:
            None.
        """
        if use_cuda:
            embedding = self.skip_gram_model.u_embeddings.weight.cpu().data.numpy()
        else:
            embedding = self.skip_gram_model.u_embeddings.weight.data.numpy()
        fout = open(file_name, 'w')
        fout.write('%d %d\n' % (len(id2word), self.emb_dimension))
        for wid, w in id2word.items():
            e = embedding[wid]
            e = ' '.join(map(lambda x: str(x), e))
            fout.write('%s %s\n' % (w, e))
class Word2Vec:
    def __init__(
            self,
            input_path,
            output_dir,
            wordsim_path,
            dimension=100,
            batch_size=batch_size,
            window_size=5,
            epoch_count=1,
            initial_lr=1e-6,
            min_count=5,
    ):
        self.data = InputData(input_path, min_count)
        self.output_dir = output_dir
        self.vocabulary_size = len(self.data.id_from_word)
        self.dimension = dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.epoch_count = epoch_count
        self.initial_lr = initial_lr
        self.model = SkipGramModel(self.vocabulary_size, self.dimension)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = nn.DataParallel(self.model.to(self.device))
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)
        if wordsim_path:
            self.wordsim_verification_tuples = []
            with open(wordsim_path, 'r') as f:
                f.readline()  # Abandon header
                for line in f:
                    word1, word2, actual_similarity = line.split(',')
                    self.wordsim_verification_tuples.append(
                        (word1, word2, float(actual_similarity))
                    )
        else:
            self.wordsim_verification_tuples = None

    def train(self):
        pair_count = self.data.get_pair_count(self.window_size)
        batch_count = self.epoch_count * pair_count / self.batch_size
        best_rho = float('-inf')
        for i in tqdm(range(int(batch_count)), total=batch_count):
            self.model.train()
            pos_pairs = self.data.get_batch_pairs(
                self.batch_size, self.window_size
            )
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = torch.tensor(pos_u, device=self.device)
            pos_v = torch.tensor(pos_v, device=self.device)
            neg_v = torch.tensor(neg_v, device=self.device)

            self.optimizer.zero_grad()
            loss = self.model(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i % 250 == 0:
                self.model.eval()
                rho = self.model.module.get_wordsim_rho(
                    self.wordsim_verification_tuples,
                    self.data.id_from_word,
                    self.data.word_from_id
                )
                print(
                    f'Loss: {loss.item()},'
                    f' lr: {self.optimizer.param_groups[0]["lr"]},'
                    f' rho: {rho}'
                )
                dump_embedding(
                    self.model.module.get_embedding(
                        self.data.id_from_word, self.data.word_from_id
                    ),
                    self.model.module.dimension,
                    self.data.word_from_id,
                    os.path.join(self.output_dir, 'latest.txt'),
                )
                if rho > best_rho:
                    dump_embedding(
                        self.model.module.get_embedding(
                            self.data.id_from_word, self.data.word_from_id
                        ),
                        self.model.module.dimension,
                        self.data.word_from_id,
                        os.path.join(self.output_dir, f'{i}_{rho}.txt')
                    )
                    best_rho = rho

            # warm up
            if i < 10000:
                lr = self.initial_lr * i / 10000
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            elif i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
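This variant adds a linear warm-up over the first 10000 batches before switching to the usual linear decay. A small illustrative sketch of that combined schedule, separate from the trainer; the constants mirror the ones used above:

def warmup_then_decay_lr(initial_lr, batch_index, batch_count, warmup_batches=10000):
    """Linear warm-up for warmup_batches, then linear decay (illustrative helper)."""
    if batch_index < warmup_batches:
        return initial_lr * batch_index / warmup_batches
    return initial_lr * (1.0 - 1.0 * batch_index / batch_count)

# e.g. with initial_lr=1e-6: batch 5000 -> 5e-7 (warm-up); batch 20000 of 40000 -> 5e-7 (decay).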
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
                 min_count=5, using_hs=False, using_neg=False, context_size=2,
                 hidden_size=128, cbow=None, skip_gram=None):
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        print("Skip_Gram Training......")
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)
            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("Skip_Gram Trained and Saving File......")
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)
        print("Skip_Gram Trained and Saved File.")

    def cbow_train(self):
        print("CBOW Training......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        pos_all_pairs = self.data.get_cbow_batch_all_pairs(self.batch_size,
                                                           self.context_size)
        pair_count = len(pos_all_pairs)
        process_bar = tqdm(range(int(pair_count / self.batch_size)))
        for _ in process_bar:
            pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size,
                                                       self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
                 min_count=5, using_hs=False, using_neg=False, context_size=2,
                 hidden_size=128, cbow=None, skip_gram=None):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data file. Each line is a sentence split with spaces.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency; words with lower frequency will be filtered.
            using_hs: Whether to use hierarchical softmax.

        Returns:
            None.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        print("Input Data", self.data)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        print("emb_size", self.emb_size)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            # self.cbow_model = CBOW(self.emb_size, self.context_size,
            #                        self.emb_dimension, self.hidden_size)
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)

    # @profile
    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(
                    pos_pairs, 5)
            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            self.output_file_name)

    def cbow_train(self):
        print("CBOW Training......")
        pair_count = self.data.evaluate_pair_count(self.context_size * 2 + 1)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.cbow_model.save_embedding(self.data.id2word,
                                       'cbow_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_cbow_batch_all_pairs(self.batch_size,
                                                           self.context_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(
                    pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(
                    pos_pairs, self.context_size)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            # loss = self.cbow_model.forwards(pos_v, pos_u, neg_v, neg_u)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f" %
                  (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word,
                                       self.output_file_name)
        print("CBOW Trained and Saved File.")
class Word2Vec: """ Word2vec complete process """ def __init__(self, infile, outfile, emb_dim=100, batch_size=128, window_size=5, epochs=5, initial_lr=1, min_count=5): self.data = InputData(infile, min_count) self.outfile = outfile self.emb_size = len(self.data.id2word) self.emb_dim = emb_dim self.batch_size = batch_size self.window_size = window_size self.epochs = epochs self.initial_lr = initial_lr self.wv_model = SkipgramModel(self.emb_size, self.emb_dim) self.use_cuda = torch.cuda.is_available() if self.use_cuda: self.wv_model.cuda() self.optimizer = optim.SGD(self.wv_model.parameters(), lr=self.initial_lr) def train(self, use_neg=False): """ Train word2vec """ pair_count = self.data.estimate_pair_count(self.window_size) batch_count = self.epochs * pair_count / self.batch_size process_bar = tqdm(range(int(batch_count))) for idx in process_bar: pos_pairs = self.data.get_batch_pairs(self.batch_size, self.window_size) targs = [x[0] for x in pos_pairs] conts = [x[1] for x in pos_pairs] if use_neg: negs = self.data.get_neg_pairs(pos_pairs, self.window_size) else: negs = None targs = Variable(torch.LongTensor(targs)) conts = Variable(torch.LongTensor(conts)) if use_neg: negs = Variable(torch.LongTensor(negs)) if self.use_cuda: targs = targs.cuda() conts = conts.cuda() if use_neg: negs = negs.cuda() self.optimizer.zero_grad() loss = self.wv_model.forward(targs, conts, negs) loss.backward() self.optimizer.step() process_bar.set_description( "Loss: %0.8f, lr: %0.6f" % #(loss.data[0], self.optimizer.param_groups[0]['lr'])) (loss.data.item(), self.optimizer.param_groups[0]['lr'])) if idx * self.batch_size % 100000 == 0: lr = self.initial_lr * (1.0 - 1.0 * idx / batch_count) for param_group in self.optimizer.param_groups: param_group['lr'] = lr self.wv_model.save(self.data.id2word, self.outfile, self.use_cuda)
class Word2Vec:
    def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
                 batch_size, window_size, iteration, initial_lr, min_count):
        self.data = InputData(wikidump_filename, min_count, output_text_filename)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

    def train(self):
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f" %
                (loss.data, self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr

    def calculate_probability(self, word1, word2):
        embeddings = self.skip_gram_model.u_embeddings.weight.data.numpy()
        embedding_1 = embeddings[self.data.word2id[word1]]
        embedding_2 = embeddings[self.data.word2id[word2]]
        numerator = np.exp(np.sum(embedding_1 * embedding_2))
        denominator = np.sum(
            np.exp(np.sum(np.multiply(embedding_1, embeddings), axis=1)), axis=0)
        return numerator / denominator

    def wordsim353_spearman(self, input_filename):
        target_word = []
        context_word = []
        human_scores = []
        with open(input_filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            ws353_pairs = -1
            for row in csv_reader:
                if ws353_pairs == -1:
                    # Skip the header row.
                    ws353_pairs += 1
                else:
                    target_word.append(row[0])
                    context_word.append(row[1])
                    human_scores.append(float(row[2]))
                    ws353_pairs += 1
        for pair in range(0, ws353_pairs):
            if target_word[pair] not in self.data.word2id:
                raise Exception('Target word not in model vocab: ',
                                target_word[pair])
            if context_word[pair] not in self.data.word2id:
                raise Exception('Context word not in model vocab: ',
                                context_word[pair])
        human_rankings = ss.rankdata(human_scores)
        machine_scores = []
        for pair in range(0, len(human_scores)):
            machine_scores.append(
                self.calculate_probability(target_word[pair], context_word[pair]))
        machine_rankings = ss.rankdata(machine_scores)
        human_scores_dict = dict()
        machine_scores_dict = dict()
        for pair in range(0, len(human_scores)):
            human_scores_dict[pair] = human_rankings[pair]
            machine_scores_dict[pair] = machine_rankings[pair]
        return spearman.spearman_correlation(human_scores_dict,
                                             machine_scores_dict)
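The WordSim-353 evaluation above ranks the human and model scores by hand and passes them to a custom `spearman` module. As a sanity check, the same rank correlation can also be computed directly with SciPy; a hedged sketch, assuming the two parallel score lists built in `wordsim353_spearman`:

from scipy import stats

def spearman_with_scipy(human_scores, machine_scores):
    """Cross-check of the hand-rolled Spearman computation (illustrative)."""
    rho, p_value = stats.spearmanr(human_scores, machine_scores)
    return rho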