def __init__(self, inputfile, vocabulary_size=100000, embedding_dim=200, epoch_num=10,
             batch_size=16, windows_size=5, neg_sample_num=10):
    self.op = Options(inputfile, vocabulary_size)
    self.embedding_dim = embedding_dim
    self.windows_size = windows_size
    self.vocabulary_size = vocabulary_size
    self.batch_size = batch_size
    self.epoch_num = epoch_num
    self.neg_sample_num = neg_sample_num
class word2vec:
    def __init__(self, inputfile, embeddingsfile, reg_lambda=0.000001, embedding_dim=300,
                 epoch_num=5, batch_size=16, windows_size=5, neg_sample_num=10):
        self.op = Options(inputfile, embeddingsfile, embedding_dim)
        self.windows_size = windows_size
        self.vocabulary_size = self.op.vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num
        self.reg_lambda = reg_lambda

    def train(self):
        # Skip-gram model with L2 regularization, initialized from the pre-trained vectors
        # loaded by Options.
        model = skipgram(self.vocabulary_size, self.op.embeddings.shape[1], reg=self.reg_lambda)
        model.init_emd(self.op.embeddings)
        if torch.cuda.is_available():
            model.cuda()
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0
            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size,
                                                             self.neg_sample_num)
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))
                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()
                if batch_num % 30000 == 0:
                    torch.save(model.state_dict(),
                               './tmp/skipgram.epoch{}.batch{}'.format(epoch, batch_num))
                if batch_num % 2000 == 0:
                    end = time.time()
                    word_embeddings = model.input_embeddings()
                    sp1, sp2 = scorefunction(word_embeddings)
                    print('epoch,batch=%2d %5d: sp=%1.3f %1.3f pair/sec = %4.2f loss=%4.3f'
                          % (epoch, batch_num, sp1, sp2,
                             (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.data[0]))
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print()
        print("Optimization Finished!")
        model.save_embedding("embeddings.txt", lambda x: self.op.vocab_words[x])
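# Usage sketch (not from the original source): assuming the Options class above reads a tokenized
# corpus file plus a pre-trained embeddings file, training could be driven like this. Both file
# names are hypothetical placeholders.
if __name__ == '__main__':
    w2v = word2vec('corpus.txt', 'pretrained_embeddings.txt', reg_lambda=1e-6)
    w2v.train()  # writes "embeddings.txt" via model.save_embedding once training finishes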
class ComEmb(object):
    def __init__(self, embedsfile, netfile, netlist, rellist, n_negative, testmodel,
                 lamtas=[1, 1, 8, 8]):
        self.op = Options(embedsfile, netfile, netlist, rellist, lamtas, testmodel)
        self.neg_generator = UniformNegativeGenerator(self.op.vocab_size, self.op.sample_table,
                                                      n_negative=n_negative)
        self.n_negative = n_negative
        print('Initialization finished')

    def train(self, gamma=1, l2=1e-3, epoch_num=400, batch_size=32, embedding_dim=300, lr=0.01,
              emnames='w2v', sname='geo'):
        embeds = self.op.embeds
        dims = self.op.dims
        vocab_size = self.op.vocab_size
        rel_size = self.op.rel_size
        id2word = self.op.id2word
        id2rel = self.op.id2rel
        wordindex = self.op.wordindex
        triples_id = self.op.subsampled_data
        oovs = self.op.oovs
        mean_score = self.op.meanscore
        batch_num = math.ceil(len(triples_id) / batch_size)
        print('dims: ' + str(dims))
        print('learning rate:' + str(lr))
        print('gamma: ' + str(gamma))
        print('batch_num:' + str(batch_num))
        print('mean score:' + str(mean_score))
        model = ensemble(vocab_size, rel_size, embedding_dim, embeds, gamma, l2, dims)
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
        if torch.cuda.is_available():
            model = model.cuda()
        # note = open('note.txt', 'w')
        # note.write('gamma= {}\n'.format(gamma))
        # scheduler = MultiStepLR(optimizer, milestones=[50, 90, 120], gamma=0.5)
        for t in range(epoch_num):
            # scheduler.step()
            batch_num = 0
            for pos_triplets in self.op.batch_iter(triples_id, batch_size):
                # Sample negative triples / entities for each positive triple.
                neg_triplets, neg_ents = self.neg_generator.generate(pos_triplets)
                inp = np.concatenate((pos_triplets[:, 0], pos_triplets[:, 2], neg_ents))
                pos_triplets = np.tile(pos_triplets, (self.n_negative, 1))
                weight = pos_triplets[:, 3]
                weight = Variable(torch.FloatTensor(weight))
                inp = Variable(torch.LongTensor(inp))
                pos_triplets = Variable(torch.LongTensor(pos_triplets))
                neg_triplets = Variable(torch.LongTensor(neg_triplets))
                if torch.cuda.is_available():
                    weight = weight.cuda()
                    inp = inp.cuda()
                    pos_triplets = pos_triplets.cuda()
                    neg_triplets = neg_triplets.cuda()
                optimizer.zero_grad()
                loss, sEmb, sGraph = model(inp, pos_triplets, neg_triplets, weight, mean_score)
                loss.backward()
                optimizer.step()
                # if batch_num == 1000:
                #     note.write('epoch %2d batch %2d sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f '
                #                'loss=%2.5f sEmb=%2.5f sGraph=%2.5f \n'
                #                % (t, batch_num, sp1, sp2, sp3, sp4, sp5, sp6, sp7,
                #                   loss.data[0], sEmb.data[0], sGraph.data[0]))
                if batch_num % 100 == 0:
                    word_embeddings = model.metaemb.weight.data.cpu().numpy()
                    sp1, sp2, sp3, sp4, sp5, sp6, sp7 = scorefunction(wordindex, word_embeddings)
                    print('epoch %2d batch %2d sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f '
                          'loss=%2.5f sEmb=%2.5f sGraph=%2.5f \r'
                          % (t, batch_num, sp1, sp2, sp3, sp4, sp5, sp6, sp7,
                             loss.data[0], sEmb.data[0], sGraph.data[0]), end="")
                batch_num = batch_num + 1
            word_embeddings = model.metaemb.weight.data.cpu().numpy()
            sp1, sp2, sp3, sp4, sp5, sp6, sp7 = scorefunction(wordindex, word_embeddings)
            print('epoch=%2d sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f '
                  'loss=%2.5f sEmb=%2.5f sGraph=%2.5f \r'
                  % (t, sp1, sp2, sp3, sp4, sp5, sp6, sp7,
                     loss.data[0], sEmb.data[0], sGraph.data[0]), end="")
            print('t=%2d sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f loss=%7.2f'
                  % (t, sp1, sp2, sp3, sp4, sp5, sp6, sp7, loss.data[0]))
        # Write the trained meta-embeddings to disk, one word per line.
        fo = open('Trans_multi_loss1_gamma{}'.format(gamma), 'w')
        for k in range(len(word_embeddings[:-1])):
            emb = word_embeddings[k]
            emb = [str(i) for i in emb]
            fo.write(id2word[k] + ' ' + ' '.join(emb) + '\n')
        fo.close()
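# Usage sketch (assumption, not part of the original snippet): ComEmb expects a pre-trained
# embeddings file, a network/triple file, lists describing the networks and relations, the number
# of negatives per positive, and a test-model flag. Every value below is a hypothetical placeholder;
# the real argument formats depend on the Options implementation.
if __name__ == '__main__':
    com = ComEmb('embeds.txt', 'net.txt', ['net_a', 'net_b'], ['rel_a', 'rel_b'],
                 n_negative=5, testmodel=False)
    com.train(gamma=1, epoch_num=400, batch_size=32, lr=0.01)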
class word2vec:
    def __init__(self, inputfile, pre_trained_vocab_reverse={}, pre_trained_vocab={},
                 vocabulary_size=300000, embedding_dim=200, epoch_num=5, batch_size=16,
                 windows_size=5, neg_sample_num=10):
        self.op = Options(inputfile, pre_trained_vocab_reverse, pre_trained_vocab, vocabulary_size)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = len(self.op.vocab_words)
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    def train(self, pre_trained_model):
        model = skipgram(self.vocabulary_size, self.embedding_dim, pre_trained_model)
        if torch.cuda.is_available():
            model.cuda()
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        loss_history = list()
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0
            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size,
                                                             self.neg_sample_num)
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))
                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()
                if batch_num % 10 == 0:
                    loss_history.append(loss.data[0])
                if batch_num % 2000 == 0:
                    end = time.time()
                    word_embeddings = model.input_embeddings()
                    # sp1, sp2 = scorefunction(word_embeddings)
                    print('epoch,batch=%2d %5d: pair/sec = %4.2f loss=%4.3f\r'
                          % (epoch, batch_num,
                             (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.data[0]), end="")
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print()
            # __location__ is assumed to be defined at module level (e.g. the script directory).
            torch.save(model.state_dict(),
                       __location__ + '/skipgram.epoch{}.batch{}'.format(epoch, batch_num))
        plt.plot(loss_history[::100])
        plt.ylabel('loss (stat.ML)')
        plt.show()
        print("Optimization Finished!")
class word2vec:
    def __init__(self, inputfile, vocabulary_size=100000, embedding_dim=100, epoch_num=10,
                 batch_size=32, windows_size=4, neg_sample_num=7):
        self.op = Options(inputfile, vocabulary_size)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    def train(self):
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            model.cuda()
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0
            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size,
                                                             self.neg_sample_num)
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))
                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()
                if batch_num % 30000 == 0:
                    torch.save(model.state_dict(),
                               './tmp/skipgram.epoch{}.batch{}'.format(epoch, batch_num))
                if batch_num % 1000 == 0:
                    end = time.time()
                    # word_embeddings = model.input_embeddings()
                    # sp1 and sp2 are based on distinct words
                    # sp1, sp2 = scorefunction(word_embeddings)
                    # loss.data[0] changed to loss.data
                    # print('epoch,batch=%2d %5d: sp=%1.3f %1.3f pair/sec = %4.2f loss=%4.3f\r'
                    #       % (epoch, batch_num, sp1, sp2,
                    #          (batch_num - batch_new) * self.batch_size / (end - start),
                    #          loss.data), end="")
                    print('epoch,batch=%2d %5d: pair/sec = %4.2f loss=%4.3f\r'
                          % (epoch, batch_num,
                             (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.data), end="")
                    batch_new = batch_num
                    start = time.time()
                    print()
                batch_num = batch_num + 1
            # save embeddings at the end of each epoch, then ring the terminal bell
            print('\a')
            model.save_embedding(os.path.join("data", "embed_epoch_" + str(epoch) + ".vec"),
                                 self.op.dic_idx2word)
        print()
        print("Optimization Finished!")
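# Usage sketch (assumption): this plain variant only needs the corpus path; it checkpoints to
# ./tmp/ every 30000 batches and writes data/embed_epoch_<n>.vec after each epoch. The path is a
# hypothetical placeholder.
if __name__ == '__main__':
    w2v = word2vec('data/corpus.txt', vocabulary_size=100000, embedding_dim=100)
    w2v.train()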
class word2vec:
    def __init__(self, inputfile, vocabulary_size=100000, embedding_dim=200, epoch_num=64,
                 batch_size=256, windows_size=5, neg_sample_num=10):
        self.op = Options(inputfile, vocabulary_size)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    # pylint: disable=missing-docstring
    # Function to draw a visualization of distances between embeddings.
    def plot_with_labels(self, low_dim_embs, labels, filename):
        assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        plt.savefig(filename)
        plt.show()

    def train(self):
        # cudnn.benchmark = True
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            print("using cuda")
            model.cuda()
        else:
            print("not using cuda")
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0
            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size,
                                                             self.neg_sample_num)
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))
                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()
                if batch_num % 30000 == 0:
                    torch.save(model.state_dict(),
                               './tmp/skipgram.epoch{}.batch{}'.format(epoch, batch_num))
                if batch_num % 2000 == 0:
                    end = time.time()
                    word_embeddings = model.input_embeddings()
                    # sp1, sp2 = scorefunction(word_embeddings)
                    print('epoch,batch=%2d %5d: pair/sec = %4.2f loss=%4.3f\r'
                          % (epoch, batch_num,
                             (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.data), end="")
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print()
        # Reduce the input embeddings to 2-D (PCA, then t-SNE) and plot the most frequent words.
        tsne = TSNE(perplexity=30, n_components=2, init='random', n_iter=5000)
        embeds = model.u_embeddings.weight.data
        labels = []
        tokens = []
        max_size = 1000
        for idx in range(min(len(embeds), len(self.op.vocab_words), max_size)):
            tokens.append(embeds[idx].cpu().numpy())
            labels.append(self.op.vocab_words[idx])
        pca = PCA(n_components=50)
        pca_result = pca.fit_transform(tokens)
        low_dim_embs = tsne.fit_transform(pca_result)
        self.plot_with_labels(low_dim_embs, labels, 'tsne.png')
        print("Optimization finished!")
class word2vec:
    def __init__(self, inputfile, vocabulary_size=1200000, embedding_dim=300, epoch_num=20,
                 batch_size=128, windows_size=5, neg_sample_num=5):
        self.op = Options(inputfile, vocabulary_size)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    def train(self):
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            print("CUDA available")
            model.cuda()
        else:
            print("CUDA NOT available")
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0
            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size,
                                                             self.neg_sample_num)
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))
                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()
                # if batch_num % 30000 == 0:
                #     torch.save(model.state_dict(),
                #                './tmp/skipgram.epoch{}.batch{}'.format(epoch, batch_num))
                if batch_num % 2000 == 0:
                    end = time.time()
                    word_embeddings = model.input_embeddings()
                    sp1, sp2, num = scorefunction(word_embeddings)
                    print('epoch,batch=%2d %5d: sp=%1.3f %1.3f %3d pair/sec = %4.2f loss=%4.3f\r'
                          % (epoch, batch_num, sp1, sp2, num,
                             (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.data[0]), end="")
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print()
        print("Optimization Finished!")
        model.save_embedding('embed_en_120w_128_ch.txt', self.op.vocab_words)
class word2vec:
    def __init__(self, inputfile, val_fn, vocabulary_size=4000, embedding_dim=100, epoch_num=2,
                 batch_size=16, windows_size=5, neg_sample_num=10):
        logger = logging.getLogger()
        logger.info("Load train data")
        self.op = Options(inputfile, vocabulary_size)
        logger.info("Load test data")
        self.val_op = Options(val_fn, vocabulary_size, dictionary=self.op.dictionary)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    def train(self):
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            model.cuda()
        # return model
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            epoch_start = time.time()
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0
            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size,
                                                             self.neg_sample_num, verbose=False)
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))
                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()
                if batch_num % 2000 == 0:
                    end = time.time()
                    with torch.no_grad():
                        total_val_loss = 0.
                        n_val_batch = 0
                        self.val_op.process = True
                        while self.val_op.process:
                            pos_u, pos_v, neg_v = self.val_op.generate_batch(self.windows_size,
                                                                             self.batch_size,
                                                                             self.neg_sample_num)
                            pos_u = Variable(torch.LongTensor(pos_u))
                            pos_v = Variable(torch.LongTensor(pos_v))
                            neg_v = Variable(torch.LongTensor(neg_v))
                            if torch.cuda.is_available():
                                pos_u = pos_u.cuda()
                                pos_v = pos_v.cuda()
                                neg_v = neg_v.cuda()
                            val_loss = model(pos_u, pos_v, neg_v, self.batch_size)
                            total_val_loss += val_loss.item()
                            n_val_batch += 1
                    word_embeddings = model.input_embeddings()
                    print('epoch,batch=%2d %5d: pair/sec = %4.2f loss=%4.3f val_loss=%4.3f\r'
                          % (epoch, batch_num,
                             (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.item(), total_val_loss / n_val_batch))
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print("epoch stat, time: %.2f, batch_num: %d" % (time.time() - epoch_start, batch_num))
            torch.save(model.state_dict(), './tmp/skipgram.epoch{}'.format(epoch))
        print("Optimization Finished!")
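# Usage sketch (assumption): the validation variant takes a second, held-out file so that val_loss
# can be reported every 2000 batches. Both paths are hypothetical placeholders.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    w2v = word2vec('train.txt', 'valid.txt', vocabulary_size=4000, epoch_num=2)
    w2v.train()  # saves ./tmp/skipgram.epoch<n> at the end of every epoch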