def train(args):
    tf.set_random_seed(19)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = SkipGram(sess=sess, **args['dataset'], **args['model'], **args)
        model.train()

def train_skipgram():
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocab_size, embed_size)
    print(model)
    print('vocab_size:', vocab_size)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    skipgram_train_data = create_skipgram_dataset(text)
    model.train()
    for epoch in range(n_epoch):
        total_loss = 0.0
        for in_w, out_w, target in skipgram_train_data:
            in_w_var = Variable(torch.LongTensor([w2i[in_w]]))
            out_w_var = Variable(torch.LongTensor([w2i[out_w]]))
            model.zero_grad()
            log_probs = model(in_w_var, out_w_var)
            loss = loss_fn(log_probs[0], Variable(torch.Tensor([target])))
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
        losses.append(total_loss)
    return model, losses

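# The loop above only requires a SkipGram module that takes one (center, context)
# pair and returns a score comparable to a 0/1 target under MSELoss. A minimal
# sketch of such a module follows; the class body and names are assumptions for
# illustration, not code taken from the snippet above.
import torch
import torch.nn as nn

class SkipGram(nn.Module):
    """Scores a single (center, context) pair; the caller regresses it onto a 0/1 target."""
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.center_embed = nn.Embedding(vocab_size, embed_size)
        self.context_embed = nn.Embedding(vocab_size, embed_size)

    def forward(self, center, context):
        # center, context: LongTensors of shape [1] holding word indices
        u = self.center_embed(center)      # [1, embed_size]
        v = self.context_embed(context)    # [1, embed_size]
        score = (u * v).sum(dim=1)         # [1], dot product of the two vectors
        return torch.sigmoid(score)        # co-occurrence probability in [0, 1]
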
class word2vec():
    def __init__(self, mode, vocab_dim, embed_dim, sparse):
        self.mode = mode
        if self.mode == 'cbow':
            self.model = CBOW(vocab_dim, embed_dim, sparse)
        elif self.mode == 'skip-gram':
            self.model = SkipGram(vocab_dim, embed_dim, sparse)

    def train(self, training_data, num_epochs=3, learning_rate=0.025):
        # Upload Model to GPU
        device = torch.device('cuda:0')
        self.model.to(device)
        # Set Optimizer and Linear Scheduler
        optimizer = optim.SGD(self.model.parameters(), lr=learning_rate)
        scheduler_1 = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=2/3)
        scheduler_2 = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1/2)
        # Set Loss Function
        loss_function = nn.NLLLoss()
        # Train
        for epoch in range(num_epochs):
            print('Epoch {} Started...'.format(epoch + 1))
            for i, (X, y) in tqdm(enumerate(training_data)):
                if X.nelement() != 0:
                    X, y = X.to(device), y.to(device)
                    optimizer.zero_grad()
                    loss = loss_function(self.model.forward(X), y)
                    loss.backward()
                    optimizer.step()
                    if i % 50000 == 0:
                        print('Iteration : {}, Loss : {:.6f}'.format(i, loss.item()))
            if epoch == 0:
                scheduler_1.step()
            elif epoch == 1:
                scheduler_2.step()

def main():
    train_set = PTBDataSet()
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    device = torch.device('cuda')
    model = SkipGram(train_set.get_token_num(), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (center, context_negative, mask, label) in enumerate(train_loader):
            center, context_negative, mask, label = (
                center.to(device), context_negative.to(device),
                mask.to(device), label.to(device))
            criteon = nn.BCEWithLogitsLoss(weight=mask.double(), reduction='none').to(device)
            # pred: [batch_size, max_len]
            pred = model(center, context_negative)
            loss = torch.sum(
                torch.sum(criteon(pred.double(), label.double()), dim=1)
                / torch.sum(mask.double(), dim=1))
            total_loss += loss.item()
            if batch_idx % 200 == 0:
                print(f'epoch {epoch+1} batch {batch_idx} loss {loss.item()/pred.shape[0]}')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f'-->epoch {epoch+1} average loss {total_loss/len(train_set)}')
    model.get_topk_similar_tokens('chip', train_set.index_to_token, train_set.token_to_index,
                                  device, show_top_k)

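# The loop above assumes SkipGram(center, context_negative) returns one logit per
# (center word, context-or-negative word) pair with shape [batch_size, max_len],
# which BCEWithLogitsLoss then masks and reduces. A minimal sketch consistent with
# that interface; names and details are assumptions, not code from the snippet above.
import torch
import torch.nn as nn

class SkipGram(nn.Module):
    """Returns one logit per (center, context/negative) pair, shape [batch, max_len]."""
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.center_embed = nn.Embedding(vocab_size, embedding_dim)
        self.context_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context_negative):
        # center: [batch], context_negative: [batch, max_len]
        u = self.center_embed(center)                    # [batch, dim]
        v = self.context_embed(context_negative)         # [batch, max_len, dim]
        # Dot product of each center vector with every context/negative vector
        return torch.bmm(v, u.unsqueeze(2)).squeeze(2)   # [batch, max_len]
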
def train(self):
    if self.model_name == 'SkipGram':
        model = SkipGram(self.vocabulary_size, self.embedding_dim)
    elif self.model_name == 'CBOW':
        return
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    for epoch in range(self.epoch):
        start = time.time()
        self.op.process = True
        batch_num = 0
        batch_new = 0
        while self.op.process:
            pos_u, pos_v, neg_v = self.op.generate_batch(
                self.windows_size, self.batch_size, self.neg_sample_size)
            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if torch.cuda.is_available():
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()
            optimizer.zero_grad()
            loss = model(pos_u, pos_v, neg_v, self.batch_size)
            loss.backward()
            optimizer.step()
            if batch_num % 3000 == 0:
                end = time.time()
                print('epoch,batch = %2d %5d: pair/sec = %4.2f loss = %4.3f\r'
                      % (epoch, batch_num,
                         (batch_num - batch_new) * self.batch_size / (end - start),
                         loss.item()),
                      end="\n")
                batch_new = batch_num
                start = time.time()
            batch_num += 1
    model.save_embeddings(self.op.idx2word, 'word_embedding.txt', torch.cuda.is_available())

def main():
    args = docopt(__doc__)
    embedding_dim = int(args['--dim'])
    max_context = int(args['--max-context'])
    neg_sample_factor = int(args['--neg-sample-factor'])
    batch_size = int(args['--batch'])
    lr = float(args['--lr'])
    epochs = int(args['--epochs'])
    np.random.seed(int(args['--seed']))
    torch.manual_seed(int(args['--seed']))
    torch.cuda.manual_seed_all(int(args['--seed']))
    device = torch.device(int(args['--device']))
    print(f"{device} will be used")
    num_workers = int(args['--num-workers'])
    fpath = args['--file']
    backup_interval = int(args['--backup-interval'])
    dname = args['--dirname']
    dset = FixedLengthContextDataset(fpath, max_context, neg_sample_factor)
    vocabulary_size = dset.num_authors
    # Symmetric vectors are used to compute cosine similarity
    if args['symmetric']:
        model = SymmetricEmbedding(vocabulary_size, embedding_dim)
    # Word2Vec skip-gram: asymmetric vectors are used to compute cosine similarity
    elif args['skipgram']:
        model = SkipGram(vocabulary_size, embedding_dim)
    if dname is None:
        tmp = 'symmetric' if args['symmetric'] else 'skipgram'
        dname = get_dirname(f'embedding_{tmp}')
    else:
        os.makedirs(dname)
    if torch.cuda.is_available():
        model = model.to(device)
    loader = DataLoader(dset, batch_size, num_workers=num_workers)
    train(model, loader, dname, epochs, lr, backup_interval, device)

def train(self, report=True):
    model = SkipGram(self.vocabulary_size, self.embedding_dim)
    loss_list = list()
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    for epoch in range(self.epoch):
        start = time.time()
        self.data.process = True
        batch_num = 0
        batch_new = 0
        for data_word, data_sentence in self.data_loader():
            optimizer.zero_grad()
            loss = model(data_word) / self.batch_size
            # loss = model(pos_u, pos_v, neg_v, self.batch_size, target, contex, labels)
            loss_list.append(loss.item())
            loss.backward()
            optimizer.step()
            if report and batch_num % 7 == 0:  # 3000
                end = time.time()
                print('epoch,batch = %2d %5d: batch_size = %5d loss = %4.3f\r'
                      % (epoch, batch_num, self.batch_size, loss.item()),
                      end="\n")
                batch_new = batch_num
                start = time.time()
            batch_num += 1
    self.showPlot(loss_list, 'Losses')
    model.save_embeddings(self.data.idx2word, 'word_embedding.txt')

print("Load corpus from pickle file...") with open('nltk_reuters_corpus.pkl', 'rb') as f: corpus = pickle.load(f) print("Building model...") vocab_size, vocab = get_count_distinct(corpus) word2idx, idx2word = get_vocab_dicts(vocab_size, vocab) window_size = 2 # declare a Tensorflow graph graph = tf.Graph() with graph.as_default() as g: # create an instance of SkipGram model model = SkipGram(vocab_size=vocab_size, embedding_dim=128, window_size=window_size, batch_size=16, graph=g) # configure GPU options sess_config = tf.ConfigProto(allow_soft_placement=True) sess_config.gpu_options.allow_growth = True # start the Tensorflow session with tf.Session(config=sess_config) as sess: # writer and saver objects writer = tf.summary.FileWriter("./logs") sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # check for already present checkpoint files
s2i = get_pickle('assets/s2i.pkl')
i2s = get_pickle('assets/i2s.pkl')
holdings = pd.read_csv('assets/holdings.csv', index_col=6)

glove_cor_checkpoint = torch.load('assets/model/model_glove_cor.torch')
model_glove = GloVeCor(len(s2i), 300)
model_glove.load_state_dict(glove_cor_checkpoint['state_dict'])
weights = model_glove.embeddings.weight.detach()
np.savetxt('embeddings/glove_cor_tensors.tsv', weights, delimiter='\t')

glove_cov_checkpoint = torch.load('assets/model/model_glove_cov.torch')
model_glove = GloVeCov(len(s2i), 300)
model_glove.load_state_dict(glove_cov_checkpoint['state_dict'])
weights = model_glove.embeddings.weight.detach()
np.savetxt('embeddings/glove_cov_tensors.tsv', weights, delimiter='\t')

skip_checkpoint = torch.load('assets/model/model_skip.torch')
model_skip = SkipGram(len(s2i), 300)
model_skip.load_state_dict(skip_checkpoint['state_dict'])
weights = model_skip.embeddings.weight.detach()
np.savetxt('embeddings/skip_tensors.tsv', weights, delimiter='\t')

selector = [i2s[e] for e in range(len(weights))]
cols = ['Name', 'Sector', 'Industry Group', 'Country', 'Currency']
metadata = holdings.loc[selector, cols]
metadata.to_csv('assets/metadata.tsv', sep='\t')

cols = ['Name', 'Currency']
metadata = holdings.loc[selector, cols]
metadata.to_csv('embeddings/metadata.tsv', sep='\t', index=False)

args = parser.parse_args()
tokenize_data = loadData()
int_text, word2idx, idx2word, freq, vocab = prepareData(tokenize_data, args.min_freq)

if args.mode == "train":
    vocab_size = sum([freq[k] for k in freq])
    subsampled_words = subsampling(freq, args.sampling_threshold, vocab, vocab_size, word2idx)
    neg_sample = negativeSampling(freq)
    # print(neg_sample.shape)
    device = 'cpu'
    model = SkipGram(len(word2idx), args.embed_size, neg_sample).to(device)
    optimizer = optim.Adam(model.parameters(), args.lr)
    epoch = args.epochs
    steps = 0
    for i in range(epoch):
        for input_words, target_words in loadBatches(subsampled_words, args.batch_size, args.window_size):
            steps = steps + 1
            inputs = torch.LongTensor(input_words)
            targets = torch.LongTensor(target_words)
            # inputs, targets = inputs.to(device), targets.to(device)
            loss = model.forward(inputs, targets, inputs.shape[0], 2)

def create_model(self):
    print("Initialize model")
    vocab_size = len(self.word2idx)
    self.model = SkipGram(vocab_size=vocab_size, emb_dim=self.embed_dim).to(self.device)
    self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

class Word2Vec:
    def __init__(self, lang="english", n_epoch=20, batch_size=500, embed_dim=300,
                 window_size=5, neg_sample=10, min_count=5, lr=0.01, report_every=1):
        self.lang = lang
        self.n_epoch = n_epoch
        self.batch_size = batch_size
        self.embed_dim = embed_dim
        self.window_size = window_size
        self.neg_sample = neg_sample
        self.min_count = min_count
        self.lr = lr
        self.report_every = report_every
        self.model, self.optimizer = None, None
        self.batches, self.vocab, self.word2idx, self.idx2word = [], [], [], []
        # Check if GPU available
        is_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if is_cuda else "cpu")
        # Number of CPU threads for torch
        workers = multiprocessing.cpu_count()
        torch.set_num_threads(workers)
        print("Train session using {}, processor numbers: {}".format(self.device, workers))

    def handle_data(self):
        # Get dataset in correct format
        print("Downloading the data")
        train_data, dev_data, test_data = get_data(self.lang)
        # Process data for training
        processor = DataProcess(corpus=train_data, batch_size=self.batch_size,
                                neg_sample=self.neg_sample, window_size=self.window_size,
                                min_freq=self.min_count)
        print("Processing data")
        self.batches, self.vocab, self.word2idx, self.idx2word = processor.pipeline()

    def create_model(self):
        print("Initialize model")
        vocab_size = len(self.word2idx)
        self.model = SkipGram(vocab_size=vocab_size, emb_dim=self.embed_dim).to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print('Start training')
        for epoch in range(self.n_epoch):
            total_loss = 0
            for minibatch in self.batches:
                pos_u = torch.tensor(minibatch[0], dtype=torch.long).to(self.device)
                pos_v = torch.tensor(minibatch[1], dtype=torch.long).to(self.device)
                neg_v = torch.tensor(minibatch[2], dtype=torch.long).to(self.device)
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            if ((epoch + 1) % self.report_every) == 0:
                print('epoch: %d, loss: %.4f' % (epoch + 1, total_loss))

    def save_model(self, filepath):
        print("Saved model in {}".format(filepath))
        self.model.save(filepath, self.idx2word)

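# This class calls the model as forward(pos_u, pos_v, neg_v) and expects a scalar
# negative-sampling loss back (the generate_batch-based loop earlier additionally
# passes batch_size). A minimal sketch of such a SkipGram module; the implementation
# details are assumptions, not code from any snippet in this collection.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGram(nn.Module):
    """Skip-gram with negative sampling; forward returns the batch loss directly."""
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, emb_dim)    # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, emb_dim)   # context-word vectors

    def forward(self, pos_u, pos_v, neg_v):
        # pos_u: [batch] center ids, pos_v: [batch] positive context ids,
        # neg_v: [batch, k] negative-sample ids
        u = self.in_embed(pos_u)                               # [batch, dim]
        v = self.out_embed(pos_v)                              # [batch, dim]
        neg = self.out_embed(neg_v)                            # [batch, k, dim]
        pos_score = (u * v).sum(dim=1)                         # [batch]
        neg_score = torch.bmm(neg, u.unsqueeze(2)).squeeze(2)  # [batch, k]
        # Negative-sampling objective: maximize log sigma(u.v) + sum log sigma(-u.neg)
        loss = -(F.logsigmoid(pos_score).sum() + F.logsigmoid(-neg_score).sum())
        return loss / pos_u.shape[0]
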
def main(args):
    LongTensor = torch.cuda.LongTensor if args.gpu else torch.LongTensor
    data = get_pickle('assets/dataset.pkl')
    i2s = get_pickle('assets/i2s.pkl')
    dataset = skipDataset(data)
    model = SkipGram(len(i2s), 300)
    if args.gpu:
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    losses = []
    epoch_losses = [np.inf, np.inf, np.inf]
    total_n = len(dataset)
    tmplt = "E:{:2d} - i:{:5d}({:4.2f}%) - L:{:5.5f}"
    for epoch in range(args.epoch):
        dataloader = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True)
        model.train()
        losses = []
        for i, batch in enumerate(dataloader):
            center, target = batch
            center = LongTensor(center)
            target = LongTensor(target)
            loss = model(center, target)
            loss.backward()
            optimizer.step()
            model.zero_grad()
            losses.append(loss.item())
            if i % 100 == 0:
                ml = np.mean(losses)
                t = tmplt.format(epoch, i, i * args.bs / total_n * 100, ml)
                print(t)
                losses = []
        model.eval()
        dataloader = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True)
        losses = []
        for i, batch in enumerate(dataloader):
            center, target = batch
            center = LongTensor(center)
            target = LongTensor(target)
            loss = model(center, target)
            losses.append(loss.item())
        epoch_losses.append(np.mean(losses))
        print('Epoch loss {}'.format(epoch_losses[-1]))
        if epoch_losses[-1] > epoch_losses[-4]:
            break
        else:
            filename = 'assets/model/model_skip.torch'
            state = dict(state_dict=model.state_dict(), loss=epoch_losses, args=args)
            torch.save(state, filename)

with open(f'{FILENAME}.vocab') as f:
    vocab = sorted(f.readline().split())
VOCAB_SIZE = len(vocab)

# String2int conversion
words_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Create dataset
train_dataset = WikiDataset(f'{FILENAME}.parsed', 1, words_to_idx)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                               num_workers=NUM_WORKERS)

# Get random sampler
sampler = NegativeSampler(f'{FILENAME}.parsed.count', words_to_idx)

for emb_dim in EMBEDDING_DIMS:
    model = SkipGram(VOCAB_SIZE, emb_dim)
    model.to(DEVICE)
    model.device = DEVICE
    optimizer = torch.optim.SparseAdam(model.parameters())
    train(model, train_dataloader, optimizer, sampler, VOCAB_SIZE,
          epochs=EPOCHS, save_path=f'{SAVE_FOLDER}{emb_dim}_')

def __init__(self, mode, vocab_dim, embed_dim, sparse):
    self.mode = mode
    if self.mode == 'cbow':
        self.model = CBOW(vocab_dim, embed_dim, sparse)
    elif self.mode == 'skip-gram':
        self.model = SkipGram(vocab_dim, embed_dim, sparse)