def train(args):
    tf.set_random_seed(19)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = SkipGram(sess=sess, **args['dataset'], **args['model'], **args)
        model.train()
def train_skipgram():
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocab_size, embed_size)
    print(model)
    print('vocab_size:', vocab_size)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    skipgram_train_data = create_skipgram_dataset(text)
    model.train()
    for epoch in range(n_epoch):
        total_loss = 0.0
        for in_w, out_w, target in skipgram_train_data:
            in_w_var = Variable(torch.LongTensor([w2i[in_w]]))
            out_w_var = Variable(torch.LongTensor([w2i[out_w]]))
            model.zero_grad()
            log_probs = model(in_w_var, out_w_var)
            loss = loss_fn(log_probs[0], Variable(torch.Tensor([target])))
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
        losses.append(total_loss)
    return model, losses
def main(args):
    LongTensor = torch.cuda.LongTensor if args.gpu else torch.LongTensor
    data = get_pickle('assets/dataset.pkl')
    i2s = get_pickle('assets/i2s.pkl')
    dataset = skipDataset(data)
    model = SkipGram(len(i2s), 300)
    if args.gpu:
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    epoch_losses = [np.inf, np.inf, np.inf]
    total_n = len(dataset)
    tmplt = "E:{:2d} - i:{:5d}({:4.2f}%) - L:{:5.5f}"
    for epoch in range(args.epoch):
        # Training pass
        dataloader = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True)
        model.train()
        losses = []
        for i, batch in enumerate(dataloader):
            center, target = batch
            center = LongTensor(center)
            target = LongTensor(target)
            loss = model(center, target)
            loss.backward()
            optimizer.step()
            model.zero_grad()
            losses.append(loss.item())
            if i % 100 == 0:
                ml = np.mean(losses)
                print(tmplt.format(epoch, i, i * args.bs / total_n * 100, ml))
                losses = []
        # Evaluation pass (use the device-aware LongTensor so GPU models get GPU inputs)
        model.eval()
        dataloader = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=True)
        losses = []
        for i, batch in enumerate(dataloader):
            center, target = batch
            center = LongTensor(center)
            target = LongTensor(target)
            loss = model(center, target)
            losses.append(loss.item())
        epoch_losses.append(np.mean(losses))
        print('Epoch loss {}'.format(epoch_losses[-1]))
        # Early stopping: stop once the loss is worse than it was three epochs ago,
        # otherwise checkpoint the model.
        if epoch_losses[-1] > epoch_losses[-4]:
            break
        else:
            filename = 'assets/model/model_skip.torch'
            state = dict(state_dict=model.state_dict(), loss=epoch_losses, args=args)
            torch.save(state, filename)
def train(self):
    if self.model_name == 'SkipGram':
        model = SkipGram(self.vocabulary_size, self.embedding_dim)
    elif self.model_name == 'CBOW':
        return
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    for epoch in range(self.epoch):
        start = time.time()
        self.op.process = True
        batch_num = 0
        batch_new = 0
        while self.op.process:
            pos_u, pos_v, neg_v = self.op.generate_batch(
                self.windows_size, self.batch_size, self.neg_sample_size)
            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if torch.cuda.is_available():
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()
            optimizer.zero_grad()
            loss = model(pos_u, pos_v, neg_v, self.batch_size)
            loss.backward()
            optimizer.step()
            if batch_num % 3000 == 0:
                end = time.time()
                print('epoch,batch = %2d %5d: pair/sec = %4.2f loss = %4.3f' %
                      (epoch, batch_num,
                       (batch_num - batch_new) * self.batch_size / (end - start),
                       loss.data[0]))
                batch_new = batch_num
                start = time.time()
            batch_num += 1
    model.save_embeddings(self.op.idx2word, 'word_embedding.txt', torch.cuda.is_available())
def main():
    args = docopt(__doc__)
    embedding_dim = int(args['--dim'])
    max_context = int(args['--max-context'])
    neg_sample_factor = int(args['--neg-sample-factor'])
    batch_size = int(args['--batch'])
    lr = float(args['--lr'])
    epochs = int(args['--epochs'])
    np.random.seed(int(args['--seed']))
    torch.manual_seed(int(args['--seed']))
    torch.cuda.manual_seed_all(int(args['--seed']))
    device = torch.device(int(args['--device']))
    print(f"{device} will be used")
    num_workers = int(args['--num-workers'])
    fpath = args['--file']
    backup_interval = int(args['--backup-interval'])
    dname = args['--dirname']
    dset = FixedLengthContextDataset(fpath, max_context, neg_sample_factor)
    vocabulary_size = dset.num_authors
    # Symmetric vectors are used to compute cosine similarity
    if args['symmetric']:
        model = SymmetricEmbedding(vocabulary_size, embedding_dim)
    # Word2Vec skip-gram: asymmetric vectors are used to compute cosine similarity
    elif args['skipgram']:
        model = SkipGram(vocabulary_size, embedding_dim)
    if dname is None:
        tmp = 'symmetric' if args['symmetric'] else 'skipgram'
        dname = get_dirname(f'embedding_{tmp}')
    else:
        os.makedirs(dname)
    if torch.cuda.is_available():
        model = model.to(device)
    loader = DataLoader(dset, batch_size, num_workers=num_workers)
    train(model, loader, dname, epochs, lr, backup_interval, device)
def train(self, report=True):
    model = SkipGram(self.vocabulary_size, self.embedding_dim)
    loss_list = list()
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    for epoch in range(self.epoch):
        start = time.time()
        self.data.process = True
        batch_num = 0
        batch_new = 0
        for data_word, data_sentence in self.data_loader():
            optimizer.zero_grad()
            loss = model(data_word) / self.batch_size
            # loss = model(pos_u, pos_v, neg_v, self.batch_size, target, contex, labels)
            # Store the detached scalar, not the graph-carrying tensor.
            loss_list.append(loss.item())
            loss.backward()
            optimizer.step()
            if report and batch_num % 7 == 0:  # 3000
                end = time.time()
                print('epoch,batch = %2d %5d: batch_size = %5d loss = %4.3f' %
                      (epoch, batch_num, self.batch_size, loss.item()))
                batch_new = batch_num
                start = time.time()
            batch_num += 1
    self.showPlot(loss_list, 'Losses')
    model.save_embeddings(self.data.idx2word, 'word_embedding.txt')
def main():
    train_set = PTBDataSet()
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    device = torch.device('cuda')
    model = SkipGram(train_set.get_token_num(), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (center, context_negative, mask, label) in enumerate(train_loader):
            center, context_negative, mask, label = (center.to(device), context_negative.to(device),
                                                     mask.to(device), label.to(device))
            criteon = nn.BCEWithLogitsLoss(weight=mask.double(), reduction='none').to(device)
            # pred: [batch_size, max_len]
            pred = model(center, context_negative)
            loss = torch.sum(torch.sum(criteon(pred.double(), label.double()), dim=1)
                             / torch.sum(mask.double(), dim=1))
            total_loss += loss.item()
            if batch_idx % 200 == 0:
                print(f'epoch {epoch+1} batch {batch_idx} loss {loss.item()/pred.shape[0]}')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f'-->epoch {epoch+1} average loss {total_loss/len(train_set)}')
    model.get_topk_similar_tokens('chip', train_set.index_to_token, train_set.token_to_index, device, show_top_k)
print("Load corpus from pickle file...") with open('nltk_reuters_corpus.pkl', 'rb') as f: corpus = pickle.load(f) print("Building model...") vocab_size, vocab = get_count_distinct(corpus) word2idx, idx2word = get_vocab_dicts(vocab_size, vocab) window_size = 2 # declare a Tensorflow graph graph = tf.Graph() with graph.as_default() as g: # create an instance of SkipGram model model = SkipGram(vocab_size=vocab_size, embedding_dim=128, window_size=window_size, batch_size=16, graph=g) # configure GPU options sess_config = tf.ConfigProto(allow_soft_placement=True) sess_config.gpu_options.allow_growth = True # start the Tensorflow session with tf.Session(config=sess_config) as sess: # writer and saver objects writer = tf.summary.FileWriter("./logs") sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # check for already present checkpoint files
s2i = get_pickle('assets/s2i.pkl')
i2s = get_pickle('assets/i2s.pkl')
holdings = pd.read_csv('assets/holdings.csv', index_col=6)

# Export GloVe (correlation) embeddings
glove_cor_checkpoint = torch.load('assets/model/model_glove_cor.torch')
model_glove = GloVeCor(len(s2i), 300)
model_glove.load_state_dict(glove_cor_checkpoint['state_dict'])
weights = model_glove.embeddings.weight.detach()
np.savetxt('embeddings/glove_cor_tensors.tsv', weights, delimiter='\t')

# Export GloVe (covariance) embeddings
glove_cov_checkpoint = torch.load('assets/model/model_glove_cov.torch')
model_glove = GloVeCov(len(s2i), 300)
model_glove.load_state_dict(glove_cov_checkpoint['state_dict'])
weights = model_glove.embeddings.weight.detach()
np.savetxt('embeddings/glove_cov_tensors.tsv', weights, delimiter='\t')

# Export SkipGram embeddings
skip_checkpoint = torch.load('assets/model/model_skip.torch')
model_skip = SkipGram(len(s2i), 300)
model_skip.load_state_dict(skip_checkpoint['state_dict'])
weights = model_skip.embeddings.weight.detach()
np.savetxt('embeddings/skip_tensors.tsv', weights, delimiter='\t')

# Write the metadata files that accompany the embedding TSVs
selector = [i2s[e] for e in range(len(weights))]
cols = ['Name', 'Sector', 'Industry Group', 'Country', 'Currency']
metadata = holdings.loc[selector, cols]
metadata.to_csv('assets/metadata.tsv', sep='\t')

cols = ['Name', 'Currency']
metadata = holdings.loc[selector, cols]
metadata.to_csv('embeddings/metadata.tsv', sep='\t', index=False)
args = parser.parse_args()
tokenize_data = loadData()
int_text, word2idx, idx2word, freq, vocab = prepareData(tokenize_data, args.min_freq)
if args.mode == "train":
    vocab_size = sum(freq[k] for k in freq)
    subsampled_words = subsampling(freq, args.sampling_threshold, vocab, vocab_size, word2idx)
    neg_sample = negativeSampling(freq)
    # print(neg_sample.shape)
    device = 'cpu'
    model = SkipGram(len(word2idx), args.embed_size, neg_sample).to(device)
    optimizer = optim.Adam(model.parameters(), args.lr)
    epoch = args.epochs
    steps = 0
    for i in range(epoch):
        for input_words, target_words in loadBatches(subsampled_words, args.batch_size, args.window_size):
            steps = steps + 1
            inputs = torch.LongTensor(input_words)
            targets = torch.LongTensor(target_words)
            # inputs, targets = inputs.to(device), targets.to(device)
            loss = model(inputs, targets, inputs.shape[0], 2)
def create_model(self): print("Initialize model") vocab_size = len(self.word2idx) self.model = SkipGram(vocab_size=vocab_size, emb_dim=self.embed_dim).to(self.device) self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
with open(f'{FILENAME}.vocab') as f:
    vocab = sorted(f.readline().split())
VOCAB_SIZE = len(vocab)

# String-to-int conversion
words_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Create dataset
train_dataset = WikiDataset(f'{FILENAME}.parsed', 1, words_to_idx)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

# Get random (negative) sampler
sampler = NegativeSampler(f'{FILENAME}.parsed.count', words_to_idx)

for emb_dim in EMBEDDING_DIMS:
    model = SkipGram(VOCAB_SIZE, emb_dim)
    model.to(DEVICE)
    model.device = DEVICE
    optimizer = torch.optim.SparseAdam(model.parameters())
    train(model, train_dataloader, optimizer, sampler, VOCAB_SIZE,
          epochs=EPOCHS, save_path=f'{SAVE_FOLDER}{emb_dim}_')
def __init__(self, mode, vocab_dim, embed_dim, sparse):
    self.mode = mode
    if self.mode == 'cbow':
        self.model = CBOW(vocab_dim, embed_dim, sparse)
    elif self.mode == 'skip-gram':
        self.model = SkipGram(vocab_dim, embed_dim, sparse)
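# None of the snippets above defines the SkipGram module itself. Below is a minimal
# sketch of one common shape it takes: two embedding tables and a negative-sampling
# loss, with a forward signature matching the model(pos_u, pos_v, neg_v, batch_size)
# call pattern seen above. The class layout, initialization, and save_embeddings
# helper are assumptions for illustration, not any one repository's implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Separate tables for center ("input") and context ("output") words.
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)
        init_range = 0.5 / embedding_dim
        self.in_embed.weight.data.uniform_(-init_range, init_range)
        self.out_embed.weight.data.zero_()

    def forward(self, pos_u, pos_v, neg_v, batch_size):
        # pos_u: [batch] center-word ids, pos_v: [batch] context-word ids,
        # neg_v: [batch, k] negative-sample ids.
        u = self.in_embed(pos_u)                               # [batch, dim]
        v = self.out_embed(pos_v)                              # [batch, dim]
        neg = self.out_embed(neg_v)                            # [batch, k, dim]
        pos_score = torch.sum(u * v, dim=1)                    # [batch]
        neg_score = torch.bmm(neg, u.unsqueeze(2)).squeeze(2)  # [batch, k]
        # Negative-sampling objective: maximize log-sigmoid of true pairs and
        # of the negated score of sampled negatives.
        loss = -(F.logsigmoid(pos_score).sum() + F.logsigmoid(-neg_score).sum())
        return loss / batch_size

    def save_embeddings(self, idx2word, path, use_cuda=False):
        # Write one "<word> <vector>" line per vocabulary entry; .cpu() makes the
        # use_cuda flag (kept for compatibility with the calls above) a no-op.
        weights = self.in_embed.weight.detach().cpu().numpy()
        with open(path, 'w') as f:
            for idx, vec in enumerate(weights):
                word = idx2word[idx]
                f.write(word + ' ' + ' '.join(f'{x:.6f}' for x in vec) + '\n')


# Example usage with hypothetical batch tensors:
#   model = SkipGram(vocab_size=10_000, embedding_dim=300)
#   loss = model(pos_u, pos_v, neg_v, batch_size=pos_u.size(0))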