def do_infer_sent(args):
    ### check that preprocessing artifacts and a trained model exist
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()
    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    dataset = Dataset(args, token, vocab, 'infer_sent', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        for batch in dataset:
            ### one embedding per sentence in the batch
            snts = model.SentEmbed(batch[0], batch[1], 'iEmb').cpu().detach().numpy().tolist()
            for i in range(len(snts)):
                sentence = ["{:.6f}".format(w) for w in snts[i]]
                ### output: 1-based sentence index followed by its embedding vector
                print('{}\t{}'.format(batch[2][i] + 1, ' '.join(sentence)))
def do_infer_word(args):
    ### check that preprocessing artifacts and a trained model exist
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()
    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    if args.sim == 'cos':
        distance = nn.CosineSimilarity(dim=1, eps=1e-6)
    elif args.sim == 'pairwise':
        distance = nn.PairwiseDistance(eps=1e-6)
    else:
        logging.error('bad -sim option {}'.format(args.sim))
        sys.exit()

    dataset = Dataset(args, token, vocab, 'infer_word', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        ### embeddings of the entire vocabulary
        voc_i = [i for i in range(0, len(vocab))]
        voc_e = model.Embed(voc_i, 'iEmb')
        for batch in dataset:
            # batch[0] batch_wrd, batch[1] batch_isnt, batch[2] batch_iwrd
            wrd_i = batch[0]
            wrd_e = model.Embed(wrd_i, 'iEmb')
            for i in range(len(wrd_i)):  ### words to find their closest
                ind_snt = batch[1][i]
                ind_wrd = batch[2][i]
                wrd = vocab[wrd_i[i]]
                out = []
                out.append("{}:{}:{}".format(ind_snt, ind_wrd, wrd))
                ### score the query word against the whole vocabulary and rank by decreasing score
                dist_wrd_voc = distance(wrd_e[i].unsqueeze(0), voc_e)
                mininds = torch.argsort(dist_wrd_voc, dim=0, descending=True)
                for k in range(1, len(mininds)):
                    ind = mininds[k].item()
                    if wrd_i[i] != ind:  ### skip the query word itself
                        dis = dist_wrd_voc[ind].item()
                        wrd = vocab[ind]
                        out.append("{:.6f}:{}".format(dis, wrd))
                        if len(out) - 1 == args.k:
                            break
                print('\t'.join(out))
def do_train(args):
    ### check that preprocessing artifacts exist
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    if os.path.exists(args.name + '.param'):
        args.embedding_size, args.pooling = read_params(args)
    else:
        write_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    if args.cuda:
        model.cuda()
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, weight_decay=0.01, amsgrad=False)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    dataset = Dataset(args, token, vocab, args.method)

    ### training loop: iterate over epochs until max_epochs is reached
    n_epochs = 0
    losses = []
    while True:
        n_epochs += 1
        for batch in dataset:
            model.train()
            if args.method == 'skipgram':
                loss = model.forward_skipgram(batch)
            elif args.method == 'cbow':
                loss = model.forward_cbow(batch)
            elif args.method == 'sbow':
                loss = model.forward_sbow(batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            n_steps += 1
            losses.append(loss.data.cpu().detach().numpy())
            if n_steps % args.report_every_n_steps == 0:
                accum_loss = np.mean(losses)
                logging.info('{} n_epoch={} n_steps={} Loss={:.6f}'.format(args.method, n_epochs, n_steps, accum_loss))
                losses = []
            if n_steps % args.save_every_n_steps == 0:
                save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
        if n_epochs >= args.max_epochs:
            logging.info('Stop (max epochs reached)')
            break
    ### final checkpoint after training ends
    save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)