import torch
import torch.nn as nn

# Transformer, ScheduledOptim, and constants are project-local modules
# defined elsewhere in this repo


def init_training(args):
    """Initialize the training process: vocab, model, loss, and optimizer."""
    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    transformer = Transformer(args, vocab)

    # optionally resume from a fine-tuning checkpoint
    if args.finetune:
        print("[Finetune] %s" % args.finetune_model_path)
        transformer.load_state_dict(torch.load(args.finetune_model_path))

    # vocab_mask zeroes out the padding token so it contributes no loss
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt[constants.PAD_WORD]] = 0

    # loss object; 'reduction' replaces the deprecated size_average=False
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask, reduction='sum')

    if args.cuda:
        transformer = transformer.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    if args.optimizer == "Warmup_Adam":
        optimizer = ScheduledOptim(
            torch.optim.Adam(transformer.get_trainable_parameters(),
                             betas=(0.9, 0.98), eps=1e-09),
            args.d_model, args.n_warmup_steps)
    elif args.optimizer == "Adam":
        optimizer = torch.optim.Adam(
            params=transformer.get_trainable_parameters(),
            lr=args.lr, betas=(0.9, 0.98), eps=1e-8)
    elif args.optimizer == "SGD":
        optimizer = torch.optim.SGD(
            params=transformer.get_trainable_parameters(), lr=args.lr)
    else:
        raise ValueError("unknown optimizer: %s" % args.optimizer)

    # multi GPUs
    if torch.cuda.device_count() > 1:
        print("[Multi GPU] using", torch.cuda.device_count(), "GPUs\n")
        transformer = nn.DataParallel(transformer)

    return vocab, transformer, optimizer, cross_entropy_loss
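The "Warmup_Adam" branch relies on a ScheduledOptim wrapper that is defined elsewhere in the project. A minimal sketch of such a wrapper, assuming the inverse-square-root warmup schedule from the Transformer paper, lr = d_model^-0.5 * min(step^-0.5, step * n_warmup_steps^-1.5); the constructor signature mirrors the call above, but the implementation details are an assumption:

import torch


class ScheduledOptim:
    """Sketch of an optimizer wrapper with warmup (assumed schedule:
    lr grows linearly for n_warmup_steps, then decays as step^-0.5)."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step(self):
        # update the learning rate, then take a regular optimizer step
        self.n_steps += 1
        lr = self.d_model ** -0.5 * min(
            self.n_steps ** -0.5,
            self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()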
# project-local names: DataLoader, Transformer, CrossEntropy, and const
# are defined elsewhere in this repo
training_data = DataLoader(corpus["train"]["src_texts"],
                           corpus["train"]["src_turn"],
                           corpus["train"]["tgt_indexs"],
                           corpus["train"]["tgt_texts"],
                           batch_size=args.batch_size, cuda=use_cuda)
validation_data = DataLoader(corpus["valid"]["src_texts"],
                             corpus["valid"]["src_turn"],
                             corpus["valid"]["tgt_indexs"],
                             corpus["valid"]["tgt_texts"],
                             batch_size=args.batch_size, cuda=use_cuda)

model = Transformer(args)
criterion = CrossEntropy()
optimizer = torch.optim.Adam(model.get_trainable_parameters(),
                             lr=args.learning_rate)

if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()


def get_performance(crit, distributes, gold):
    """Return the batch loss plus token-level correct / total counts."""
    loss = crit(distributes, gold)
    # argmax over the vocabulary dimension gives the predicted tokens
    _, predict = distributes.max(dim=-1)
    # count matches only at non-padding positions
    n_correct = predict.eq(gold)
    n_correct = n_correct.masked_select(gold.ne(const.PAD)).sum()
    n_gold = gold.ne(const.PAD).sum()
    return loss, n_correct, n_gold
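The masking in get_performance is the key detail: padding positions must be excluded from the accuracy count, or short sequences in a padded batch would inflate the score. A tiny standalone illustration with toy tensors (assuming a padding index of 0, standing in for const.PAD):

import torch

PAD = 0  # assumed padding index, standing in for const.PAD

gold = torch.tensor([5, 2, PAD, PAD])      # two real tokens, two pads
predict = torch.tensor([5, 3, PAD, 7])     # argmax of the distributions

correct = predict.eq(gold)                 # [True, False, True, False]
# masked_select drops the padding positions before counting
n_correct = correct.masked_select(gold.ne(PAD)).sum()   # 1 (pads excluded)
n_gold = gold.ne(PAD).sum()                             # 2 real tokens
print(n_correct.item() / n_gold.item())                 # 0.5 token accuracy

Note that without the mask, the pad-vs-pad match at position 2 would be counted as a correct prediction, reporting 2/4 instead of 1/2.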