import argparse

import torch
import torch.nn as nn
import torch.optim as optim

# Project-level pieces referenced below -- Constants, Transformer,
# ScheduledOptim, DataLoader and train() -- are assumed to be provided
# elsewhere in this codebase; their import paths are not shown here.


# Variant with optional context inputs (-use_ctx) and multi-GPU support.
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=100)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str,
                        choices=['all', 'best'], default='all')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-multi_gpu', action='store_true')
    parser.add_argument('-use_ctx', action='store_true')
    parser.add_argument(
        '-external_validation_script', type=str, default=None,
        metavar='PATH', nargs='*',
        help="location of validation script (to run your favorite metric "
             "for validation) (default: %(default)s)")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    # The word-embedding size is tied to the model size.
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        ctx_insts=(data['train']['ctx'] if opt.use_ctx else None),
        batch_size=opt.batch_size,
        cuda=opt.cuda,
        is_train=True,
        sort_by_length=True)

    validation_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'],
        tgt_insts=data['valid']['tgt'],
        ctx_insts=(data['valid']['ctx'] if opt.use_ctx else None),
        batch_size=opt.batch_size,
        shuffle=False,
        cuda=opt.cuda,
        is_train=False,
        sort_by_length=True)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and \
            training_data.src_word2idx != training_data.tgt_word2idx:
        print('[Warning]', 'The src/tgt word2idx tables are different '
              'but embedding weight sharing was requested.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout,
        use_ctx=opt.use_ctx)
    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        # size_average=False is the legacy spelling of reduction='sum'.
        #return nn.CrossEntropyLoss(weight, size_average=False)
        return nn.NLLLoss(weight, size_average=False)

    crit = get_criterion(training_data.tgt_vocab_size)
    # NLLLoss expects log-probabilities; normalize over the vocabulary axis.
    logsoftmax = nn.LogSoftmax(dim=-1)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()
        logsoftmax = logsoftmax.cuda()

    if opt.multi_gpu:
        transformer = nn.DataParallel(transformer)
        crit = nn.DataParallel(crit)
        logsoftmax = nn.DataParallel(logsoftmax)

    train(transformer, training_data, validation_data, crit, logsoftmax,
          optimizer, opt)
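# For reference, a minimal sketch of what a ScheduledOptim-style wrapper
# computes. This follows the warmup schedule from "Attention Is All You
# Need" and is an assumption about the wrapper's behavior, not this
# project's actual implementation:
#
#     lr = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)
#
# i.e. the learning rate grows linearly for n_warmup_steps steps, then
# decays with the inverse square root of the step number.
class NoamScheduleSketch:
    ''' Illustrative wrapper; class and method names are hypothetical. '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self.optimizer = optimizer
        self.init_lr = d_model ** -0.5
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        # Update the learning rate, then take an optimizer step.
        self.n_steps += 1
        lr = self.init_lr * min(self.n_steps ** -0.5,
                                self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()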
# Variant without context inputs or multi-GPU support; the criterion is
# CrossEntropyLoss, so no separate log-softmax is needed.
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str,
                        choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        cuda=opt.cuda)

    validation_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'],
        tgt_insts=data['valid']['tgt'],
        batch_size=opt.batch_size,
        shuffle=False,
        test=True,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and \
            training_data.src_word2idx != training_data.tgt_word2idx:
        print('[Warning]', 'The src/tgt word2idx tables are different '
              'but embedding weight sharing was requested.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)
    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(training_data.tgt_vocab_size)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()

    print("===>TRAIN\n")
    train(transformer, training_data, validation_data, crit, optimizer, opt)
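# Quick sanity check of the PAD-masking trick in get_criterion above:
# giving the PAD class zero weight makes padded target positions
# contribute nothing to the summed loss. (Assumes PAD == 0 and a toy
# 4-word vocabulary; size_average=False is the legacy spelling of
# reduction='sum' in current PyTorch.)
import torch
import torch.nn as nn

PAD = 0
weight = torch.ones(4)
weight[PAD] = 0
crit = nn.CrossEntropyLoss(weight, reduction='sum')

logits = torch.randn(3, 4)        # 3 target positions, 4-word vocabulary
gold = torch.tensor([2, 3, PAD])  # the last position is padding
assert torch.allclose(crit(logits, gold), crit(logits[:2], gold[:2]))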
import configparser

# Variant for feature-based (e.g. speech) inputs, driven by a config file.
# DEVICE is used but never defined in the original snippet; a module-level
# definition along these lines is assumed:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-epoch', type=int, default=1)
    parser.add_argument('-batch_size', type=int, default=4)
    parser.add_argument('-context_width', type=int, default=1)
    parser.add_argument('-frame_rate', type=int, default=30)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=400)

    parser.add_argument('-dropout', type=float, default=0.1)

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='./exp')
    parser.add_argument('-save_mode', type=str,
                        choices=['all', 'best'], default='best')

    opt = parser.parse_args()

    cfg_path = './config/transformer.cfg'
    config = configparser.ConfigParser()
    config.read(cfg_path)

    #========= Preparing DataLoader =========#
    training_data = DataLoader(
        'train', config, DEVICE,
        batch_size=opt.batch_size,
        context_width=opt.context_width,
        frame_rate=opt.frame_rate)

    validation_data = DataLoader(
        'dev', config, DEVICE,
        batch_size=opt.batch_size,
        context_width=opt.context_width,
        frame_rate=opt.frame_rate)

    test_data = DataLoader(
        'test', config, DEVICE,
        batch_size=opt.batch_size,
        context_width=opt.context_width,
        frame_rate=opt.frame_rate)

    #========= Preparing Model =========#
    print(opt)

    input_dim = training_data.features_dim
    output_dim = training_data.vocab_size
    # Size the positional encodings to the longest sequence in any split.
    n_inputs_max_seq = max(training_data.inputs_max_seq_lengths,
                           validation_data.inputs_max_seq_lengths,
                           test_data.inputs_max_seq_lengths)
    n_outputs_max_seq = max(training_data.outputs_max_seq_lengths,
                            validation_data.outputs_max_seq_lengths,
                            test_data.outputs_max_seq_lengths)
    print('*************************')
    print('The max length of inputs is %d' % n_inputs_max_seq)
    print('The max length of targets is %d' % n_outputs_max_seq)

    transformer = Transformer(
        input_dim,
        output_dim,
        n_inputs_max_seq,
        n_outputs_max_seq,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout,
        device=DEVICE)
    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(output_dim):
        ''' With PAD token zero weight '''
        weight = torch.ones(output_dim)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(training_data.vocab_size)

    transformer = transformer.to(DEVICE)
    crit = crit.to(DEVICE)

    train(transformer, training_data, validation_data, crit, optimizer, opt)
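# Entry point, assuming each of these scripts is run directly, e.g.:
#   python train.py -batch_size 4 -context_width 1 -frame_rate 30 \
#       -save_model ./exp -save_mode best
# (the file name train.py is an assumption; the flags come from the
# argument parser above)
if __name__ == '__main__':
    main()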