def create_model_transformer_b2a(arg, devices_list, eval=False):
    from models import Transformer

    # Pick the dataset/split/epoch to resume from, depending on train vs eval mode.
    resume_dataset = arg.eval_dataset_transformer if eval else arg.dataset
    resume_b = arg.eval_split_source_trasformer if eval else arg.split_source
    resume_a = arg.eval_split_trasformer if eval else arg.split
    resume_epoch = arg.eval_epoch_transformer if eval else arg.resume_epoch

    # boundary_num is expected to be defined at module level.
    transformer = Transformer(in_channels=boundary_num, out_channels=boundary_num)

    if resume_epoch > 0:
        load_path = (arg.resume_folder + 'transformer_' + resume_dataset + '_'
                     + resume_b + '2' + resume_a + '_' + str(resume_epoch) + '.pth')
        print('Loading Transformer from ' + load_path)
        transformer = load_weights(transformer, load_path, devices_list[0])
    else:
        init_weights(transformer, init_type='transformer')
        # init_weights(transformer)

    if arg.cuda:
        transformer = transformer.cuda(device=devices_list[0])

    return transformer

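# The factory above depends on `load_weights` and `init_weights` helpers that are not part
# of this excerpt. Below is a minimal sketch of what they might look like; the signatures
# match the calls above, but the bodies (and the init_type='transformer' scheme in
# particular) are assumptions, not the original implementation.
import torch
import torch.nn as nn


def load_weights(model, load_path, device):
    # Map the checkpoint onto the requested GPU index (or pass a torch.device through).
    map_loc = torch.device('cuda', device) if isinstance(device, int) else device
    state_dict = torch.load(load_path, map_location=map_loc)
    model.load_state_dict(state_dict)
    return model


def init_weights(model, init_type='normal'):
    # Xavier init is used here as a stand-in; the real init_type='transformer' branch
    # is not visible in this excerpt.
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
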
    trg_vocab=trg_vocab, d_model=args.d_model, num=args.num,
    n_heads=args.n_heads)  # tail of the model constructor call; the leading arguments are not part of this excerpt

# Xavier-initialise every weight matrix; 1-D parameters (biases, norms) keep their defaults.
for p in net.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=1)  # token index 1 (padding) is excluded from the loss
optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 30, 40, 50, 100, 200], gamma=0.7)
scheduler = CosineWithRestarts(optimizer, T_max=500)

if cuda:
    net.cuda()

# Resume training state and past metrics if a checkpoint exists.
start_epoch, acc = load_state(net, optimizer, scheduler, args.model_no, load_best=False)
losses_per_epoch, accuracy_per_epoch = load_results(model_no=args.model_no)

logger.info("Starting training process...")
for e in range(start_epoch, args.num_epochs):
    net.train()
    losses_per_batch = []
    total_loss = 0.0
    for i, data in enumerate(train_iter):
        trg_input = data.FR[:, :-1]                    # decoder input: target sequence shifted right
        labels = data.FR[:, 1:].contiguous().view(-1)  # next-token targets, flattened for the loss

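# The inner loop above is cut off after building `trg_input` and `labels`. A typical
# continuation would run the model, flatten the per-token logits, and feed them to the
# criterion so that ignore_index=1 drops padding positions. The snippet below is a
# standalone illustration of that shape/ignore_index contract only; the tensor names and
# sizes are made up for the example and are not taken from the original script.
import torch
import torch.nn as nn

batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)         # stand-in for the model's per-token logits
labels = torch.randint(2, vocab, (batch, seq_len))  # fake next-token targets
labels[:, -2:] = 1                                  # mark the last two positions as padding (index 1)

criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=1)
loss = criterion(logits.view(-1, vocab), labels.view(-1))
print(loss.item())  # padding positions contribute nothing to the mean
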
    norm_pos = mcfg['norm_pos']
    seq2seq = Transformer(in_dim, out_dim, max_len, d_model, d_ff, n_layers, n_heads,
                          dropout, norm_pos)
elif model_type == 'dynamic_conv':
    conv_type = mcfg['conv_type']
    kernel_sizes = mcfg['kernel_sizes']
    d_model = mcfg['d_model']
    d_ff = mcfg['d_ff']
    n_heads = mcfg['n_heads']
    dropout = mcfg['dropout']
    norm_pos = mcfg['norm_pos']
    seq2seq = DynamicConvS2S(in_dim, out_dim, max_len, conv_type, kernel_sizes,
                             d_model, d_ff, n_heads, dropout, norm_pos)

seq2seq.cuda()

K = 1024
n_params = utils.num_params(seq2seq) / K / K
logger.nofmt(seq2seq)
logger.info("# of params = {:.1f} M".format(n_params))

# parameter size tracing
if args.param_tracing:
    # sequential tracing
    # for name, p in seq2seq.named_parameters():
    #     numel = p.numel()
    #     unit = 'M'
    #     numel /= 1024*1024
    #     fmt = "10.3f" if numel < 1.0 else "10.1f"
    #     print("{:50s}\t{:{fmt}}{}".format(name, numel, unit, fmt=fmt))

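# `utils.num_params` is only called above, and the "sequential tracing" loop is commented
# out. The sketch below reproduces both ideas as standalone helpers; the function names are
# illustrative and do not come from the original utils module.
import torch.nn as nn


def count_params(model: nn.Module) -> int:
    # Total number of parameter elements in the model.
    return sum(p.numel() for p in model.parameters())


def trace_param_sizes(model: nn.Module) -> None:
    # Print each parameter's element count in millions (divided by 1024*1024, as above).
    for name, p in model.named_parameters():
        numel = p.numel() / (1024 * 1024)
        fmt = "10.3f" if numel < 1.0 else "10.1f"
        print("{:50s}\t{:{fmt}}M".format(name, numel, fmt=fmt))


if __name__ == '__main__':
    demo = nn.Linear(512, 2048)
    print("# of params = {:.1f} M".format(count_params(demo) / 1024 / 1024))
    trace_param_sizes(demo)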