    # Rebuild the averaged model from the saved weights and re-register its
    # parameters with the EMA tracker.
    averaged_model = build_model()
    averaged_model.to(device)
    averaged_model.load_state_dict(checkpoint["state_dict"])
    for name, param in averaged_model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    return model, optimizer, ema


model = build_model()
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
# optimizer = optim.Adamax(model.parameters(), lr=args.learning_rate)
criterion = GaussianLoss()

# Maintains moving averages of the trainable parameters via exponential decay.
ema = ExponentialMovingAverage(args.ema_decay)
for name, param in model.named_parameters():
    # requires_grad allows fine-grained exclusion of subgraphs from gradient
    # computation; only trainable parameters are registered with the EMA.
    if param.requires_grad:
        ema.register(name, param.data)
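# --- Illustrative sketch (assumption) ---------------------------------------
# The ExponentialMovingAverage helper is not defined in this section. The
# hypothetical class below is only a minimal sketch of the interface implied
# by the `ema.register(name, param.data)` calls above plus a presumed
# per-step update; the actual implementation may differ.
class _ExponentialMovingAverageSketch:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}  # parameter name -> running-average tensor

    def register(self, name, val):
        # Seed the running average with a copy of the current parameter.
        self.shadow[name] = val.clone()

    def update(self, name, x):
        # shadow <- decay * shadow + (1 - decay) * x
        assert name in self.shadow
        self.shadow[name] -= (1.0 - self.decay) * (self.shadow[name] - x)
# -----------------------------------------------------------------------------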
global_step, global_epoch = 0, 0
load_step = args.load_step

log = open(os.path.join(args.log, '{}.txt'.format(args.model_name)), 'w')
state = {k: v for k, v in args._get_kwargs()}

if load_step == 0:
    list_train_loss, list_loss = [], []
    log.write(json.dumps(state) + '\n')
    test_loss = 100.0
else: