# online and target Q-networks
net = model.DDQN(shape, actions).to(device)
tgt_net = ptan.agent.TargetNet(net)

# epsilon-greedy exploration with linear decay over eps_frames
selector = ptan.actions.EpsilonGreedyActionSelector()
eps_tracker = ptan.actions.EpsilonTracker(
    selector, params.eps_start, params.eps_final, params.eps_frames)

# agent, 1-step experience source and replay buffer
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_src = ptan.experience.ExperienceSourceFirstLast(
    env, agent, params.gamma, steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_src, params.buffer_size)

# reward tracking and TensorBoard logging
mean_monitor = utils.MeanRewardsMonitor(
    env, net, ALGORITHM, params.solve_rewards)
writer = SummaryWriter(logdir=mean_monitor.runs_dir,
                       comment=params.frame_stack)

# optimizer with LR reduction when the mean reward plateaus
optimizer = torch.optim.Adam(net.parameters(), lr=params.lr)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.75, patience=10_000,
    cooldown=10_000, min_lr=params.min_lr, verbose=True)

print(net)
print('*'*10, ' Start Training ', env.game, ' {} '.format(device), '*'*10)

frame = 0
episode = 0
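The listing above is only the setup. A minimal sketch of the loop that would consume these objects follows; it is not the repository's actual loop. `params.batch_size`, `params.init_replay`, `params.sync_nets` and the `calc_loss_dqn()` helper are illustrative names not defined here, and `eps_tracker.frame()` is assumed to be the tracker's per-frame epsilon update.

# Minimal sketch of a DQN training loop built on the objects above.
# batch_size, init_replay, sync_nets and calc_loss_dqn() are illustrative.
while True:
    frame += 1
    buffer.populate(1)            # play one step and store the transition
    eps_tracker.frame(frame)      # decay epsilon towards eps_final

    new_rewards = exp_src.pop_rewards_steps()
    if new_rewards:
        episode += 1
        # mean_monitor / writer / lr_scheduler bookkeeping would go here

    if len(buffer) < params.init_replay:
        continue                  # warm up the replay buffer first

    optimizer.zero_grad()
    batch = buffer.sample(params.batch_size)
    loss = calc_loss_dqn(batch, net, tgt_net.target_model,
                         params.gamma, device)
    loss.backward()
    optimizer.step()

    if frame % params.sync_nets == 0:
        tgt_net.sync()            # copy online weights into the target net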
# hyper-parameters for the selected game, overridden by CLI arguments
params = data.params[args.env]
utils.update_params(params, args)
params.n_envs = max(params.n_envs, 8)   # A2C needs several parallel envs
device = 'cuda' if args.cuda else 'cpu'

# vectorized environments
envs = utils.createEnvs(params, stack_frames=2)
shape = envs[0].observation_space.shape
actions = envs[0].action_space.n

# shared actor-critic network and agent
net = model.A2CNet(shape, actions)
net.to(device)
agent = ptan.agent.ActorCriticAgent(net, device=device, apply_softmax=True)

# n-step experience source feeding the batch generator
exp_src = ptan.experience.ExperienceSourceFirstLast(
    envs, agent, params.gamma, steps_count=params.steps)
generator = utils.BatchGenerator(exp_src, params)

# reward tracking and TensorBoard logging
mean_monitor = utils.MeanRewardsMonitor(
    envs[0], net, 'A2C', params.solve_rewards)
writer = SummaryWriter(logdir=mean_monitor.runs_dir,
                       comment=params.frame_stack)

optimizer = torch.optim.Adam(net.parameters(), lr=params.lr)
# lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer, mode='max', factor=0.75, patience=20_000,
#     cooldown=20_000, verbose=True, min_lr=params.min_lr)

print('# Parameters: ', utils.count_parameters(net))
print(net)
print('*'*10, ' Start Training ', envs[0].game, ' {} '.format(device), '*'*10)
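As with the DQN script, only the setup is listed. The sketch below shows one possible A2C update per batch; it assumes `utils.BatchGenerator` yields `(states, actions, q_refs)` tensors with n-step return targets, that `params.value_loss_coef`, `params.ent_beta` and `params.clip_grad` are illustrative hyper-parameter names, and that `F` is `torch.nn.functional` and `nn` is `torch.nn`.

# Sketch of one A2C update per generated batch (generator interface assumed).
for states, actions_t, q_refs in generator:
    states = states.to(device)
    actions_t = actions_t.to(device)
    q_refs = q_refs.to(device)

    optimizer.zero_grad()
    logits, values = net(states)
    values = values.squeeze(-1)

    # critic: regress V(s) towards the n-step returns
    value_loss = F.mse_loss(values, q_refs)

    # actor: policy gradient weighted by the advantage A = Q_ref - V(s)
    log_probs = F.log_softmax(logits, dim=1)
    adv = q_refs - values.detach()
    policy_loss = -(adv * log_probs[range(len(actions_t)), actions_t]).mean()

    # entropy bonus discourages premature policy collapse
    probs = F.softmax(logits, dim=1)
    entropy = -(probs * log_probs).sum(dim=1).mean()

    loss = policy_loss + params.value_loss_coef * value_loss - params.ent_beta * entropy
    loss.backward()
    nn.utils.clip_grad_norm_(net.parameters(), params.clip_grad)
    optimizer.step()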