# Top-level setup for a D4PG run on a Unity environment.
# NOTE(review): `parser` is used here without being created — presumably an
# argparse.ArgumentParser is constructed earlier in the file (not visible in
# this chunk); confirm before relying on this fragment standalone.
parser.add_argument("-n", "--name", required=True, help="Name of the run")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")
# Checkpoints for this run go under saves/d4pg-<name>/
save_path = os.path.join("saves", "d4pg-" + args.name)
os.makedirs(save_path, exist_ok=True)

# Wrap the Unity environment in a Gym environment
channel = EngineConfigurationChannel()
unity_env = UnityEnvironment(ENV_ID, seed=1, side_channels=[channel])
channel.set_configuration_parameters(time_scale=20.0)  # speed up the simulation
env = UnityToGymWrapper(unity_env)

# Build the model following the D4PG architecture: a deterministic actor and
# a distributional critic over N_ATOMS support atoms in [Vmin, Vmax].
act_net = model.D4PGActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
crt_net = model.D4PGCritic(env.observation_space.shape[0], env.action_space.shape[0], N_ATOMS, Vmin, Vmax).to(device)
print(act_net)
print(crt_net)
# Target networks for stable Bellman targets (soft-synced during training).
tgt_act_net = ptan.agent.TargetNet(act_net)
tgt_crt_net = ptan.agent.TargetNet(crt_net)

# Create the agent with the PTAN library and the experience replay buffer
writer = SummaryWriter(comment="-d4pg_" + args.name)
agent = model.AgentD4PG(act_net, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)
frame_idx = 0
best_reward = None  # None so the first evaluation always saves a checkpoint
def main():
    """Train a D4PG-style agent (DDPG actor + distributional critic) on KukaGymEnv.

    Runs an endless training loop: populates the replay buffer one transition
    per step, optimizes the critic (cross-entropy against the projected
    Bellman target distribution) and the actor (maximizing expected Q),
    soft-syncs the target networks, and every TEST_ITERS frames evaluates the
    actor and saves the best-performing checkpoint.
    """
    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)
    # Fall back to CPU when no GPU is available instead of crashing on
    # .to(device) — consistent with the argparse-driven main in this file.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    act_net = model.DDPGActor(
        env.observation_space.shape[0], env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(
        env.observation_space.shape[0], env.action_space.shape[0],
        N_ATOMS, Vmin, Vmax).to(device)
    print(act_net)
    print(crt_net)
    # Target networks provide slowly-moving parameters for Bellman targets.
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)

    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(
        env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(
        exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                # Warm-up: don't train until at least 100 transitions exist.
                if len(buffer) < 100:
                    continue

                batch = buffer.sample(BATCH_SIZE)
                states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                    common.unpack_batch_ddqn(batch, device)

                # Train critic: cross-entropy between the predicted value
                # distribution and the Bellman-projected target distribution.
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(
                    last_states_v, last_act_v), dim=1)
                proj_distr_v = distr_projection(
                    last_distr_v, rewards_v, dones_mask,
                    gamma=GAMMA**REWARD_STEPS, device=device)
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)

                # Train actor: ascend the critic's expected Q of the actor's
                # own actions. (The original called zero_grad() twice here;
                # a single call is sufficient.)
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)

                # Soft (Polyak) update of both target networks.
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)

                # Periodic evaluation; keep the best checkpoint on disk.
                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d" % (
                        time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f" % (
                                best_reward, rewards))
                        name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                        fname = os.path.join(save_path, name)
                        torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
def main():
    """Train a D4PG agent with prioritized experience replay on ENV_ID.

    Parses command-line flags (--cuda, --name), builds actor/critic and their
    target networks, then runs an endless loop: populate the prioritized
    buffer, anneal the importance-sampling beta, compute the losses, train,
    update sample priorities, and every TEST_ITERS frames evaluate on a
    separate test environment, saving the best checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action='store_true',
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    # Only use CUDA when both requested and actually available.
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")
    save_path = os.path.join("saves", "d4pg-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    # Separate training and evaluation environments.
    env = gym.make(ENV_ID)
    test_env = gym.make(ENV_ID)

    act_net = model.D4PGActor(
        env.observation_space.shape[0], env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(
        env.observation_space.shape[0], env.action_space.shape[0],
        N_ATOMS, VMIN, VMAX).to(device)
    tgt_act_net = ptan.agent.TargetNet(act_net)
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    writer = SummaryWriter(comment="-d4pg_" + args.name)
    agent = model.AgentD4PG(act_net, device=device)
    exp_source = ExperienceSourceFirstLast(
        env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    buffer = PrioritizedReplayBuffer(exp_source, REPLAY_SIZE, PRIO_REPLAY_ALPHA)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    # Start at None so the very first evaluation always saves a checkpoint;
    # starting at 0 would silently skip saving whenever rewards are negative
    # (and matches the best_reward handling used elsewhere in this file).
    best_reward = None
    with RewardTracker(writer) as tracker:
        with TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                # Anneal the importance-sampling exponent from BETA_START
                # up to 1.0 over BETA_FRAMES frames.
                beta = min(
                    1.0,
                    BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track('episode_steps', steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                # Warm-up: don't train until the buffer is sufficiently full.
                if len(buffer) < REPLAY_INITIAL:
                    continue

                batch, batch_indices, batch_weights = buffer.sample(
                    BATCH_SIZE, beta)
                actor_loss, critic_loss, sample_prios = calc_loss(
                    batch, batch_weights, act_net, crt_net,
                    tgt_act_net, tgt_crt_net, device)
                train(actor_loss, critic_loss, act_net, crt_net,
                      tgt_act_net, tgt_crt_net, act_opt, crt_opt, device)
                tb_tracker.track('loss_actor', actor_loss, frame_idx)
                tb_tracker.track('loss_critic', critic_loss, frame_idx)
                # Feed the new TD-error-based priorities back to the buffer.
                buffer.update_priorities(batch_indices,
                                         sample_prios.data.cpu().numpy())

                # Periodic evaluation; keep the best checkpoint on disk.
                if frame_idx % TEST_ITERS == 0:
                    ts = time.time()
                    rewards, steps = test(act_net, test_env, device=device)
                    print('Test done in %.2f sec, reward %.3f, steps %d' % (
                        time.time() - ts, rewards, steps))
                    writer.add_scalar('test_reward', rewards, frame_idx)
                    writer.add_scalar('test_steps', steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print('Best reward updated: %.3f -> %.3f' % (
                                best_reward, rewards))
                        name = 'best_%+.3f_%d.dat' % (rewards, frame_idx)
                        fname = os.path.join(save_path, name)
                        torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
    writer.close()