def main():
    """Train a D4PG agent on the Kuka grasping environment.

    Builds actor/critic networks with target copies, fills an experience
    replay buffer from the live environment, and alternates critic/actor
    updates, periodically evaluating the actor and checkpointing the best
    result under ``saves/ddpg-``.
    """
    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)
    # FIX: fall back to CPU so the script still starts on machines without a
    # GPU (the original hardcoded torch.device("cuda")).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    act_net = model.DDPGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0],
                               N_ATOMS, Vmin, Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)

    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(
        env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(
        exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                # Wait until the buffer holds a minimal amount of experience
                # before starting gradient updates.
                if len(buffer) < 100:
                    continue

                batch = buffer.sample(BATCH_SIZE)
                states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                    common.unpack_batch_ddqn(batch, device)

                # --- train critic ---
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(
                    last_states_v, last_act_v), dim=1)
                # Project the target value distribution onto the fixed
                # support, discounting by GAMMA**REWARD_STEPS because the
                # experience source yields n-step transitions.
                proj_distr_v = distr_projection(
                    last_distr_v, rewards_v, dones_mask,
                    gamma=GAMMA**REWARD_STEPS, device=device)
                # Cross-entropy between predicted and projected distributions.
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)

                # --- train actor ---
                # FIX: act_opt.zero_grad() was called twice in a row in the
                # original; once is sufficient.
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                # Ascend on the critic's expected Q of the actor's actions.
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)

                # Soft-update the target networks towards the online ones.
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)

                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d" % (
                        time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    # Checkpoint the actor whenever the test reward improves.
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f" % (
                                best_reward, rewards))
                        name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                        fname = os.path.join(save_path, name)
                        torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
# Tail of the CLI/bootstrap section for the DDPG trainer; `parser`, `gym`,
# ENV_ID and the hyperparameter constants are defined earlier, outside this
# chunk. NOTE(review): assumes args provides .cuda, .name and .step —
# confirm against the preceding add_argument calls.
parser.add_argument("--noisy", required=False, action="store_true",
                    help="Enable noisy network extension")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")

# Checkpoints for this run go under saves/ddpg-<name>.
save_path = os.path.join("saves", "ddpg-" + args.name)
os.makedirs(save_path, exist_ok=True)

env = gym.make(ENV_ID)
test_env = gym.make(ENV_ID)
# Number of steps each first-last transition spans; defaults to 1-step.
unroll_step = int(args.step) if args.step else 1

# Actor (optionally with noisy layers) and critic, plus frozen target copies
# used for the bootstrap targets.
act_net = model.DDPGActor(env.observation_space.shape[0],
                          env.action_space.shape[0], args.noisy).to(device)
crt_net = model.DDPGCritic(env.observation_space.shape[0],
                           env.action_space.shape[0]).to(device)
print(act_net)
print(crt_net)
tgt_act_net = ptan.agent.TargetNet(act_net)
tgt_crt_net = ptan.agent.TargetNet(crt_net)

writer = SummaryWriter(comment="-ddpg_" + args.name)
agent = model.AgentDDPG(act_net, device=device)
# n-step experience source feeding the replay buffer.
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=GAMMA, steps_count=unroll_step)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                buffer_size=REPLAY_SIZE)
act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)
# Default environment for the playback/recording script.
ENV_ID = "MinitaurBulletEnv-v0"

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True,
                        help="Model file to load")
    parser.add_argument("-e", "--env", default=ENV_ID,
                        help="Environment name to use, default=" + ENV_ID)
    parser.add_argument("-r", "--record",
                        help="If specified, sets the recording dir, "
                             "default=Disabled")
    args = parser.parse_args()

    # NOTE(review): pokes gym's private spec kwargs to disable the GUI;
    # relies on an internal API — verify against the installed gym version.
    spec = gym.envs.registry.spec(args.env)
    spec._kwargs['render'] = False
    env = gym.make(args.env)
    if args.record:
        env = gym.wrappers.Monitor(env, args.record)

    net = model.DDPGActor(env.observation_space.shape[0],
                          env.action_space.shape[0])
    # FIX: map_location="cpu" — the net stays on CPU here, so checkpoints
    # saved from a CUDA training run would otherwise fail to load on a
    # machine without a GPU.
    net.load_state_dict(torch.load(args.model, map_location="cpu"))

    # Roll out one greedy (deterministic) episode, tracking totals.
    obs = env.reset()
    total_reward = 0.0
    total_steps = 0
    while True:
        obs_v = torch.FloatTensor([obs])
        mu_v = net(obs_v)
        action = mu_v.squeeze(dim=0).data.numpy()
        # Clamp to the actuator range expected by the env.
        action = np.clip(action, -1, 1)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        total_steps += 1
        if done:
            break
rewards += reward steps += 1 # If done proceed to next try if is_done: break return rewards / count, steps / count # Create buffer auxiliars Experience = namedtuple('Episode', field_names=['state', 'action', 'reward', 'last_state', 'done']) # Initialize simulator sim = simulator.Agent(random(), random()) # Initialize networks and inteligent agents act_net = model.DDPGActor(OBSERVATION_SPACE, ACTION_SPACE).to(device) crt_net = model.DDPGCritic(OBSERVATION_SPACE, ACTION_SPACE).to(device) tgt_act_net = ptan.agent.TargetNet(act_net) tgt_crt_net = ptan.agent.TargetNet(crt_net) agent = model.AgentDDPG(act_net, device=device) act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE) crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE) # Define soft_max function for discrete actions def softmax_function(values): return_values = [max(MIN_PROB_EXPLORATION, np.exp(value)/np.exp(values).sum()) for value in values] return np.random.choice(len(return_values), p=return_values/sum(return_values)) buffer = []