def __init__(self, env, args):
    super().__init__()
    self.model = DQN(env, args, Nash=False).to(args.device)
    self.target = DQN(env, args, Nash=False).to(args.device)
    self.replay_buffer = ReplayBuffer(args.buffer_size)
    self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
    self.args = args
Example #2
    def __init__(self, env, device, model_dir, args):
        self.env = env
        self.env_name = args.env_name
        self.seed = args.seed
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        self.batch_size = args.batch_size
        self.max_timesteps = args.max_timesteps
        self.gaussian_std = args.gaussian_std
        self.start_timesteps = args.start_timesteps
        self.eval_freq = args.eval_freq
        self.rand_action_p = args.rand_action_p

        self.model_dir = os.path.join(model_dir,
                                      f"{args.env_name}_{args.seed}")

        self.algo = DDPG(self.state_dim, self.action_dim, self.max_action,
                         device)

        self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

        self.eval_rewards = []

        self.total_steps = 0
        self.episodes = 0
        self.episode_steps = 0
        self.episode_rewards = 0

        self.state = None
Example #3
class DQNTrainer:
    def __init__(self, env, args):
        super().__init__()
        self.model = DQN(env, args, Nash=False).to(args.device)
        self.target = DQN(env, args, Nash=False).to(args.device)
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
        self.args = args

    def push(self, s, a, r, s_, d):
        self.replay_buffer.push(s, a, r, s_, np.float32(d))

    def update(self):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            self.args.batch_size)

        state = torch.FloatTensor(np.float32(state)).to(self.args.device)
        next_state = torch.FloatTensor(np.float32(next_state)).to(
            self.args.device)
        action = torch.LongTensor(action).to(self.args.device)
        reward = torch.FloatTensor(reward).to(self.args.device)
        done = torch.FloatTensor(done).to(self.args.device)

        # Q-Learning with target network
        q_values = self.model(state)
        target_next_q_values = self.target(next_state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = target_next_q_values.max(1)[0]
        expected_q_value = reward + (
            self.args.gamma**self.args.multi_step) * next_q_value * (1 - done)

        # Huber Loss
        loss = F.smooth_l1_loss(q_value,
                                expected_q_value.detach(),
                                reduction='none')
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def act(self, s, args):
        return self.model.act(s, args)

    def save_model(self, model_path):
        torch.save(self.model.state_dict(), model_path + 'dqn')
        torch.save(self.target.state_dict(), model_path + 'dqn_target')
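
The ReplayBuffer this trainer pushes to and samples from is not shown on this page. Below is a minimal deque-based sketch that matches the push/sample/__len__ calls made above; only the interface is taken from the usage, the internals are an assumption.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """FIFO buffer sketch compatible with DQNTrainer.push() and update()."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Oldest transitions are evicted automatically once capacity is reached.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform sampling, regrouped column-wise into
        # (states, actions, rewards, next_states, dones).
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return (np.stack(state), np.array(action), np.array(reward),
                np.stack(next_state), np.array(done))

    def __len__(self):
        return len(self.buffer)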
Example #4
    def __init__(self, env_name, seed, buffer_dir, summary_dir, max_timesteps, eval_freq,
                 batch_size, state_dim, action_dim, device,
                 gamma, tau, lmbda):

        self.env_name = env_name
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.max_timesteps = max_timesteps
        self.eval_freq = eval_freq

        self.gamma = gamma
        self.tau = tau
        self.lmbda = lmbda

        self.store = ReplayBuffer(batch_size, state_dim, action_dim, device)
        self.store.load(buffer_dir)

        self.training_iters = 0
        self.writer = SummaryWriter(log_dir=summary_dir)
Example #5
def main(args, idx):
    # Create summary writer
    writer_path = os.path.join(args.log_dir, args.task_id, args.run_id + '-' + str(idx))
    writer = SummaryWriter(log_dir=writer_path)

    # Create training envs
    envs = make_vec_envs(args.task_id, args.seed, args.num_processes,
                         args.gamma, args.monitor_dir, args.device)
    obs_size = envs.observation_space.shape[0]
    act_size = envs.action_space.shape[0]

    # Create NN
    actor_critic = Policy(obs_size, act_size,
                          action_range=[envs.action_space.low[0], envs.action_space.high[0]])
    actor_critic.to(args.device)

    # Create ppo agent
    agent = PPO(
        actor_critic=actor_critic,
        device=args.device,
        lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm,
        clip_param=args.clip_param,
        ppo_epoch=args.ppo_epoch,
        num_mini_batch=args.num_mini_batch,
        value_loss_coef=args.value_loss_coef,
        entropy_coef=args.entropy_coef,
    )

    # Create replay buffer
    buffer = ReplayBuffer(args.num_steps, args.num_processes, obs_size, act_size)
    buffer.to(args.device)

    # Reset envs
    obs = envs.reset()
    buffer.obs[0].copy_(obs)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in tqdm(range(num_updates)):

        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

        # Collect trajectories and compute returns
        with torch.no_grad():
            for step in range(args.num_steps):
                # Sample actions
                action = actor_critic(buffer.obs[step])

                # Get trajectories from envs
                obs, reward, done, infos = envs.step(action)
                mask = torch.tensor(
                    [[0.0] if done_ else [1.0] for done_ in done],
                    dtype=torch.float, device=args.device)
                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # Store trajectories
                buffer.insert(obs, action, reward, mask)

            # Compute returns
            batch_obs = buffer.obs.view(-1, obs_size)
            value = actor_critic.get_value(batch_obs).view(args.num_steps + 1, args.num_processes, 1)
            batch_obs = buffer.obs[:-1].view(-1, obs_size)
            batch_action = buffer.actions.view(-1, act_size)
            action_log_prob = actor_critic.get_act_log_prob(batch_obs, batch_action).view(args.num_steps,
                                                                                          args.num_processes, 1)
            buffer.update_value_log_prob(value, action_log_prob)
            buffer.compute_returns(args.gamma, args.gae_lambda)

        # Update policy
        agent_output = agent.update(buffer)
        buffer.after_update()

        # Log stuff
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            speed = int(total_num_steps / (end - start))
            print(
                "Updates {}, num timesteps {}, FPS {} \n "
                "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}\n"
                    .format(j, total_num_steps,
                            speed,
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards),
                            ))
            writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('speed', speed, total_num_steps)
            for key in agent_output.keys():
                writer.add_scalar(key, agent_output[key], total_num_steps)

            if args.task_id == 'Pendulum-v0' and np.mean(episode_rewards) > -250:
                break

    envs.close()
    writer.close()
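
buffer.compute_returns(args.gamma, args.gae_lambda) is where the returns used by PPO are produced. The buffer class itself is not included here; the following is a standalone sketch of the usual GAE(lambda) backward recursion over stored rewards, values, and masks. The function name and the exact mask indexing are assumptions.

import torch


def compute_gae_returns(rewards, values, masks, gamma, gae_lambda):
    # rewards, masks: [num_steps, num_processes, 1]
    # values:         [num_steps + 1, num_processes, 1] (includes bootstrap value)
    # masks[t] is 0.0 where the episode ended at step t, else 1.0.
    num_steps = rewards.size(0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(values[0])
    for t in reversed(range(num_steps)):
        # One-step TD error, cut off at episode boundaries by the mask.
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * gae_lambda * masks[t] * gae
        returns[t] = gae + values[t]
    return returns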
Example #6
class Base:
    def __init__(self, env, device, model_dir, args):
        self.env = env
        self.env_name = args.env_name
        self.seed = args.seed
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        self.batch_size = args.batch_size
        self.max_timesteps = args.max_timesteps
        self.gaussian_std = args.gaussian_std
        self.start_timesteps = args.start_timesteps
        self.eval_freq = args.eval_freq
        self.rand_action_p = args.rand_action_p

        self.model_dir = os.path.join(model_dir,
                                      f"{args.env_name}_{args.seed}")

        self.algo = DDPG(self.state_dim, self.action_dim, self.max_action,
                         device)

        self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

        self.eval_rewards = []

        self.total_steps = 0
        self.episodes = 0
        self.episode_steps = 0
        self.episode_rewards = 0

        self.state = None

    def iterate(self):
        assert self.state is not None

        self.episode_steps += 1

        if self.is_random_action():
            action = self.env.action_space.sample()
        else:
            action = (self.algo.select_action(np.array(self.state)) +
                      np.random.normal(0,
                                       self.max_action * self.gaussian_std,
                                       size=self.action_dim)).clip(
                                           -self.max_action, self.max_action)

        next_state, reward, done, _ = self.env.step(action)
        done_bool = float(
            done) if self.episode_steps < self.env._max_episode_steps else 0

        self.storage.add(self.state, action, next_state, reward, done_bool)

        self.state = next_state
        self.episode_rewards += reward

        if done:
            print(f"Total T: {self.total_steps + 1} "
                  f"Episode Num: {self.episodes + 1} "
                  f"Episode T: {self.episode_steps} "
                  f"Reward: {self.episode_rewards:.3f}")
            # Reset environment
            self.state = self.env.reset()
            self.episode_rewards = 0
            self.episode_steps = 0
            self.episodes += 1

        self.total_steps += 1

    def evaluate(self, eval_episodes=10):
        eval_env = gym.make(self.env_name)
        eval_env.seed(self.seed + 100)

        avg_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = self.algo.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                avg_reward += reward

        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward
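
Base.iterate calls self.is_random_action(), which is not part of this snippet, and the class has no driver loop. One hedged way to complete it, assuming random actions during warm-up and that self.algo exposes a train(replay_buffer, batch_size) update; everything below except iterate() and evaluate() is an assumption.

class Agent(Base):
    # Sketch only: method names and the exploration rule are assumptions,
    # derived from the attributes stored in Base.__init__ above.

    def is_random_action(self):
        # Pure exploration during warm-up, then random with probability
        # rand_action_p (one plausible reading of the stored attributes).
        if self.total_steps < self.start_timesteps:
            return True
        return np.random.uniform() < self.rand_action_p

    def run(self):
        self.state = self.env.reset()
        while self.total_steps < self.max_timesteps:
            self.iterate()
            if self.total_steps >= self.start_timesteps:
                self.algo.train(self.storage, self.batch_size)
            if self.total_steps % self.eval_freq == 0:
                self.eval_rewards.append(self.evaluate())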
Example #7
def train(env, args, writer):
    # RL Model for Player 1
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    # RL Model for Player 2
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    # SL Model for Player 1, 2
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)

    if args.load_model and os.path.isfile(args.load_model):
        load_model(models={
            "p1": p1_current_model,
            "p2": p2_current_model
        },
                   policies={
                       "p1": p1_policy,
                       "p2": p2_policy
                   },
                   args=args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)

    # Replay Buffer for Reinforcement Learning - Best Response
    p1_replay_buffer = ReplayBuffer(args.buffer_size)
    p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Reservoir Buffer for Supervised Learning - Average Strategy
    # TODO(Aiden): How to set buffer size of SL?
    p1_reservoir_buffer = ReservoirBuffer(args.buffer_size)
    p2_reservoir_buffer = ReservoirBuffer(args.buffer_size)

    # Deque data structure for multi-step learning
    p1_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)

    p2_state_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    # RL Optimizer for Player 1, 2
    p1_rl_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_rl_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    # SL Optimizer for Player 1, 2
    # TODO(Aiden): Is it necessary to use separate learning rates for RL and SL?
    p1_sl_optimizer = optim.Adam(p1_policy.parameters(), lr=args.lr)
    p2_sl_optimizer = optim.Adam(p2_policy.parameters(), lr=args.lr)

    # Logging
    length_list = []
    p1_reward_list, p1_rl_loss_list, p1_sl_loss_list = [], [], []
    p2_reward_list, p2_rl_loss_list, p2_sl_loss_list = [], [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    tag_interval_length = 0
    prev_time = time.time()
    prev_frame = 1

    # Main Loop
    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        is_best_response = False
        # TODO(Aiden):
        # Action should be decided by a combination of Best Response and Average Strategy
        if random.random() > args.eta:
            p1_action = p1_policy.act(
                torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(
                torch.FloatTensor(p2_state).to(args.device))
        else:
            is_best_response = True
            epsilon = epsilon_by_frame(frame_idx)
            p1_action = p1_current_model.act(
                torch.FloatTensor(p1_state).to(args.device), epsilon)
            p2_action = p2_current_model.act(
                torch.FloatTensor(p2_state).to(args.device), epsilon)

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, info = env.step(actions)
        # print(actions)  # {'1': 3, '2': 2}
        # print(p1_next_state) # [[[127 127 .....
        # print(reward, done, info)  # [0 0] False None

        # Save current state, reward, action to deque for multi-step learning
        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)

        p1_reward = reward[0] - 1 if args.negative else reward[0]
        p2_reward = reward[1] - 1 if args.negative else reward[1]
        p1_reward_deque.append(p1_reward)
        p2_reward_deque.append(p2_reward)

        p1_action_deque.append(p1_action)
        p2_action_deque.append(p2_action)

        # Store (state, action, reward, next_state) to Replay Buffer for Reinforcement Learning
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state,
                                  np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state,
                                  np.float32(done))

        # Store (state, action) to Reservoir Buffer for Supervised Learning
        if is_best_response:
            p1_reservoir_buffer.push(p1_state, p1_action)
            p2_reservoir_buffer.push(p2_state, p2_action)

        (p1_state, p2_state) = (p1_next_state, p2_next_state)

        # Logging
        p1_episode_reward += p1_reward
        p2_episode_reward += p2_reward
        tag_interval_length += 1

        if info is not None:
            length_list.append(tag_interval_length)
            tag_interval_length = 0

        # Episode done. Reset environment and clear logging records
        if done or tag_interval_length >= args.max_tag_interval:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            writer.add_scalar("p1/episode_reward", p1_episode_reward,
                              frame_idx)
            writer.add_scalar("p2/episode_reward", p2_episode_reward,
                              frame_idx)
            writer.add_scalar("data/tag_interval_length", tag_interval_length,
                              frame_idx)
            p1_episode_reward, p2_episode_reward, tag_interval_length = 0, 0, 0
            p1_state_deque.clear(), p2_state_deque.clear()
            p1_reward_deque.clear(), p2_reward_deque.clear()
            p1_action_deque.clear(), p2_action_deque.clear()

        if (len(p1_replay_buffer) > args.rl_start
                and len(p1_reservoir_buffer) > args.sl_start
                and frame_idx % args.train_freq == 0):

            # Update Best Response with Reinforcement Learning
            loss = compute_rl_loss(p1_current_model, p1_target_model,
                                   p1_replay_buffer, p1_rl_optimizer, args)
            p1_rl_loss_list.append(loss.item())
            writer.add_scalar("p1/rl_loss", loss.item(), frame_idx)

            loss = compute_rl_loss(p2_current_model, p2_target_model,
                                   p2_replay_buffer, p2_rl_optimizer, args)
            p2_rl_loss_list.append(loss.item())
            writer.add_scalar("p2/rl_loss", loss.item(), frame_idx)

            # Update Average Strategy with Supervised Learning
            loss = compute_sl_loss(p1_policy, p1_reservoir_buffer,
                                   p1_sl_optimizer, args)
            p1_sl_loss_list.append(loss.item())
            writer.add_scalar("p1/sl_loss", loss.item(), frame_idx)

            loss = compute_sl_loss(p2_policy, p2_reservoir_buffer,
                                   p2_sl_optimizer, args)
            p2_sl_loss_list.append(loss.item())
            writer.add_scalar("p2/sl_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        # Logging and Saving models
        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time,
                      (p1_reward_list, p2_reward_list), length_list,
                      (p1_rl_loss_list, p2_rl_loss_list),
                      (p1_sl_loss_list, p2_sl_loss_list))
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_rl_loss_list.clear(), p2_rl_loss_list.clear()
            p1_sl_loss_list.clear(), p2_sl_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(models={
                "p1": p1_current_model,
                "p2": p2_current_model
            },
                       policies={
                           "p1": p1_policy,
                           "p2": p2_policy
                       },
                       args=args)

        # Render if rendering argument is on
        if args.render:
            env.render()

        save_model(models={
            "p1": p1_current_model,
            "p2": p2_current_model
        },
                   policies={
                       "p1": p1_policy,
                       "p2": p2_policy
                   },
                   args=args)
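
update_target, epsilon_scheduler, and multi_step_reward are imported helpers that do not appear on this page. The sketches below are consistent with how they are called above; the actual repository versions may differ in detail.

import math


def update_target(current_model, target_model):
    # Hard update: copy the online network's weights into the target network.
    target_model.load_state_dict(current_model.state_dict())


def epsilon_scheduler(eps_start, eps_final, eps_decay):
    # Returns a function mapping a frame index to an exponentially decaying epsilon.
    def epsilon_by_frame(frame_idx):
        return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)
    return epsilon_by_frame


def multi_step_reward(rewards, gamma):
    # Discounted sum of the rewards currently held in the n-step deque.
    ret = 0.0
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma ** idx)
    return ret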
Example #8
class Algo(object):
    def __init__(self, env_name, seed, buffer_dir, summary_dir, max_timesteps, eval_freq,
                 batch_size, state_dim, action_dim, device,
                 gamma, tau, lmbda):

        self.env_name = env_name
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.max_timesteps = max_timesteps
        self.eval_freq = eval_freq

        self.gamma = gamma
        self.tau = tau
        self.lmbda = lmbda

        self.store = ReplayBuffer(batch_size, state_dim, action_dim, device)
        self.store.load(buffer_dir)

        self.training_iters = 0
        self.writer = SummaryWriter(log_dir=summary_dir)

    def run(self):
        while self.training_iters < self.max_timesteps:
            self.train(iterations=int(self.eval_freq))

            self.eval_policy(self.env_name, self.seed)

            self.training_iters += self.eval_freq
            print(f"Training iterations: {self.training_iters}")

    def eval_policy(self, env_name, seed, eval_episodes=10):
        eval_env = gym.make(env_name)
        eval_env.seed(seed + 100)

        avg_reward = 0.
        avg_q = 0
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action, q = self.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                avg_reward += reward
                avg_q += q

        avg_reward /= eval_episodes
        avg_q /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")

        self.writer.add_scalar(
            'eval/return', avg_reward, self.training_iters)
        self.writer.add_scalar(
            'eval/Estimate Q', avg_q, self.training_iters)

    def update_vae(self, state, action):
        recon, mean, std = self.vae(state, action)
        recon_loss = F.mse_loss(recon, action)
        kl_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) - std.pow(2)).mean()
        vae_loss = recon_loss + 0.5 * kl_loss

        # >> norms
        norms = 0
        for param in self.vae.parameters():
            norms += torch.sum(torch.square(param))
        # >> norms

        loss = (
            vae_loss
            # + 1e-4 * norms
        )

        self.vae_optimizer.zero_grad()
        loss.backward()
        self.vae_optimizer.step()

    def update_critic(self, state, action, next_state, next_action, reward, not_done):
        with torch.no_grad():
            next_q1, next_q2 = self.critic_target(
                next_state, next_action)

            next_q = self.lmbda * torch.min(
                next_q1, next_q2) + (1. - self.lmbda) * torch.max(next_q1, next_q2)
            next_q = next_q.reshape(self.batch_size, -1).max(1)[0].reshape(-1, 1)

            target_q = reward + not_done * self.gamma * next_q

        curr_q1, curr_q2 = self.critic(state, action)
        critic_loss = F.mse_loss(curr_q1, target_q) + F.mse_loss(curr_q2, target_q)

        # # >> norms
        # norms = 0
        # for param in self.critic.parameters():
        #     norms += torch.sum(torch.square(param))
        # # >> norms

        loss = (
            critic_loss
            # + 1e-5 * norms
        )

        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, state):
        sampled_actions = self.vae.decode(state)
        perturbed_actions = self.actor(state, sampled_actions)

        actor_loss = -self.critic.q1(state, perturbed_actions).mean()

        # # >> norms
        # norms = 0
        # for param in self.critic.parameters():
        #     norms += torch.sum(torch.square(param))
        # # >> norms

        loss = (
            actor_loss
            # + 1e-5 * norms
        )

        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()

    def update_targets(self):
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
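
The run method above calls self.train(iterations=...), which is not shown. The method below is a sketch of how the four update methods could be wired together each iteration, in the spirit of BCQ; the candidate-action construction and the buffer's sample() signature are assumptions, and the code is meant to continue the Algo class above.

    def train(self, iterations):
        for _ in range(iterations):
            # Assumed sample() signature: (state, action, next_state, reward, not_done).
            state, action, next_state, reward, not_done = self.store.sample(self.batch_size)

            # 1) Fit the generative model of the behaviour policy.
            self.update_vae(state, action)

            with torch.no_grad():
                # 2) Candidate next actions: repeat each next state, decode actions
                #    from the VAE, and perturb them with the target actor (matches
                #    the batch-wise max over candidates taken in update_critic).
                next_state_rep = torch.repeat_interleave(next_state, 10, dim=0)
                next_action = self.actor_target(next_state_rep, self.vae.decode(next_state_rep))

            # 3) Critic update against the clipped double-Q target.
            self.update_critic(state, action, next_state_rep, next_action, reward, not_done)

            # 4) Perturbation-actor update, then Polyak averaging of the targets.
            self.update_actor(state)
            self.update_targets()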