Example #1
def train_expert(env_name):
    """Train expert policy in given environment."""
    if env_name == 'InvertedPendulum-v2':
        env = ExpertInvertedPendulumEnv()
        episode_limit = 200
        return_threshold = 200
    elif env_name == 'InvertedDoublePendulum-v2':
        env = ExpertInvertedDoublePendulumEnv()
        episode_limit = 50
        return_threshold = 460
    elif env_name == 'ThreeReacherEasy-v2':
        env = ThreeReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'ReacherEasy-v2':
        env = ReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'Hopper-v2':
        env = HopperEnv()
        episode_limit = 200
        return_threshold = 600
    elif env_name == 'HalfCheetah-v2':
        env = ExpertHalfCheetahEnv()
        episode_limit = 200
        return_threshold = 1000
    elif env_name == 'StrikerHumanSim-v2':
        env = StrikerHumanSimEnv()
        episode_limit = 200
        return_threshold = -190
    elif env_name == 'PusherHumanSim-v2':
        env = PusherHumanSimEnv()
        episode_limit = 200
        return_threshold = -80
    else:
        raise NotImplementedError
    buffer_size = 1000000
    init_random_samples = 1000
    exploration_noise = 0.2
    learning_rate = 3e-4
    batch_size = 256
    epochs = 200
    steps_per_epoch = 5000
    updates_per_step = 1
    update_actor_every = 1
    start_training = 512
    gamma = 0.99
    polyak = 0.995
    entropy_coefficient = 0.2
    clip_actor_gradients = False
    visual_env = True
    action_size = env.action_space.shape[0]
    tune_entropy_coefficient = True
    target_entropy = -1 * action_size

    def make_actor():
        actor = StochasticActor([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(action_size * 2)
        ])
        return actor

    def make_critic():
        critic = Critic([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(1)
        ])
        return critic

    optimizer = tf.keras.optimizers.Adam(learning_rate)

    replay_buffer = ReplayBuffer(buffer_size)
    sampler = Sampler(env,
                      episode_limit=episode_limit,
                      init_random_samples=init_random_samples,
                      visual_env=visual_env)
    agent = SAC(make_actor,
                make_critic,
                make_critic,
                actor_optimizer=optimizer,
                critic_optimizer=optimizer,
                gamma=gamma,
                polyak=polyak,
                entropy_coefficient=entropy_coefficient,
                tune_entropy_coefficient=tune_entropy_coefficient,
                target_entropy=target_entropy,
                clip_actor_gradients=clip_actor_gradients)
    if visual_env:
        obs = np.expand_dims(env.reset()['obs'], axis=0)
    else:
        obs = np.expand_dims(env.reset(), axis=0)
    agent(obs)
    agent.summary()

    mean_test_returns = []
    mean_test_std = []
    steps = []

    step_counter = 0
    for e in range(epochs):
        while step_counter < (e + 1) * steps_per_epoch:
            traj_data = sampler.sample_trajectory(agent, exploration_noise)
            replay_buffer.add(traj_data)
            if step_counter > start_training:
                agent.train(replay_buffer,
                            batch_size=batch_size,
                            n_updates=updates_per_step * traj_data['n'],
                            act_delay=update_actor_every)
            step_counter += traj_data['n']
        print('Epoch {}/{} - total steps {}'.format(e + 1, epochs,
                                                    step_counter))
        out = sampler.evaluate(agent, 10)
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break
    plt.errorbar(steps, mean_test_returns, mean_test_std)
    plt.xlabel('steps')
    plt.ylabel('returns')
    plt.show()
    return agent
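
The ReplayBuffer, Sampler, SAC, StochasticActor and Critic classes used in this example come from the surrounding project and are not shown. As a rough guide to the buffer interface the loop relies on, here is a minimal sketch of a trajectory-based buffer: add() takes the dict returned by sample_trajectory (which reports its step count under 'n'); the remaining behaviour and key handling are assumptions for illustration, not the project's actual implementation.

import numpy as np

class TrajectoryReplayBuffer:
    """Minimal sketch matching replay_buffer.add(traj_data) above (hypothetical)."""

    def __init__(self, size):
        self._size = size
        self._storage = {}  # field name -> array of stacked transitions
        self._n = 0

    def add(self, traj_data):
        n = traj_data['n']  # number of environment steps in the trajectory
        for key, value in traj_data.items():
            if key == 'n':
                continue
            value = np.asarray(value)
            old = self._storage.get(key)
            stacked = value if old is None else np.concatenate([old, value], axis=0)
            self._storage[key] = stacked[-self._size:]  # drop the oldest entries past capacity
        self._n = min(self._n + n, self._size)

    def sample(self, batch_size):
        idx = np.random.randint(0, self._n, size=batch_size)
        return {key: value[idx] for key, value in self._storage.items()}
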
Example #2
def main():
    with tf.Session() as sess:

        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

        #TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target net
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop.
        for ep in range(MAX_EPISODES):

            episode_reward = 0
            ep_batch_avg_q = 0

            s = ENV.reset()

            for step in range(MAX_EP_STEPS):

                a = actor.predict(np.reshape(s,
                                             (1, STATE_DIM)))  #+ actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)

                replay_buffer.add(np.reshape(s, (STATE_DIM,)), \
                                np.reshape(a, (ACTION_DIM,)), \
                                r, \
                                terminal, \
                                np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                    step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the target values for the critic.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:
                            # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch, not from the episode, so an episode_avg_max statistic is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this be done only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / max(step, 1))  # guard against division by zero if terminal at step 0

                    break
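
ReplayBuffer here follows the classic DDPG pattern: add(s, a, r, terminal, s2), size(), and sample_batch(batch_size) returning five arrays. A minimal sketch of such a buffer (a common implementation, not necessarily the one this script imports):

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """FIFO transition buffer with uniform sampling (sketch)."""

    def __init__(self, buffer_size, seed=None):
        self.buffer = deque(maxlen=buffer_size)
        if seed is not None:
            random.seed(seed)

    def add(self, s, a, r, terminal, s2):
        self.buffer.append((s, a, r, terminal, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2
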
Example #3
class SAC:
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=0.005,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 ent_coef='auto',
                 random_exploration=0.0,
                 discrete=True,
                 regularized=True,
                 feature_extraction="cnn"):
        self.env = env
        self.learning_starts = learning_starts
        self.random_exploration = random_exploration
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.gradient_steps = gradient_steps
        self.learning_rate = learning_rate

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.agent = SACAgent(self.sess,
                                  env,
                                  discrete=discrete,
                                  regularized=regularized,
                                  feature_extraction=feature_extraction)
            self.model = SACModel(self.sess, self.agent, target_entropy,
                                  ent_coef, gamma, tau)
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.model.target_init_op)
        self.num_timesteps = 0

    def train(self, learning_rate):
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = self.replay_buffer.sample(
            self.batch_size)
        # print("batch_actions:", batch_actions.shape)
        # print("self.agent.actions_ph:", self.agent.actions_ph)

        feed_dict = {
            self.agent.obs_ph: batch_obs,
            self.agent.next_obs_ph: batch_next_obs,
            self.model.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.model.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.model.learning_rate_ph: learning_rate
        }
        if not self.agent.discrete:
            feed_dict[self.agent.actions_ph] = batch_actions
        else:
            batch_actions = batch_actions.reshape(-1)
            feed_dict[self.agent.actions_ph] = batch_actions
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = self.sess.run(
            self.model.step_ops, feed_dict)
        return policy_loss, qf1_loss, qf2_loss

    def learn(self, total_timesteps):
        learning_rate = get_schedule_fn(self.learning_rate)
        episode_rewards = [0]
        mb_losses = []
        obs = self.env.reset()
        for step in range(total_timesteps):
            if self.num_timesteps < self.learning_starts or np.random.rand(
            ) < self.random_exploration:
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.env.action_space, unscaled_action)
            else:
                action = self.agent.step(obs[None]).flatten()
                unscaled_action = unscale_action(self.env.action_space, action)
            # print("\nunscaled_action:", unscaled_action)
            new_obs, reward, done, _ = self.env.step(unscaled_action)
            self.num_timesteps += 1
            self.replay_buffer.add(obs, action, reward, new_obs, done)
            obs = new_obs

            if self.num_timesteps % self.train_freq == 0:
                for grad_step in range(self.gradient_steps):
                    if not self.replay_buffer.can_sample(
                            self.batch_size
                    ) or self.num_timesteps < self.learning_starts:
                        break
                    frac = 1.0 - step / total_timesteps
                    current_lr = learning_rate(frac)
                    mb_losses.append(self.train(current_lr))
                    if (step + grad_step) % self.target_update_interval == 0:
                        self.sess.run(self.model.target_update_op)

            episode_rewards[-1] += reward
            if done:
                obs = self.env.reset()
                episode_rewards.append(0)

            mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
            loss_str = "/".join([f"{x:.3f}" for x in np.mean(mb_losses, 0)
                                 ]) if len(mb_losses) > 0 else "NaN"
            print(f"Step {step} - reward: {mean_reward} - loss: {loss_str}",
                  end="\n" if step % 500 == 0 else "\r")
Example #4
    def learn(self, timesteps=10000, verbose=0, seed=None):
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.eps_range = self._eps_range(timesteps)
        replay_buffer = ReplayBuffer(self.buffer_size)

        self._init_model()

        obs = self.env.reset()
        for step in range(timesteps):
            # while not done:
            cur_eps = next(self.eps_range, None)
            if cur_eps is None:
                cur_eps = self.final_eps

            action = self._select_action(obs, cur_eps)

            new_obs, rewards, done, info = self.env.step(action)
            if done:
                new_obs = [
                    np.nan
                ] * self.obs_shape[0]  # hacky way to keep dimensions correct
            replay_buffer.add(obs, action, rewards, new_obs)

            obs = new_obs

            # learn gradient
            if step > self.learning_starts:
                if len(replay_buffer.buffer
                       ) < self.batch_size:  # buffer too small
                    continue
                samples = replay_buffer.sample(self.batch_size, self.device)
                obs_batch, actions_batch, rewards_batch, new_obs_batch = samples

                predicted_q_values = self._predictQValue(
                    self.step_model, obs_batch, actions_batch)
                ys = self._expectedLabels(self.target_model, new_obs_batch,
                                          rewards_batch)

                loss = F.smooth_l1_loss(predicted_q_values, ys)

                self.optim.zero_grad()
                loss.backward()
                for i in self.step_model.parameters():
                    i.grad.clamp_(min=-1, max=1)  # exploding gradient
                    # i.grad.clamp_(min=-10, max=10) # exploding gradient
                self.optim.step()

                # update target
                if step % self.target_network_update_freq == 0:
                    self.target_model.load_state_dict(
                        self.step_model.state_dict())

            if done:
                obs = self.env.reset()
            if verbose == 1:
                if step % (timesteps * 0.1) == 0:
                    perc = int(step / (timesteps * 0.1))
                    print(f"At step {step}")
                    print(f"{perc}% done")
class MADDPG:
    def __init__(self,
                 env,
                 state_dim: int,
                 action_dim: int,
                 config: Dict,
                 device=None,
                 writer=None):
        self.logger = logging.getLogger("MADDPG")
        self.device = device if device is not None else DEVICE
        self.writer = writer

        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents_number = config['agents_number']

        hidden_layers = config.get('hidden_layers', (400, 300))
        noise_scale = config.get('noise_scale', 0.2)
        noise_sigma = config.get('noise_sigma', 0.1)
        actor_lr = config.get('actor_lr', 1e-3)
        actor_lr_decay = config.get('actor_lr_decay', 0)
        critic_lr = config.get('critic_lr', 1e-3)
        critic_lr_decay = config.get('critic_lr_decay', 0)
        self.actor_tau = config.get('actor_tau', 0.002)
        self.critic_tau = config.get('critic_tau', 0.002)
        create_agent = lambda: DDPGAgent(state_dim,
                                         action_dim,
                                         agents=self.agents_number,
                                         hidden_layers=hidden_layers,
                                         actor_lr=actor_lr,
                                         actor_lr_decay=actor_lr_decay,
                                         critic_lr=critic_lr,
                                         critic_lr_decay=critic_lr_decay,
                                         noise_scale=noise_scale,
                                         noise_sigma=noise_sigma,
                                         device=self.device)
        self.agents = [create_agent() for _ in range(self.agents_number)]

        self.discount = 0.99 if 'discount' not in config else config['discount']
        self.gradient_clip = 1.0 if 'gradient_clip' not in config else config[
            'gradient_clip']

        self.warm_up = 1e3 if 'warm_up' not in config else config['warm_up']
        self.buffer_size = int(
            1e6) if 'buffer_size' not in config else config['buffer_size']
        self.batch_size = config.get('batch_size', 128)
        self.p_batch_size = config.get('p_batch_size',
                                       int(self.batch_size // 2))
        self.n_batch_size = config.get('n_batch_size',
                                       int(self.batch_size // 4))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.update_every_iterations = config.get('update_every_iterations', 2)
        self.number_updates = config.get('number_updates', 2)

        self.reset()

    def reset(self):
        self.iteration = 0
        self.reset_agents()

    def reset_agents(self):
        for agent in self.agents:
            agent.reset_agent()

    def step(self, state, action, reward, next_state, done) -> None:
        if np.isnan(state).any() or np.isnan(next_state).any():
            print("State contains NaN. Skipping.")
            return

        self.iteration += 1
        self.buffer.add(state, action, reward, next_state, done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (
                self.iteration % self.update_every_iterations) == 0:
            self.evok_learning()

    def filter_batch(self, batch, agent_number):
        states, actions, rewards, next_states, dones = batch
        agent_states = states[:, agent_number *
                              self.state_dim:(agent_number + 1) *
                              self.state_dim].clone()
        agent_next_states = next_states[:, agent_number *
                                        self.state_dim:(agent_number + 1) *
                                        self.state_dim].clone()
        agent_rewards = rewards.select(1, agent_number).view(-1, 1).clone()
        agent_dones = dones.select(1, agent_number).view(-1, 1).clone()
        return (agent_states, states, actions, agent_rewards,
                agent_next_states, next_states, agent_dones)

    def evok_learning(self):
        for _ in range(self.number_updates):
            for agent_number in range(self.agents_number):
                batch = self.filter_batch(self.buffer.sample(), agent_number)
                self.learn(batch, agent_number)
                # self.update_targets()

    def act(self, states, noise: Union[None, List] = None):
        """get actions from all agents in the MADDPG object"""

        noise = [0] * self.agents_number if noise is None else noise

        tensor_states = torch.tensor(states).view(-1, self.agents_number,
                                                  self.state_dim)
        with torch.no_grad():
            actions = []
            for agent_number, agent in enumerate(self.agents):
                agent.actor.eval()
                actions += agent.act(tensor_states.select(1, agent_number),
                                     noise[agent_number])
                agent.actor.train()

        return torch.stack(actions)

    def learn(self, samples, agent_number: int) -> None:
        """update the critics and actors of all the agents """

        action_offset = agent_number * self.action_dim
        flatten_actions = lambda a: a.view(
            -1, self.agents_number * self.action_dim)

        # No need to flip since there are no parallel agents
        agent_states, states, actions, rewards, agent_next_states, next_states, dones = samples

        agent = self.agents[agent_number]

        next_actions = actions.clone()
        next_actions[:, action_offset:action_offset +
                     self.action_dim] = agent.target_actor(agent_next_states)

        # critic loss
        Q_target_next = agent.target_critic(next_states,
                                            flatten_actions(next_actions))
        Q_target = rewards + (self.discount * Q_target_next * (1 - dones))
        Q_expected = agent.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_target)

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic.parameters(),
                                       self.gradient_clip)
        agent.critic_optimizer.step()

        # Compute actor loss
        pred_actions = actions.clone()
        pred_actions[:, action_offset:action_offset +
                     self.action_dim] = agent.actor(agent_states)

        actor_loss = -agent.critic(states,
                                   flatten_actions(pred_actions)).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        if self.writer:
            self.writer.add_scalar(f'agent{agent_number}/critic_loss',
                                   critic_loss.item(), self.iteration)
            self.writer.add_scalar(f'agent{agent_number}/actor_loss',
                                   abs(actor_loss.item()), self.iteration)

        self._soft_update(agent.target_actor, agent.actor, self.actor_tau)
        self._soft_update(agent.target_critic, agent.critic, self.critic_tau)

    def _soft_update(self, target: nn.Module, source: nn.Module, tau) -> None:
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)
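
A MADDPG instance like the one above is usually driven by an outer episode loop that calls act() for the joint action and step() to store the transition and trigger learning. The driver below is only a sketch: env, state_dim, action_dim and the config values are placeholders, and the multi-agent environment is assumed to return per-agent rewards and done flags.

import numpy as np

config = {'agents_number': 2, 'batch_size': 128,
          'update_every_iterations': 2, 'number_updates': 2}
maddpg = MADDPG(env, state_dim, action_dim, config)

for episode in range(1000):
    states = env.reset()
    while True:
        # act() takes the flattened joint state and returns one action tensor per agent.
        actions = maddpg.act(states).cpu().numpy()
        next_states, rewards, dones, _ = env.step(actions)
        maddpg.step(states, actions, rewards, next_states, dones)
        states = next_states
        if np.any(dones):
            break
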
Example #6
def base_train_loop(args: dict, policy, replay_buffer: ReplayBuffer, env):
    evaluations = [
        eval_policy(policy, args.domain_name, args.task_name, args.seed)
    ]

    timestep = env.reset()
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(args.max_timesteps)):

        episode_timesteps += 1

        state = flat_obs(timestep.observation)

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = np.random.uniform(
                env.action_spec().minimum,
                env.action_spec().maximum,
                size=env.action_spec().shape,
            )
        else:
            action = policy.select_action(state).clip(-args.max_action,
                                                      args.max_action)

        # Perform action
        timestep = env.step(action)
        done_bool = float(timestep.last())

        # Store data in replay buffer
        replay_buffer.add(state, action, flat_obs(timestep.observation),
                          timestep.reward, done_bool)

        episode_reward += timestep.reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            for _ in range(args.train_steps):
                if args.policy == "MPO":
                    policy.train(replay_buffer, args.batch_size,
                                 args.num_action_samples)
                else:
                    policy.train(replay_buffer, args.batch_size)

        if timestep.last():
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} "
                f"Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            timestep = env.reset()
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaluations.append(
                eval_policy(policy, args.domain_name, args.task_name,
                            args.seed))
            np.save(f"./results/{args.file_name}_{t+1}", evaluations)
        if (t + 1) % args.save_freq == 0:
            if args.save_model:
                policy.save(f"./models/{args.file_name}_{t+1}")
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
#         print("Rewards")
#         print(rewards.shape)
        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"

#         self.qnetwork_local.train()
        local_output = self.qnetwork_local(states).gather(1, actions)

        selected_target_actions = torch.max(self.qnetwork_target(next_states).detach(),1)[0].unsqueeze(1)
        activated = torch.sub(torch.Tensor(np.ones(dones.shape)), dones)
        is_it_done = torch.mul(selected_target_actions, activated)
        target_output = torch.add(torch.mul(is_it_done, gamma), rewards)

        loss = F.mse_loss(local_output, target_output)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
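
The Agent class above is normally trained with an epsilon-greedy episode loop; the sketch below assumes a Gym-style env, with the function name and epsilon schedule chosen for illustration.

from collections import deque

import numpy as np

def train_dqn(env, agent, n_episodes=2000, max_t=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = deque(maxlen=100)  # rolling window of episode returns
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration
        print(f"Episode {i_episode}\tAverage score: {np.mean(scores):.2f}", end="\r")
    return scores
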
Example #8
    # main loop.
    for ep in range(MAX_EPISODES):

        episode_reward = 0

        s = ENV.reset()

        for step in range(MAX_EP_STEPS):

            a = actor.predict(np.reshape(s, (1, STATE_DIM))) + actor_noise()
            s2, r, terminal, info = ENV.step(a[0])

            replay_buffer.add(np.reshape(s, (STATE_DIM,)), \
                              np.reshape(a, (ACTION_DIM,)), \
                              r, \
                              terminal, \
                              np.reshape(s2, (STATE_DIM,)))

            # Batch sampling.
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Compute the target Q values.
                target_action = actor.predict_target(s2_batch)
                target_q = critic.predict_target(s2_batch, target_action)

                # Compute the target values for the critic.
                targets = []
                for i in range(MINIBATCH_SIZE):
Example #9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, framework, buffer_type):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            framework (str): 'DQN' or 'DDQN', selects how target Q values are computed
            buffer_type (str): 'ReplayBuffer' or 'PER_ReplayBuffer'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.framework = framework
        self.buffer_type = buffer_type

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # def __init__(self, device, buffer_size, batch_size, alpha, beta):
        if self.buffer_type == 'PER_ReplayBuffer':
            self.memory = PER_ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE,
                                           ALPHA, BETA)
        if self.buffer_type == 'ReplayBuffer':
            self.memory = ReplayBuffer(device, action_size, BUFFER_SIZE,
                                       BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                if self.buffer_type == 'ReplayBuffer':
                    experiences = self.memory.sample()
                    is_weights = None
                    idxs = None
                if self.buffer_type == 'PER_ReplayBuffer':
                    experiences, is_weights, idxs = self.memory.sample()
                    self.criterion = WeightedLoss()
                    #print('debugging:', experiences )
                self.learn(experiences, is_weights, idxs, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # switch back to training mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, idxs, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        if self.framework == 'DQN':
            # Get max predicted Q values (for next states) from target model
            # Target network: updated more slowly than the local network.
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
            #print('Q_targets_next is ', Q_targets_next)
        if self.framework == "DDQN":
            #print("DDQN")
            max_actions = self.qnetwork_local(next_states).detach().argmax(
                1).unsqueeze(1)
            #print('max_actions is ', max_actions)
            #print('self.qnetwork_target(next_states).detach() is ',self.qnetwork_target(next_states).detach())
            #Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, max_actions).squeeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, max_actions)
            #print('DDQN, Q', Q_targets_next)
            #print('rewards is ', rewards)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        #------------------------------------------------------------------------------
        #Huber Loss provides better results than MSE
        if is_weights is None:
            loss = F.smooth_l1_loss(Q_expected, Q_targets)

        #Compute Huber Loss manually to utilize is_weights with Prioritization
        else:
            loss, td_errors = self.criterion.huber(Q_expected, Q_targets,
                                                   is_weights)
            self.memory.batch_update(idxs, td_errors)

        # Perform gradient descent
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
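
Example #9 relies on a WeightedLoss helper whose huber() method returns both the importance-weighted loss and the per-sample TD errors used to refresh priorities in the PER buffer. A minimal sketch consistent with that call signature (the exact reduction and priority definition are assumptions):

import torch
import torch.nn.functional as F

class WeightedLoss:
    """Importance-weighted Huber loss (sketch, not the original implementation)."""

    def huber(self, q_expected, q_targets, is_weights):
        # Per-sample TD errors, returned so the PER buffer can update its priorities.
        td_errors = (q_targets - q_expected).detach().abs()
        # Element-wise Huber (smooth L1) loss, weighted by the importance-sampling weights.
        elementwise = F.smooth_l1_loss(q_expected, q_targets, reduction='none')
        weights = torch.as_tensor(is_weights, dtype=elementwise.dtype, device=elementwise.device)
        loss = (weights.view(-1, 1) * elementwise).mean()
        return loss, td_errors
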