Example #1
class DQNG(DQN):
    def __init__(self, args, env, env_test, logger):
        super(DQNG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal', 'goal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        if args['--imit'] != '0':
            names.append('expVal')
            self.bufferImit = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.critic = CriticDQNG(args, env)

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t, g = [exp[name] for name in self.buffer.names]
            targets_dqn = self.critic.get_targets_dqn(r, t, s1, g)
            inputs = [s0, a0, g]
            loss = self.critic.qvalModel.train_on_batch(inputs, targets_dqn)
            for i, metric in enumerate(self.critic.qvalModel.metrics_names):
                self.metrics[metric] += loss[i]

            if self.args[
                    '--imit'] != '0' and self.bufferImit.nb_entries > self.batch_size:
                exp = self.bufferImit.sample(self.batch_size)
                s0, a0, s1, r, t, g, e = [
                    exp[name] for name in self.bufferImit.names
                ]
                targets_dqn = self.critic.get_targets_dqn(r, t, s1, g)
                targets = [
                    targets_dqn,
                    np.zeros((self.batch_size, 1)),
                    np.zeros((self.batch_size, 1))
                ]
                inputs = [s0, a0, g, e]
                loss = self.critic.imitModel.train_on_batch(inputs, targets)
                for i, metric in enumerate(
                        self.critic.imitModel.metrics_names):
                    self.imitMetrics[metric] += loss[i]

            self.critic.target_train()

    def make_input(self, state, t):
        input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        # temp = self.env.explor_temp(t)
        input.append(np.expand_dims([0.5], axis=0))
        return input
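The ReplayBuffer class these examples rely on is not shown. A minimal dict-keyed sketch compatible with the calls used above (limit/names constructor, append of per-step dicts, sample returning one stacked array per name, nb_entries) could look like the following; the class name and internals are assumptions, and some of the later examples also pass an extra args parameter that is omitted here.

import numpy as np
from collections import deque


class SimpleReplayBuffer:
    """Hypothetical stand-in for the ReplayBuffer used in these examples (assumed API)."""

    def __init__(self, limit, names):
        self.names = names
        self.contents = deque(maxlen=limit)  # each entry: dict keyed by self.names

    @property
    def nb_entries(self):
        return len(self.contents)

    def append(self, experience):
        # experience is expected to be a dict holding one value per name
        self.contents.append({name: experience[name] for name in self.names})

    def sample(self, batch_size):
        idxs = np.random.randint(0, len(self.contents), size=batch_size)
        batch = [self.contents[i] for i in idxs]
        # one stacked column per name, e.g. result['state0'].shape == (batch_size, ...)
        return {name: np.array([b[name] for b in batch]) for name in self.names}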
Example #2
class DDPGG(DDPG):
    def __init__(self, args, env, env_test, logger):
        super(DDPGG, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6),
                                   names=names.copy(),
                                   args=args)
        self.actorCritic = ActorCriticDDPGG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(
                exp['r'], exp['t'], exp['s1'], exp['g'])
            inputs = [exp['s0'], exp['a'], exp['g'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = self.actorCritic.trainActor(
                [exp['s0'], exp['g']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)
            self.actorCritic.target_train()

    def make_input(self, state, mode):
        if mode == 'train':
            input = [np.expand_dims(i, axis=0) for i in [state, self.env.goal]]
        else:
            input = [
                np.expand_dims(i, axis=0) for i in [state, self.env_test.goal]
            ]
        return input

    def reset(self):

        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())
            if self.args['--her'] != '0':
                augmented_ep = self.env.augment_episode(self.trajectory)
                for e in augmented_ep:
                    self.buffer.append(e)
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state
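The HER branch above relies on env.augment_episode, which belongs to the environment wrapper and is not shown here. A minimal "final"-strategy relabeling, assuming dict-based experiences with 's1', 'g', 'r', 't' keys and an environment helper that recomputes reward/terminal for a goal, might look like this sketch (the function and helper names are illustrative):

def augment_episode_final_goal(trajectory, compute_reward):
    """Hypothetical HER 'final' relabeling: replace the goal of every step with
    the goal actually achieved at the end of the episode."""
    if not trajectory:
        return []
    achieved_goal = trajectory[-1]['s1']  # assumption: achieved goal == final state
    augmented = []
    for exp in trajectory:
        new_exp = exp.copy()
        new_exp['g'] = achieved_goal
        # compute_reward is an assumed env helper returning (reward, terminal)
        new_exp['r'], new_exp['t'] = compute_reward(new_exp['s1'], achieved_goal)
        augmented.append(new_exp)
    return augmented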
Example #3
class Qoff(Agent):
    def __init__(self, args, env, env_test, logger):
        super(Qoff, self).__init__(args, env, env_test, logger)
        self.args = args
        self.gamma = 0.99
        self.lr = 0.1
        self.names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.init(args, env)

    def init(self, args, env):
        self.critic = np.zeros(shape=(5, 5, 4))
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)

    def train(self):
        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.names]
            for k in range(self.batch_size):
                target = r[k] + (1 - t[k]) * self.gamma * np.max(
                    self.critic[tuple(s1[k])])
                self.critic[tuple(s0[k])][a0[k]] = self.lr * target + \
                                                       (1 - self.lr) * self.critic[tuple(s0[k])][a0[k]]

    def act(self, state):
        if np.random.rand() < 0.2:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.critic[tuple(state)])
        return action

    def reset(self):

        if self.trajectory:
            self.env.processEp(self.trajectory)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state
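The inner loop of Qoff.train is the standard tabular Q-learning update Q(s,a) ← (1-α)Q(s,a) + α[r + γ max_a' Q(s',a')]. A tiny self-contained illustration of the same blend on a single made-up transition:

import numpy as np

gamma, lr = 0.99, 0.1
Q = np.zeros((5, 5, 4))                            # same (row, col, action) shape as above
s0, a0, s1, r, terminal = (1, 1), 2, (1, 2), 1.0, 0

target = r + (1 - terminal) * gamma * np.max(Q[s1])
Q[s0][a0] = lr * target + (1 - lr) * Q[s0][a0]     # Q[(1, 1)][2] becomes 0.1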
Example #4
File: sac.py Project: vietbt/RLpp
class SAC:
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=0.005,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 ent_coef='auto',
                 random_exploration=0.0,
                 discrete=True,
                 regularized=True,
                 feature_extraction="cnn"):
        self.env = env
        self.learning_starts = learning_starts
        self.random_exploration = random_exploration
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.gradient_steps = gradient_steps
        self.learning_rate = learning_rate

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.agent = SACAgent(self.sess,
                                  env,
                                  discrete=discrete,
                                  regularized=regularized,
                                  feature_extraction=feature_extraction)
            self.model = SACModel(self.sess, self.agent, target_entropy,
                                  ent_coef, gamma, tau)
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.model.target_init_op)
        self.num_timesteps = 0

    def train(self, learning_rate):
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = self.replay_buffer.sample(
            self.batch_size)
        # print("batch_actions:", batch_actions.shape)
        # print("self.agent.actions_ph:", self.agent.actions_ph)

        feed_dict = {
            self.agent.obs_ph: batch_obs,
            self.agent.next_obs_ph: batch_next_obs,
            self.model.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.model.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.model.learning_rate_ph: learning_rate
        }
        if not self.agent.discrete:
            feed_dict[self.agent.actions_ph] = batch_actions
        else:
            batch_actions = batch_actions.reshape(-1)
            feed_dict[self.agent.actions_ph] = batch_actions
        policy_loss, qf1_loss, qf2_loss, value_loss, *values = self.sess.run(
            self.model.step_ops, feed_dict)
        return policy_loss, qf1_loss, qf2_loss

    def learn(self, total_timesteps):
        learning_rate = get_schedule_fn(self.learning_rate)
        episode_rewards = [0]
        mb_losses = []
        obs = self.env.reset()
        for step in range(total_timesteps):
            if self.num_timesteps < self.learning_starts or np.random.rand(
            ) < self.random_exploration:
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.env.action_space, unscaled_action)
            else:
                action = self.agent.step(obs[None]).flatten()
                unscaled_action = unscale_action(self.env.action_space, action)
            # print("\nunscaled_action:", unscaled_action)
            new_obs, reward, done, _ = self.env.step(unscaled_action)
            self.num_timesteps += 1
            self.replay_buffer.add(obs, action, reward, new_obs, done)
            obs = new_obs

            if self.num_timesteps % self.train_freq == 0:
                for grad_step in range(self.gradient_steps):
                    if not self.replay_buffer.can_sample(
                            self.batch_size
                    ) or self.num_timesteps < self.learning_starts:
                        break
                    frac = 1.0 - step / total_timesteps
                    current_lr = learning_rate(frac)
                    mb_losses.append(self.train(current_lr))
                    if (step + grad_step) % self.target_update_interval == 0:
                        self.sess.run(self.model.target_update_op)

            episode_rewards[-1] += reward
            if done:
                obs = self.env.reset()
                episode_rewards.append(0)

            mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
            loss_str = "/".join([f"{x:.3f}" for x in np.mean(mb_losses, 0)
                                 ]) if len(mb_losses) > 0 else "NaN"
            print(f"Step {step} - reward: {mean_reward} - loss: {loss_str}",
                  end="\n" if step % 500 == 0 else "\r")
Example #5
class DDPG(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)
        for metric in self.critic.model.metrics_names:
            self.metrics[self.critic.model.name + '_' + metric] = 0

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.actorCritic = ActorCriticDDPG(args, env)
        # the constructor and train() reference self.critic / self.actor directly
        self.critic = CriticDDPG(args, env)
        self.actor = ActorDDPG(args, env)

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]
            a1 = self.actor.target_model.predict_on_batch(s1)
            a1 = np.clip(a1, self.env.action_space.low,
                         self.env.action_space.high)
            q = self.critic.Tmodel.predict_on_batch([s1, a1])
            targets = r + (1 - t) * self.critic.gamma * np.squeeze(q)
            targets = np.clip(targets, self.env.minR / (1 - self.critic.gamma),
                              self.env.maxR)
            inputs = [s0, a0]
            loss = self.critic.model.train_on_batch(inputs, targets)
            for i, metric in enumerate(self.critic.model.metrics_names):
                self.metrics[metric] += loss[i]

            # a2 = self.actor.model.predict_on_batch(s0)
            # grads = self.critic.gradsModel.predict_on_batch([s0, a2])
            # low = self.env.action_space.low
            # high = self.env.action_space.high
            # for d in range(grads[0].shape[0]):
            #     width = high[d] - low[d]
            #     for k in range(self.batch_size):
            #         if grads[k][d] >= 0:
            #             grads[k][d] *= (high[d] - a2[k][d]) / width
            #         else:
            #             grads[k][d] *= (a2[k][d] - low[d]) / width
            # self.actor.train(s0, grads)

            self.actor.target_train()
            self.critic.target_train()

    def reset(self):

        if self.trajectory:
            T = int(self.trajectory[-1]['terminal'])
            R = np.sum([
                self.env.unshape(exp['reward'], exp['terminal'])
                for exp in self.trajectory
            ])
            S = len(self.trajectory)
            self.env.processEp(R, S, T)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())

            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def make_input(self, state):
        input = [np.reshape(state, (1, self.actor.s_dim[0]))]
        return input

    def act(self, state):
        input = self.make_input(state)
        action = self.actor.model.predict(input, batch_size=1)
        noise = np.random.normal(0., 0.1, size=action.shape)
        action = noise + action
        action = np.clip(action, self.env.action_space.low,
                         self.env.action_space.high)
        action = action.squeeze()
        return action
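target_train() is defined on the actor/critic classes and is not shown. In Keras-style DDPG implementations it is usually a Polyak (soft) update of the target weights; a sketch under that assumption:

def soft_target_update(model, target_model, tau=0.001):
    """Hypothetical Polyak update: target <- tau * online + (1 - tau) * target."""
    online_weights = model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [tau * w + (1.0 - tau) * tw
                   for w, tw in zip(online_weights, target_weights)]
    target_model.set_weights(new_weights)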
Example #6
class MADDPG:
    def __init__(self,
                 env,
                 state_dim: int,
                 action_dim: int,
                 config: Dict,
                 device=None,
                 writer=None):
        self.logger = logging.getLogger("MADDPG")
        self.device = device if device is not None else DEVICE
        self.writer = writer

        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents_number = config['agents_number']

        hidden_layers = config.get('hidden_layers', (400, 300))
        noise_scale = config.get('noise_scale', 0.2)
        noise_sigma = config.get('noise_sigma', 0.1)
        actor_lr = config.get('actor_lr', 1e-3)
        actor_lr_decay = config.get('actor_lr_decay', 0)
        critic_lr = config.get('critic_lr', 1e-3)
        critic_lr_decay = config.get('critic_lr_decay', 0)
        self.actor_tau = config.get('actor_tau', 0.002)
        self.critic_tau = config.get('critic_tau', 0.002)
        create_agent = lambda: DDPGAgent(state_dim,
                                         action_dim,
                                         agents=self.agents_number,
                                         hidden_layers=hidden_layers,
                                         actor_lr=actor_lr,
                                         actor_lr_decay=actor_lr_decay,
                                         critic_lr=critic_lr,
                                         critic_lr_decay=critic_lr_decay,
                                         noise_scale=noise_scale,
                                         noise_sigma=noise_sigma,
                                         device=self.device)
        self.agents = [create_agent() for _ in range(self.agents_number)]

        self.discount = 0.99 if 'discount' not in config else config['discount']
        self.gradient_clip = 1.0 if 'gradient_clip' not in config else config[
            'gradient_clip']

        self.warm_up = 1e3 if 'warm_up' not in config else config['warm_up']
        self.buffer_size = int(
            1e6) if 'buffer_size' not in config else config['buffer_size']
        self.batch_size = config.get('batch_size', 128)
        self.p_batch_size = config.get('p_batch_size',
                                       int(self.batch_size // 2))
        self.n_batch_size = config.get('n_batch_size',
                                       int(self.batch_size // 4))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.update_every_iterations = config.get('update_every_iterations', 2)
        self.number_updates = config.get('number_updates', 2)

        self.reset()

    def reset(self):
        self.iteration = 0
        self.reset_agents()

    def reset_agents(self):
        for agent in self.agents:
            agent.reset_agent()

    def step(self, state, action, reward, next_state, done) -> None:
        if np.isnan(state).any() or np.isnan(next_state).any():
            print("State contains NaN. Skipping.")
            return

        self.iteration += 1
        self.buffer.add(state, action, reward, next_state, done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (
                self.iteration % self.update_every_iterations) == 0:
            self.evok_learning()

    def filter_batch(self, batch, agent_number):
        states, actions, rewards, next_states, dones = batch
        agent_states = states[:, agent_number *
                              self.state_dim:(agent_number + 1) *
                              self.state_dim].clone()
        agent_next_states = next_states[:, agent_number *
                                        self.state_dim:(agent_number + 1) *
                                        self.state_dim].clone()
        agent_rewards = rewards.select(1, agent_number).view(-1, 1).clone()
        agent_dones = dones.select(1, agent_number).view(-1, 1).clone()
        return (agent_states, states, actions, agent_rewards,
                agent_next_states, next_states, agent_dones)

    def evok_learning(self):
        for _ in range(self.number_updates):
            for agent_number in range(self.agents_number):
                batch = self.filter_batch(self.buffer.sample(), agent_number)
                self.learn(batch, agent_number)
                # self.update_targets()

    def act(self, states, noise: Union[None, List] = None):
        """get actions from all agents in the MADDPG object"""

        noise = [0] * self.agents_number if noise is None else noise

        tensor_states = torch.tensor(states).view(-1, self.agents_number,
                                                  self.state_dim)
        with torch.no_grad():
            actions = []
            for agent_number, agent in enumerate(self.agents):
                agent.actor.eval()
                actions += agent.act(tensor_states.select(1, agent_number),
                                     noise[agent_number])
                agent.actor.train()

        return torch.stack(actions)

    def learn(self, samples, agent_number: int) -> None:
        """update the critics and actors of all the agents """

        action_offset = agent_number * self.action_dim
        flatten_actions = lambda a: a.view(
            -1, self.agents_number * self.action_dim)

        # No need to flip since there are no parallel agents
        agent_states, states, actions, rewards, agent_next_states, next_states, dones = samples

        agent = self.agents[agent_number]

        next_actions = actions.clone()
        next_actions[:, action_offset:action_offset +
                     self.action_dim] = agent.target_actor(agent_next_states)

        # critic loss
        Q_target_next = agent.target_critic(next_states,
                                            flatten_actions(next_actions))
        Q_target = rewards + (self.discount * Q_target_next * (1 - dones))
        Q_expected = agent.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_target)

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic.parameters(),
                                       self.gradient_clip)
        agent.critic_optimizer.step()

        # Compute actor loss
        pred_actions = actions.clone()
        pred_actions[:, action_offset:action_offset +
                     self.action_dim] = agent.actor(agent_states)

        actor_loss = -agent.critic(states,
                                   flatten_actions(pred_actions)).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        if self.writer:
            self.writer.add_scalar(f'agent{agent_number}/critic_loss',
                                   critic_loss.item(), self.iteration)
            self.writer.add_scalar(f'agent{agent_number}/actor_loss',
                                   abs(actor_loss.item()), self.iteration)

        self._soft_update(agent.target_actor, agent.actor, self.actor_tau)
        self._soft_update(agent.target_critic, agent.critic, self.critic_tau)

    def _soft_update(self, target: nn.Module, source: nn.Module, tau) -> None:
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)
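A minimal driving loop for this class, assuming a Gym-like multi-agent wrapper that returns flat per-agent observations; the config values and dimensions below are illustrative:

config = {'agents_number': 2, 'batch_size': 128, 'warm_up': 1000}   # illustrative values
maddpg = MADDPG(env, state_dim=24, action_dim=2, config=config)     # dims are assumptions

states = env.reset()                       # assumed Gym-like reset/step API
for _ in range(10000):
    actions = maddpg.act(states).cpu().numpy().reshape(-1)
    next_states, rewards, dones, _ = env.step(actions)
    maddpg.step(states, actions, rewards, next_states, dones)
    states = env.reset() if any(dones) else next_states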
Example #7
class DQNAgent():
    """Deep Q-learning agent."""

    # def __init__(self,
    # env, device=DEVICE, summary_writer=writer,  # noqa
    # hyperparameters=DQN_HYPERPARAMS):  # noqa

    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    # Memory = namedtuple(
    # 'Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
    # verbose=False, rename=False)
    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""

        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n

        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)

        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # XXX ???
        self.n_multi_step = hyperparameters['n_multi_step']

        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

    def get_max_action(self, obs):
        '''
        Forward pass of the NN to obtain the greedy action for the given observation
        '''
        # convert the observation in tensor
        state_t = torch.tensor(np.array([obs])).to(self.device)

        # forward pass
        q_values_t = self.online_network(state_t)

        # get the maximum value of the output (i.e. the best action to take)
        _, act_t = torch.max(q_values_t, dim=1)

        return int(act_t.item())

    def act(self, obs):
        '''
        Greedy action output by the online network
        '''
        return self.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        E-greedy action
        '''

        # In case of a noisy net, it takes a greedy action
        # if self.noisy_net:
        # return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def update_target_network(self):
        """Update target network weights with current online network values."""

        self.target_network.load_state_dict(self.online_network.state_dict())

    def set_optimizer(self, learning_rate):
        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=learning_rate)

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize them
        '''

        # This should be the part where it waits until it has enough
        # experience
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            # l_loss = self.cc.optimize(mini_batch)
            l_loss = self.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.update_target_network()

    def optimize(self, mini_batch):
        '''
        Optimize the NN
        '''
        # reset the grads
        self.optimizer.zero_grad()
        # calculate the loss of the mini-batch
        loss = self._calulate_loss(mini_batch)
        loss_v = loss.item()

        # do backpropagation
        loss.backward()
        # one step of optimization
        self.optimizer.step()

        return loss_v

    def _calulate_loss(self, mini_batch):
        '''
        Calculate the mini-batch's MSE loss.
        It also supports the double DQN variant.
        '''

        states, actions, next_states, rewards, dones = mini_batch

        # convert the data in tensors
        states_t = torch.as_tensor(states, device=self.device)
        next_states_t = torch.as_tensor(next_states, device=self.device)
        actions_t = torch.as_tensor(actions, device=self.device)
        rewards_t = torch.as_tensor(rewards,
                                    dtype=torch.float32,
                                    device=self.device)

        done_t = torch.as_tensor(dones, dtype=torch.uint8,
                                 device=self.device)  # noqa

        # Value of the action taken previously (recorded in actions_v)
        # in state_t
        state_action_values = self.online_network(states_t).gather(
            1, actions_t[:, None]).squeeze(-1)

        # NB gather is a differentiable function

        # Next state value with Double DQN. (i.e. get the value predicted
        # by the target nn, of the best action predicted by the online nn)
        if self.double_DQN:
            double_max_action = self.online_network(next_states_t).max(1)[1]
            double_max_action = double_max_action.detach()
            target_output = self.target_network(next_states_t)

            # NB: [:,None] add an extra dimension
            next_state_values = torch.gather(
                target_output, 1, double_max_action[:, None]).squeeze(-1)

        # Next state value in the normal configuration
        else:
            next_state_values = self.target_network(next_states_t).max(1)[0]

        next_state_values = next_state_values.detach()  # No backprop

        # Use the Bellman equation
        expected_state_action_values = rewards_t + \
            (self.gamma**self.n_multi_step) * next_state_values

        # compute the loss
        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire a new feedback from the environment. The feedback is
        constituted by the new observation, the reward and the done boolean.
        '''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs,
                                 action=action,
                                 new_obs=new_obs,
                                 reward=reward,
                                 done=done)

        # Append it to the replay buffer
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1

        # TODO check this...
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)

        self.total_reward += reward

    def print_info(self):
        '''
        Print information about the agent
        '''

        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)

        # TODO replace with proper logger
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
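The class expects a hyperparameters dict with the keys read in __init__. A sketch of one way to wire it into a training loop; the values and the environment id are illustrative:

import gym
import torch

HYPERPARAMS = {                           # illustrative values for the keys read in __init__
    'learning_rate': 1e-4, 'double_DQN': True, 'gamma': 0.99, 'n_multi_step': 2,
    'buffer_capacity': 100000, 'n_iter_update_target': 1000, 'buffer_start_size': 10000,
    'epsilon_start': 1.0, 'epsilon_decay': 10000, 'epsilon_final': 0.02,
}

env = gym.make("PongNoFrameskip-v4")      # assumed environment matching the DQN network
agent = DQNAgent(env, HYPERPARAMS, device=torch.device("cpu"))

obs = env.reset()
for _ in range(100000):
    action = agent.act_eps_greedy(obs)
    new_obs, reward, done, _ = env.step(action)
    agent.add_env_feedback(obs, action, new_obs, reward, done)
    agent.sample_and_optimize(batch_size=32)
    obs = new_obs
    if done:
        agent.print_info()
        agent.reset_stats()
        obs = env.reset()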
Example #8
class TD3:
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=1e-3,
                 pol_lr=1e-4,
                 q_lr=5e-3,
                 batch_size=64,
                 buffer_size=10000,
                 target_noise=0.2,
                 action_noise=0.1,
                 clip_range=0.5,
                 update_delay=2):

        # environment stuff
        self.env = env
        self.num_act = env.action_space.shape[0]
        self.num_obs = env.observation_space.shape[0]
        self.eval_env = copy.deepcopy(env)

        # hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.pol_lr = pol_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.target_noise = target_noise
        self.action_noise = action_noise
        self.clip_range = clip_range
        self.update_delay = update_delay

        # networks
        self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
        self.q1 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.q2 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.pol.init_weights()
        self.q1.init_weights()
        self.q2.init_weights()
        self.target_pol = copy.deepcopy(self.pol).double()
        self.target_q1 = copy.deepcopy(self.q1).double()
        self.target_q2 = copy.deepcopy(self.q2).double()

        # optimizers, buffer
        self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
        self.q1_opt = torch.optim.Adam(
            self.q1.parameters(),
            lr=self.q_lr,
        )
        self.q2_opt = torch.optim.Adam(
            self.q2.parameters(),
            lr=self.q_lr,
        )
        self.buffer = ReplayBuffer(self.buffer_size, 1000)
        self.mse_loss = torch.nn.MSELoss()

        self.cum_q1_loss = 0
        self.cum_q2_loss = 0
        self.cum_obj = 0

    def noise(self, noise, length):
        return torch.tensor(np.random.multivariate_normal(
            mean=np.array([0.0 for i in range(length)]),
            cov=np.diag([noise for i in range(length)])),
                            dtype=torch.double)

    # fill up buffer first
    def prep_buffer(self):
        obs = self.env.reset()
        while not self.buffer.ready:
            pre_obs = obs
            action = self.env.action_space.sample()
            obs, reward, done, _ = self.env.step(action)
            self.buffer.insert((pre_obs, action, reward, obs, done))
            if done: obs = self.env.reset()

    # for clipping off values
    def clip(self, x, l, u):
        if isinstance(l, (list, np.ndarray)):
            lower = torch.tensor(l, dtype=torch.double)
            upper = torch.tensor(u, dtype=torch.double)
        elif isinstance(l, (int, float)):
            lower = torch.tensor([l for i in range(len(x))],
                                 dtype=torch.double)
            upper = torch.tensor([u for i in range(len(x))],
                                 dtype=torch.double)
        else:
            raise TypeError("clip() expects list/ndarray or int/float bounds")

        return torch.max(torch.min(x, upper), lower)

    # update neural net
    def update_networks(self):
        # (pre_obs, action, reward, obs, done)
        pre_obs = torch.tensor(self.batch[0], dtype=torch.double)
        actions = torch.tensor(self.batch[1], dtype=torch.double)
        rewards = torch.tensor(self.batch[2], dtype=torch.double)
        obs = torch.tensor(self.batch[3], dtype=torch.double)
        done = torch.tensor(self.batch[4], dtype=torch.double).unsqueeze(1)

        self.q1_opt.zero_grad()
        self.q2_opt.zero_grad()
        noise = self.clip(self.noise(self.target_noise, self.num_act),
                          -self.clip_range, self.clip_range)
        target_action = self.clip(
            self.target_pol(obs) + noise, self.env.action_space.low,
            self.env.action_space.high)
        target_q1_val = self.target_q1(obs, target_action)
        target_q2_val = self.target_q2(obs, target_action)
        y = rewards + (self.gamma *
                       (1.0 - done) * torch.min(target_q1_val, target_q2_val))
        # loss = torch.sum((y - self.q(pre_obs, actions)) ** 2) / self.batch_size
        q1_loss = self.mse_loss(self.q1(pre_obs, actions), y)
        q2_loss = self.mse_loss(self.q2(pre_obs, actions), y)
        q1_loss.backward(retain_graph=True)
        q2_loss.backward()
        self.q1_opt.step()
        self.q2_opt.step()
        self.cum_q1_loss += q1_loss.item()
        self.cum_q2_loss += q2_loss.item()

        self.pol_opt.zero_grad()
        objective = -self.q1(pre_obs, self.pol(pre_obs)).mean()
        objective.backward()
        self.pol_opt.step()
        self.cum_obj += objective.item()

    # update target networks with tau
    def update_target_networks(self):
        for target, actual in zip(self.target_q1.named_parameters(),
                                  self.q1.named_parameters()):
            target[1].data.copy_(self.tau * actual[1].data +
                                 (1 - self.tau) * target[1].data)
        for target, actual in zip(self.target_q2.named_parameters(),
                                  self.q2.named_parameters()):
            target[1].data.copy_(self.tau * actual[1].data +
                                 (1 - self.tau) * target[1].data)
        for target, actual in zip(self.target_pol.named_parameters(),
                                  self.pol.named_parameters()):
            target[1].data.copy_(self.tau * actual[1].data +
                                 (1 - self.tau) * target[1].data)

    def policy_eval(self):
        state = self.eval_env.reset()
        done = False
        rewards = []
        while not done:
            inp = torch.tensor(state, dtype=torch.double)
            action = self.pol(inp)
            action = action.detach().numpy()
            next_state, r, done, _ = self.eval_env.step(action)
            rewards.append(r)
            # self.eval_env.render()
            # time.sleep(0.1)
            state = next_state

        total = sum(rewards)
        return total

    def train(self, num_iters=200000, eval_len=1000, render=False):
        print("Start")
        if render: self.env.render('human')
        self.prep_buffer()
        obs = self.env.reset()
        iter_info = []

        # train for num_iters
        for i in range(int(num_iters / eval_len)):
            for j in trange(eval_len):
                # one step and put into buffer
                pre_obs = obs
                inp = torch.tensor(obs, dtype=torch.double)
                action = self.pol(inp)
                action = action + self.noise(self.action_noise, self.num_act)
                action = action.detach().numpy()
                obs, reward, done, _ = self.env.step(action)
                self.buffer.insert((pre_obs, action, reward, obs, done))
                if render:
                    self.env.render('human')
                    time.sleep(0.000001)
                if done: obs = self.env.reset()

                # TD3 updates less often
                if j % self.update_delay == 0:
                    # sample from buffer, train one step, update target networks
                    self.batch = self.buffer.sample(self.batch_size)
                    self.update_networks()
                    self.update_target_networks()

            iter_reward = self.policy_eval()
            avg_q1_loss = self.cum_q1_loss / ((i + 1) * eval_len)
            avg_q2_loss = self.cum_q2_loss / ((i + 1) * eval_len)
            avg_obj = self.cum_obj / ((i + 1) * eval_len)
            print("Iteration {}/{}".format((i + 1) * eval_len, num_iters))
            print("Rewards: {} | Q Loss: {}, {} | Policy Objective: {}".format(
                iter_reward, avg_q1_loss, avg_q2_loss, avg_obj))
            iter_info.append((iter_reward, avg_q1_loss, avg_q2_loss, avg_obj))

        return iter_info
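A minimal way this class might be used, assuming a continuous-control Gym environment; the environment id and numbers are illustrative:

import gym

env = gym.make("Pendulum-v1")                        # assumed continuous-action environment
agent = TD3(env, batch_size=64, buffer_size=10000)
iter_info = agent.train(num_iters=20000, eval_len=1000, render=False)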
Example #9
class DQNAgent():
    '''
	Agent class. It controls all the agent's functionality
	'''
    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, device, hyperparameters, summary_writer=None):
        '''
		Agent initialization. It creates the CentralControl that controls all the low-level operations.
		'''

        # The CentralControl is the 'brain' of the agent
        self.cc = CentralControl(env.observation_space.shape,
                                 env.action_space.n, hyperparameters['gamma'],
                                 hyperparameters['n_multi_step'],
                                 hyperparameters['double_DQN'],
                                 hyperparameters['noisy_net'],
                                 hyperparameters['dueling'], device)

        self.cc.set_optimizer(hyperparameters['learning_rate'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

        self.accumulated_loss = []
        self.device = device

        # initialize the replay buffer (i.e. the memory) of the agent
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = summary_writer

        self.noisy_net = hyperparameters['noisy_net']

        self.env = env

    def act(self, obs):
        '''
		Greedy action outputted by the NN in the CentralControl
		'''
        return self.cc.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
		E-greedy action
		'''

        # In case of a noisy net, it takes a greedy action
        if self.noisy_net:
            return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
		Acquire a new feedback from the environment. The feedback is constituted by the new observation, the reward and the done boolean.
		'''

        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs,
                                 action=action,
                                 new_obs=new_obs,
                                 reward=reward,
                                 done=done)
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)
        self.total_reward += reward

    def sample_and_optimize(self, batch_size):
        '''
		Sample batch_size memories from the buffer and optimize them
		'''

        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            l_loss = self.cc.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.cc.update_target()

    def reset_stats(self):
        '''
		Reset the agent's statistics
		'''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def print_info(self):
        '''
		Print information about the agent
		'''
        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
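Both DQN agents anneal epsilon with the same linear schedule used in add_env_feedback; a tiny self-contained illustration with made-up hyperparameter values:

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.02, 10000

def epsilon_at(n_iter):
    # same linear decay, clipped at epsilon_final, as in add_env_feedback above
    return max(epsilon_final, epsilon_start - n_iter / epsilon_decay)

print([round(epsilon_at(n), 2) for n in (0, 2500, 5000, 9800, 20000)])
# -> [1.0, 0.75, 0.5, 0.02, 0.02]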
Example #10
class D4PG_Agent:
    """
    PyTorch Implementation of D4PG:
    "Distributed Distributional Deterministic Policy Gradients"
    (Barth-Maron, Hoffman, et al., 2018)
    As described in the paper at: https://arxiv.org/pdf/1804.08617.pdf

    Much thanks also to the original DDPG paper:
    "Continuous Control with Deep Reinforcement Learning"
    (Lillicrap, Hunt, et al., 2016)
    https://arxiv.org/pdf/1509.02971.pdf

    And to:
    "A Distributional Perspective on Reinforcement Learning"
    (Bellemare, Dabney, et al., 2017)
    https://arxiv.org/pdf/1707.06887.pdf

    D4PG utilizes distributional value estimation, n-step returns,
    prioritized experience replay (PER), distributed K-actor exploration,
    and off-policy actor-critic learning to achieve very fast and stable
    learning for continuous control tasks.

    This version of the Agent is written to interact with Udacity's
    Continuous Control robotic arm manipulation environment which provides
    20 simultaneous actors, negating the need for K-actor implementation.
    Thus, this code has no multiprocessing functionality. It could be easily
    added as part of the main.py script.

    In the original D4PG paper, it is suggested in the data that PER does
    not have significant (or perhaps any at all) effect on the speed or
    stability of learning. Thus, it too has been left out of this
    implementation but may be added as a future TODO item.
    """
    def __init__(self,
                 env,
                 args,
                 e_decay=1,
                 e_min=0.05,
                 l2_decay=0.0001,
                 update_type="hard"):
        """
        Initialize a D4PG Agent.
        """

        self.device = args.device
        self.framework = "D4PG"
        self.eval = args.eval
        self.agent_count = env.agent_count
        self.actor_learn_rate = args.actor_learn_rate
        self.critic_learn_rate = args.critic_learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.action_size = env.action_size
        self.state_size = env.state_size
        self.C = args.C
        self._e = args.e
        self.e_decay = e_decay
        self.e_min = e_min
        self.gamma = args.gamma
        self.rollout = args.rollout
        self.tau = args.tau
        self.update_type = update_type

        self.num_atoms = args.num_atoms
        self.vmin = args.vmin
        self.vmax = args.vmax
        self.atoms = torch.linspace(self.vmin, self.vmax,
                                    self.num_atoms).to(self.device)

        self.t_step = 0
        self.episode = 0

        # Set up memory buffers, currently only standard replay is implemented #
        self.memory = ReplayBuffer(self.device, self.buffer_size, self.gamma,
                                   self.rollout)

        #                    Initialize ACTOR networks                         #
        self.actor = ActorNet(args.layer_sizes, self.state_size,
                              self.action_size).to(self.device)
        self.actor_target = ActorNet(args.layer_sizes, self.state_size,
                                     self.action_size).to(self.device)
        self._hard_update(self.actor, self.actor_target)
        self.actor_optim = optim.Adam(self.actor.parameters(),
                                      lr=self.actor_learn_rate,
                                      weight_decay=l2_decay)

        #                   Initialize CRITIC networks                         #
        self.critic = CriticNet(args.layer_sizes, self.state_size,
                                self.action_size,
                                self.num_atoms).to(self.device)
        self.critic_target = CriticNet(args.layer_sizes, self.state_size,
                                       self.action_size,
                                       self.num_atoms).to(self.device)
        self._hard_update(self.critic, self.critic_target)
        self.critic_optim = optim.Adam(self.critic.parameters(),
                                       lr=self.critic_learn_rate,
                                       weight_decay=l2_decay)

        self.new_episode()

    def act(self, states, eval=False):
        """
        Predict an action using a policy/ACTOR network π.
        Scaled noise N (gaussian distribution) is added to all actions to
        encourage exploration.
        """

        states = states.to(self.device)
        with torch.no_grad():
            actions = self.actor(states).detach().cpu().numpy()
        if not eval:
            noise = self._gauss_noise(actions.shape)
            actions += noise
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, pretrain=False):
        """
        Add the current SARS' tuple into the short term memory, then learn
        """

        # Current SARS' stored in short term memory, then stacked for NStep
        experience = list(zip(states, actions, rewards, next_states))
        self.memory.store_experience(experience)
        self.t_step += 1

        # Learn after done pretraining
        if not pretrain:
            self.learn()

    def learn(self):
        """
        Performs a distributional Actor/Critic calculation and update.
        Actor πθ and πθ'
        Critic Zw and Zw' (categorical distribution)
        """

        # Sample from replay buffer, REWARDS are sum of ROLLOUT timesteps
        # Already calculated before storing in the replay buffer.
        # NEXT_STATES are ROLLOUT steps ahead of STATES
        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = batch
        atoms = self.atoms.unsqueeze(0)
        # Calculate Yᵢ from target networks using πθ' and Zw'
        # These tensors are not needed for backpropogation, so detach from the
        # calculation graph (literally doubles runtime if this is not detached)
        target_dist = self._get_targets(rewards, next_states).detach()

        # Calculate log probability DISTRIBUTION using Zw w.r.t. stored actions
        log_probs = self.critic(states, actions, log=True)

        # Calculate the critic network LOSS (Cross Entropy), CE-loss is ideal
        # for categorical value distributions as utilized in D4PG.
        # estimates distance between target and projected values
        critic_loss = -(target_dist * log_probs).sum(-1).mean()

        # Predict action for actor network loss calculation using πθ
        predicted_action = self.actor(states)

        # Predict value DISTRIBUTION using Zw w.r.t. action predicted by πθ
        probs = self.critic(states, predicted_action)

        # Multiply probabilities by atom values and sum across columns to get
        # Q-Value
        expected_reward = (probs * atoms).sum(-1)

        # Calculate the actor network LOSS (Policy Gradient)
        # Take the mean across the batch and multiply in the negative to
        # perform gradient ascent
        actor_loss = -expected_reward.mean()

        # Perform gradient ascent
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        # Perform gradient descent
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        self._update_networks()

        self.actor_loss = actor_loss.item()
        self.critic_loss = critic_loss.item()

    def initialize_memory(self, pretrain_length, env):
        """
        Fills up the ReplayBuffer memory with PRETRAIN_LENGTH number of experiences
        before training begins.
        """

        if len(self.memory) >= pretrain_length:
            print("Memory already filled, length: {}".format(len(self.memory)))
            return

        print("Initializing memory buffer.")
        states = env.states
        while len(self.memory) < pretrain_length:
            actions = np.random.uniform(-1, 1,
                                        (self.agent_count, self.action_size))
            next_states, rewards, dones = env.step(actions)
            self.step(states, actions, rewards, next_states, pretrain=True)
            if self.t_step % 10 == 0 or len(self.memory) >= pretrain_length:
                print("Taking pretrain step {}... memory filled: {}/{}\
                    ".format(self.t_step, len(self.memory), pretrain_length))

            states = next_states
        print("Done!")
        self.t_step = 0

    def _get_targets(self, rewards, next_states):
        """
        Calculate Yᵢ from target networks using πθ' and Zw'
        """

        target_actions = self.actor_target(next_states)
        target_probs = self.critic_target(next_states, target_actions)
        # Project the categorical distribution onto the supports
        projected_probs = self._categorical(rewards, target_probs)
        return projected_probs

    def _categorical(self, rewards, probs):
        """
        Returns the projected value distribution for the input state/action pair

        While there are several very similar implementations of this Categorical
        Projection methodology around github, this function owes the most
        inspiration to Zhang Shangtong and his excellent repository located at:
        https://github.com/ShangtongZhang
        """

        # Create local vars to keep code more concise
        vmin = self.vmin
        vmax = self.vmax
        atoms = self.atoms
        num_atoms = self.num_atoms
        gamma = self.gamma
        rollout = self.rollout

        rewards = rewards.unsqueeze(-1)
        delta_z = (vmax - vmin) / (num_atoms - 1)

        # Rewards were stored with 0->(N-1) summed, take Reward and add it to
        # the discounted expected reward at N (ROLLOUT) timesteps
        projected_atoms = rewards + gamma**rollout * atoms.unsqueeze(0)
        projected_atoms.clamp_(vmin, vmax)
        b = (projected_atoms - vmin) / delta_z

        # It seems that on professional level GPUs (for instance on AWS), the
        # floating point math is accurate to the degree that a tensor printing
        # as 99.00000 might in fact be 99.000000001 in the backend, perhaps due
        # to binary imprecision, but resulting in 99.00000...ceil() evaluating
        # to 100 instead of 99. Forcibly reducing the precision to the minimum
        # seems to be the only solution to this problem, and presents no issues
        # to the accuracy of calculating lower/upper_bound correctly.
        precision = 1
        b = torch.round(b * 10**precision) / 10**precision
        lower_bound = b.floor()
        upper_bound = b.ceil()

        m_lower = (upper_bound +
                   (lower_bound == upper_bound).float() - b) * probs
        m_upper = (b - lower_bound) * probs

        projected_probs = torch.tensor(np.zeros(probs.size())).to(self.device)

        for idx in range(probs.size(0)):
            projected_probs[idx].index_add_(0, lower_bound[idx].long(),
                                            m_lower[idx].double())
            projected_probs[idx].index_add_(0, upper_bound[idx].long(),
                                            m_upper[idx].double())
        return projected_probs.float()

    @property
    def e(self):
        """
        This property ensures that the annealing process is run every time that
        E is called.
        Anneals the epsilon rate down to a specified minimum to ensure there is
        always some noisiness to the policy actions. Returns as a property.
        """

        self._e = max(self.e_min, self._e * self.e_decay)
        return self._e

    def _gauss_noise(self, shape):
        """
        Returns the epsilon scaled noise distribution for adding to Actor
        calculated action policy.
        """

        n = np.random.normal(0, 1, shape)
        return self.e * n

    def new_episode(self):
        """
        Handle any cleanup or steps to begin a new episode of training.
        """

        self.memory.init_n_step()
        self.episode += 1

    def _update_networks(self):
        """
        Updates the network using either DDPG-style soft updates (w/ param
        TAU), or using a DQN/D4PG style hard update every C timesteps.
        """

        if self.update_type == "soft":
            self._soft_update(self.actor, self.actor_target)
            self._soft_update(self.critic, self.critic_target)
        elif self.t_step % self.C == 0:
            self._hard_update(self.actor, self.actor_target)
            self._hard_update(self.critic, self.critic_target)

    def _soft_update(self, active, target):
        """
        Slowly updates the network using every-step partial network copies
        modulated by parameter TAU.
        """

        for t_param, param in zip(target.parameters(), active.parameters()):
            t_param.data.copy_(self.tau * param.data +
                               (1 - self.tau) * t_param.data)

    def _hard_update(self, active, target):
        """
        Fully copies parameters from the active network to the target network.
        To be used in conjunction with a parameter "C" that determines how many
        timesteps pass between these hard updates.
        """

        target.load_state_dict(active.state_dict())
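# A minimal, standalone sketch of the categorical (C51 / D4PG-style) projection
# performed by the method above, using hypothetical vmin/vmax/num_atoms values.
# It is illustrative only and is not the class's exact implementation.
import torch

def project_distribution(rewards, probs, atoms, gamma, rollout,
                         vmin=-10.0, vmax=10.0):
    """Project r + gamma**rollout * atoms back onto the fixed support."""
    num_atoms = atoms.shape[0]
    delta_z = (vmax - vmin) / (num_atoms - 1)

    projected_atoms = (rewards.unsqueeze(-1) +
                       gamma ** rollout * atoms.unsqueeze(0)).clamp(vmin, vmax)
    b = (projected_atoms - vmin) / delta_z
    # Round to one decimal place so a value such as 99.000000001 does not make
    # ceil() jump to the next bin (the same trick used above).
    b = torch.round(b * 10) / 10
    lower, upper = b.floor(), b.ceil()

    m_lower = (upper + (lower == upper).float() - b) * probs
    m_upper = (b - lower) * probs

    projected = torch.zeros_like(probs)
    for i in range(probs.shape[0]):
        projected[i].index_add_(0, lower[i].long(), m_lower[i])
        projected[i].index_add_(0, upper[i].long(), m_upper[i])
    return projected

# Usage sketch: 3 samples, 51 atoms; each projected row still sums to 1.
atoms = torch.linspace(-10.0, 10.0, 51)
probs = torch.softmax(torch.randn(3, 51), dim=1)
rewards = torch.tensor([1.0, -0.5, 0.0])
out = project_distribution(rewards, probs, atoms, gamma=0.99, rollout=5)
assert torch.allclose(out.sum(dim=1), torch.ones(3), atol=1e-5)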
Ejemplo n.º 11
0
class ACDQNGM(DQNG):
    def __init__(self, args, env, env_test, logger):
        super(ACDQNGM, self).__init__(args, env, env_test, logger)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't', 'g', 'm', 'task', 'mcr']
        metrics = ['loss_dqn', 'qval', 'val']
        self.buffer = ReplayBuffer(limit=int(1e6),
                                   names=names.copy(),
                                   args=args)
        self.actorCritic = ActorCriticDQNGM(args, env)
        for metric in metrics:
            self.metrics[metric] = 0
        self.goalcounts = np.zeros((len(self.env.goals), ))

    def train(self):

        if self.buffer.nb_entries > 100 * self.batch_size:

            samples = self.buffer.sample(self.batch_size)
            samples = self.env.augment_samples(samples)
            targets = self.actorCritic.get_targets_dqn(samples['r'],
                                                       samples['t'],
                                                       samples['s1'],
                                                       samples['g'],
                                                       samples['m'])
            inputs = [
                samples['s0'], samples['a'], samples['g'], samples['m'],
                targets
            ]
            metricsCritic = self.actorCritic.trainCritic(inputs)
            self.metrics['loss_dqn'] += np.squeeze(metricsCritic[0])
            self.metrics['qval'] += np.mean(metricsCritic[1])
            self.goalcounts += np.bincount(samples['task'],
                                           minlength=len(self.env.goals))
            metricsActor = self.actorCritic.trainActor(
                [samples['s0'], samples['g'], samples['m']])
            if self.env_step % 1000 == 0:
                print(metricsActor[0], metricsActor[1])
            self.metrics['val'] += np.mean(metricsActor[2])
            self.actorCritic.target_train()

    def get_stats(self):
        sumsamples = np.sum(self.goalcounts)
        if sumsamples != 0:
            for i, goal in enumerate(self.env.goals):
                self.stats['samplecount_{}'.format(goal)] = float(
                    "{0:.3f}".format(self.goalcounts[i] / sumsamples))

    def make_input(self, state, mode):
        if mode == 'train':
            input = [
                np.expand_dims(i, axis=0)
                for i in [state, self.env.goal, self.env.mask]
            ]
        else:
            input = [
                np.expand_dims(i, axis=0)
                for i in [state, self.env_test.goal, self.env_test.mask]
            ]
        return input

    def act(self, exp, mode='train'):
        input = self.make_input(exp['s0'], mode)
        actionProbs = self.actorCritic.probs(input)[0].squeeze()
        # if self.env_step % 1000 == 0: print(actionProbs)
        if mode == 'train':
            action = np.random.choice(range(self.env.action_dim),
                                      p=actionProbs)
        else:
            # actionProbs is already squeezed to a 1-D vector, so take the
            # argmax over the whole vector
            action = np.argmax(actionProbs)
        prob = actionProbs[action]
        action = np.expand_dims(action, axis=1)
        exp['a'] = action
        # exp['p_a'] = prob
        return exp

    def reset(self):

        if self.trajectory:
            augmented_episode = self.env.end_episode(self.trajectory)
            for expe in augmented_episode:
                self.buffer.append(expe)
            # for expe in self.trajectory:
            #     self.buffer.append(expe.copy())
            # augmented_ep = self.env.augment_episode(self.trajectory)
            # for e in augmented_ep:
            #     self.buffer.append(e)
            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def get_demo(self, rndprop):
        demo = []
        exp = {}
        exp['s0'] = self.env_test.env.reset()
        # obj = np.random.choice(self.env_test.env.objects)
        # goal = np.random.randint(obj.high[2]+1)
        obj = self.env_test.env.light
        goal = 1
        while True:
            if np.random.rand() < rndprop:
                a = np.random.randint(self.env_test.action_dim)
                done = False
            else:
                a, done = self.env_test.env.opt_action(obj, goal)
            if not done:
                exp['a'] = np.expand_dims(a, axis=1)
                exp['s1'] = self.env_test.env.step(exp['a'])[0]
                demo.append(exp.copy())
                exp['s0'] = exp['s1']
            else:
                break
        return demo

    def demo(self):
        if self.env_step % self.demo_freq == 0:
            for i in range(5):
                demo = self.get_demo(rndprop=0.)
                augmented_demo = self.env.augment_demo(demo)
                for exp in augmented_demo:
                    self.buffer.append(exp)
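# A short numpy sketch of the action-selection rule used in act() above: sample
# from the actor's probability vector during training, take the argmax greedily
# at evaluation time. The probability vector here is made up.
import numpy as np

def select_action(action_probs, mode='train'):
    if mode == 'train':
        return int(np.random.choice(len(action_probs), p=action_probs))
    return int(np.argmax(action_probs))

probs = np.array([0.1, 0.6, 0.3])
print(select_action(probs, mode='train'))  # stochastic, follows probs
print(select_action(probs, mode='test'))   # always 1, the highest-probability action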
Ejemplo n.º 12
0
class DQN_Agent:
    """
    PyTorch Implementation of DQN/DDQN.
    """
    def __init__(self, state_size, action_size, args):
        """
        Initialize a DQN/DDQN Agent.
        """

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.state_size = state_size
        self.framework = args.framework
        self.eval = args.eval
        self.agent_count = 1
        self.learn_rate = args.learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.C = args.C
        self._epsilon = args.epsilon
        self.epsilon_decay = args.epsilon_decay
        self.epsilon_min = args.epsilon_min

        self.gamma = 0.99
        self.rollout = args.rollout
        self.tau = args.tau
        self.momentum = 1
        self.l2_decay = 0.0001
        self.update_type = "hard"

        self.t_step = 0
        self.episode = 0
        self.seed = 0

        # Set up memory buffers
        if args.prioritized_experience_replay:
            self.memory = PERBuffer(self.buffer_size, self.batch_size,
                                    self.framestack, self.device, args.alpha,
                                    args.beta)
            self.criterion = WeightedLoss()
        else:
            self.memory = ReplayBuffer(self.device, self.buffer_size,
                                       self.gamma, self.rollout)

        #                    Initialize Q networks                         #
        self.q = self._make_model(state_size, action_size, args.pixels)
        self.q_target = self._make_model(state_size, action_size, args.pixels)
        self._hard_update(self.q, self.q_target)
        self.q_optimizer = self._set_optimizer(self.q.parameters(),
                                               lr=self.learn_rate,
                                               decay=self.l2_decay,
                                               momentum=self.momentum)

        self.new_episode()

    @property
    def epsilon(self):
        """
        Ensures the annealing step runs every time epsilon is accessed. Anneals
        the epsilon rate down to a specified minimum so there is always some
        noisiness in the policy actions. Returned as a property.
        """

        self._epsilon = max(self.epsilon_min, self.epsilon_decay**self.t_step)
        return self._epsilon

    def act(self, state, eval=False, pretrain=False):
        """
        Select an action using epsilon-greedy π.
        Always use greedy if not training.
        """

        # Random actions while pretraining; otherwise act greedily whenever we
        # are evaluating, or when the epsilon-greedy test passes.
        if not pretrain and (eval or np.random.random() > self.epsilon):
            state = state.to(self.device)
            with torch.no_grad():
                action_values = self.q(state).detach().cpu()
            action = action_values.argmax(dim=1).unsqueeze(0).numpy()
        else:
            action = np.random.randint(self.action_size, size=(1, 1))
        return action.astype(np.int64)

    def step(self, state, action, reward, next_state, pretrain=False):
        """
        Add the current SARS' tuple into the short term memory, then learn
        """

        # Current SARS' stored in short term memory, then stacked for NStep
        experience = (state, action, reward, next_state)
        if self.rollout == 1:
            self.memory.store_trajectory(state, torch.from_numpy(action),
                                         torch.tensor([reward]), next_state)
        else:
            self.memory.store_experience(experience)
        self.t_step += 1

        # Learn after done pretraining
        if not pretrain:
            self.learn()

    def learn(self):
        """
        Trains the Deep QNetwork and returns action values.
        Can use multiple frameworks.
        """

        # Sample from replay buffer, REWARDS are sum of (ROLLOUT - 1) timesteps
        # Already calculated before storing in the replay buffer.
        # NEXT_STATES are ROLLOUT steps ahead of STATES
        batch, is_weights, tree_idx = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, terminal_mask = batch

        q_values = torch.zeros(self.batch_size).to(self.device)
        if self.framework == 'DQN':
            # Max predicted Q values for the next states from the target model
            q_values[terminal_mask] = self.q_target(next_states).detach().max(
                dim=1)[0]

        if self.framework == 'DDQN':
            # Get the maximizing ACTION under the active network Q, then
            # evaluate its action value under q_target

            # Max valued action under active network
            max_actions = self.q(next_states).detach().argmax(1).unsqueeze(1)
            # Use the active network action to get the value of the stable
            # target network
            q_values[terminal_mask] = self.q_target(
                next_states).detach().gather(1, max_actions).squeeze(1)

        targets = rewards + (self.gamma**self.rollout * q_values)

        targets = targets.unsqueeze(1)
        values = self.q(states).gather(1, actions)

        #Huber Loss provides better results than MSE
        if is_weights is None:
            loss = F.smooth_l1_loss(values, targets)

        #Compute Huber Loss manually to utilize is_weights with Prioritization
        else:
            loss, td_errors = self.criterion.huber(values, targets, is_weights)
            self.memory.batch_update(tree_idx, td_errors)

        # Perform gradient descent
        self.q_optimizer.zero_grad()
        loss.backward()
        self.q_optimizer.step()

        self._update_networks()
        self.loss = loss.item()

    def initialize_memory(self, pretrain_length, env):
        """
        Fills up the ReplayBuffer memory with PRETRAIN_LENGTH number of experiences
        before training begins.
        """

        if len(self.memory) >= pretrain_length:
            print("Memory already filled, length: {}".format(len(self.memory)))
            return

        print("Initializing memory buffer.")

        while True:
            done = False
            env.reset()
            state = env.state
            while not done:
                action = self.act(state, pretrain=True)
                next_state, reward, done = env.step(action)
                if done:
                    next_state = None

                self.step(state, action, reward, next_state, pretrain=True)
                state = next_state

                if self.t_step % 50 == 0 or len(
                        self.memory) >= pretrain_length:
                    print("Taking pretrain step {}... memory filled: {}/{}".
                          format(self.t_step, len(self.memory),
                                 pretrain_length))
                if len(self.memory) >= pretrain_length:
                    print("Done!")
                    self.t_step = 0
                    self._epsilon = 1
                    return

    def new_episode(self):
        """
        Handle any cleanup or steps to begin a new episode of training.
        """

        self.memory.init_n_step()
        self.episode += 1

    def _update_networks(self):
        """
        Updates the network using either DDPG-style soft updates (w/ param \
        TAU), or using a DQN/D4PG style hard update every C timesteps.
        """

        if self.update_type == "soft":
            self._soft_update(self.q, self.q_target)
        elif self.t_step % self.C == 0:
            self._hard_update(self.q, self.q_target)

    def _soft_update(self, active, target):
        """
        Slowly updates the target network using every-step partial copies of
        the active network, modulated by parameter TAU.
        """

        for t_param, param in zip(target.parameters(), active.parameters()):
            t_param.data.copy_(self.tau * param.data +
                               (1 - self.tau) * t_param.data)

    def _hard_update(self, active, target):
        """
        Fully copies parameters from the active network to the target network.
        To be used in conjunction with a parameter "C" that determines how many
        timesteps pass between these hard updates.
        """

        target.load_state_dict(active.state_dict())

    def _set_optimizer(self, params, lr, decay, momentum, optimizer="Adam"):
        """
        Sets the optimizer based on command line choice. Defaults to Adam.
        """

        if optimizer == "RMSprop":
            return optim.RMSprop(params, lr=lr, momentum=momentum)
        elif optimizer == "SGD":
            return optim.SGD(params, lr=lr, momentum=momentum)
        else:
            return optim.Adam(params, lr=lr, weight_decay=decay)

    def _make_model(self, state_size, action_size, use_cnn):
        """
        Sets up the network model based on whether state data or pixel data is
        provided.
        """

        if use_cnn:
            return QCNNetwork(state_size, action_size,
                              self.seed).to(self.device)
        else:
            return QNetwork(state_size, action_size, self.seed).to(self.device)
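# A compact sketch of how the DQN and DDQN targets in learn() above differ,
# on random tensors. Network shapes and the gamma/rollout values are made up;
# the point is the decoupling of action selection from action evaluation.
import torch

batch_size, action_size, gamma, rollout = 4, 3, 0.99, 5
q = torch.nn.Linear(8, action_size)         # stands in for the online network
q_target = torch.nn.Linear(8, action_size)  # stands in for the target network

next_states = torch.randn(batch_size, 8)
rewards = torch.randn(batch_size)           # already summed over rollout-1 steps

with torch.no_grad():
    # DQN: the target network both selects and evaluates the next action.
    dqn_next = q_target(next_states).max(dim=1)[0]
    # DDQN: the online network selects, the target network evaluates.
    best_actions = q(next_states).argmax(dim=1, keepdim=True)
    ddqn_next = q_target(next_states).gather(1, best_actions).squeeze(1)

dqn_targets = rewards + gamma ** rollout * dqn_next
ddqn_targets = rewards + gamma ** rollout * ddqn_next
print(dqn_targets.shape, ddqn_targets.shape)  # both torch.Size([4])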
Ejemplo n.º 13
0
class DDPG:
    def __init__(
        self,
        env,
        gamma=0.99,
        tau=1e-3,
        pol_lr=1e-4,
        q_lr=1e-3,
        batch_size=64,
        buffer_size=10000,
    ):

        # environment stuff
        self.env = env
        self.num_act = env.action_space.shape[0]
        self.num_obs = env.observation_space.shape[0]
        self.eval_env = copy.deepcopy(env)

        # hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.pol_lr = pol_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        # networks
        self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
        self.q = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.pol.init_weights()
        self.q.init_weights()
        self.target_pol = copy.deepcopy(self.pol).double()
        self.target_q = copy.deepcopy(self.q).double()

        # optimizers, buffer
        self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
        self.q_opt = torch.optim.Adam(
            self.q.parameters(),
            lr=self.q_lr,
        )
        # weight_decay=1e-2)
        self.buffer = ReplayBuffer(self.buffer_size, 1000)
        self.mse_loss = torch.nn.MSELoss()

        self.cum_loss = 0
        self.cum_obj = 0

    # fill up buffer first
    def prep_buffer(self):
        obs = self.env.reset()
        while not self.buffer.ready:
            pre_obs = obs
            action = self.env.action_space.sample()
            obs, reward, done, _ = self.env.step(action)
            self.buffer.insert((pre_obs, action, reward, obs, done))
            if done: obs = self.env.reset()

    # update neural net
    def update_networks(self):
        # (pre_obs, action, reward, obs, done)
        pre_obs = torch.tensor(self.batch[0], dtype=torch.double)
        actions = torch.tensor(self.batch[1], dtype=torch.double)
        rewards = torch.tensor(self.batch[2], dtype=torch.double)
        obs = torch.tensor(self.batch[3], dtype=torch.double)
        done = torch.tensor(self.batch[4], dtype=torch.double).unsqueeze(1)

        self.q_opt.zero_grad()
        # Compute the TD target without tracking gradients through the target nets
        with torch.no_grad():
            y = rewards + (self.gamma * (1.0 - done) *
                           self.target_q(obs, self.target_pol(obs)))
        # loss = torch.sum((y - self.q(pre_obs, actions)) ** 2) / self.batch_size
        loss = self.mse_loss(self.q(pre_obs, actions), y)
        loss.backward()
        self.q_opt.step()
        self.cum_loss += loss.item()  # detach from the graph before accumulating

        self.pol_opt.zero_grad()
        objective = -self.q(pre_obs, self.pol(pre_obs)).mean()
        objective.backward()
        self.pol_opt.step()
        self.cum_obj += objective.item()  # detach from the graph before accumulating

    # update target networks with tau
    def update_target_networks(self):
        for target, actual in zip(self.target_q.named_parameters(),
                                  self.q.named_parameters()):
            target[1].data.copy_(self.tau * actual[1].data +
                                 (1 - self.tau) * target[1].data)
        for target, actual in zip(self.target_pol.named_parameters(),
                                  self.pol.named_parameters()):
            target[1].data.copy_(self.tau * actual[1].data +
                                 (1 - self.tau) * target[1].data)

    def policy_eval(self):
        state = self.eval_env.reset()
        done = False
        rewards = []
        while not done:
            inp = torch.tensor(state, dtype=torch.double)
            action = self.pol(inp)
            action = action.detach().numpy()
            next_state, r, done, _ = self.eval_env.step(action)
            rewards.append(r)
            # self.eval_env.render()
            # time.sleep(0.1)
            state = next_state

        total = sum(rewards)
        return total

    def train(self, num_iters=200000, eval_len=1000, render=False):
        print("Start")
        if render: self.env.render('human')
        self.prep_buffer()
        obs = self.env.reset()
        iter_info = []

        # train for num_iters
        for i in range(int(num_iters / eval_len)):
            for j in trange(eval_len):
                # one step and put into buffer
                pre_obs = obs
                inp = torch.tensor(obs, dtype=torch.double)
                action = self.pol(inp)
                action = action.detach().numpy(
                ) + np.random.multivariate_normal(mean=np.array([0.0, 0.0]),
                                                  cov=np.array([[0.1, 0.0],
                                                                [0.0, 0.1]]))
                obs, reward, done, _ = self.env.step(action)
                self.buffer.insert((pre_obs, action, reward, obs, done))
                if render:
                    self.env.render('human')
                    time.sleep(0.000001)
                if done: obs = self.env.reset()

                # sample from buffer, train one step, update target networks
                self.batch = self.buffer.sample(self.batch_size)
                self.update_networks()
                self.update_target_networks()

            iter_reward = self.policy_eval()
            avg_loss = self.cum_loss / ((i + 1) * eval_len)
            avg_obj = self.cum_obj / ((i + 1) * eval_len)
            print("Iteration {}/{}".format((i + 1) * eval_len, num_iters))
            print("Rewards: {} | Q Loss: {} | Policy Objective: {}".format(
                iter_reward, avg_loss, avg_obj))
            iter_info.append((iter_reward, avg_loss, avg_obj))

        return iter_info
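# A minimal, self-contained sketch of the two DDPG updates performed in
# update_networks() above, on tiny stand-in networks: the critic regresses on
# y = r + gamma * (1 - done) * Q'(s', pi'(s')), the actor maximises Q(s, pi(s)).
# Shapes and hyperparameters are made up for illustration.
import torch

obs_dim, act_dim, gamma = 3, 1, 0.99
pol = torch.nn.Linear(obs_dim, act_dim)
q = torch.nn.Linear(obs_dim + act_dim, 1)
target_pol = torch.nn.Linear(obs_dim, act_dim)
target_q = torch.nn.Linear(obs_dim + act_dim, 1)
q_opt = torch.optim.Adam(q.parameters(), lr=1e-3)
pol_opt = torch.optim.Adam(pol.parameters(), lr=1e-4)

s0 = torch.randn(8, obs_dim)
a = torch.randn(8, act_dim)
r = torch.randn(8, 1)
s1 = torch.randn(8, obs_dim)
done = torch.zeros(8, 1)

# Critic step: the target is computed without tracking gradients.
with torch.no_grad():
    y = r + gamma * (1.0 - done) * target_q(torch.cat([s1, target_pol(s1)], dim=1))
critic_loss = torch.nn.functional.mse_loss(q(torch.cat([s0, a], dim=1)), y)
q_opt.zero_grad()
critic_loss.backward()
q_opt.step()

# Actor step: ascend Q(s, pi(s)) by descending its negative.
actor_objective = -q(torch.cat([s0, pol(s0)], dim=1)).mean()
pol_opt.zero_grad()
actor_objective.backward()
pol_opt.step()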
Ejemplo n.º 14
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, framework, buffer_type):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.framework = framework
        self.buffer_type = buffer_type

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # def __init__(self, device, buffer_size, batch_size, alpha, beta):
        if self.buffer_type == 'PER_ReplayBuffer':
            self.memory = PER_ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE,
                                           ALPHA, BETA)
        if self.buffer_type == 'ReplayBuffer':
            self.memory = ReplayBuffer(device, action_size, BUFFER_SIZE,
                                       BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                if self.buffer_type == 'ReplayBuffer':
                    experiences = self.memory.sample()
                    is_weights = None
                    idxs = None
                if self.buffer_type == 'PER_ReplayBuffer':
                    experiences, is_weights, idxs = self.memory.sample()
                    self.criterion = WeightedLoss()
                    #print('debugging:', experiences )
                self.learn(experiences, is_weights, idxs, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # switch the local network back to training mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, idxs, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        if self.framework == 'DQN':
            # Max predicted Q values for the next states from the target model,
            # which is updated more slowly than the local network
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        if self.framework == "DDQN":
            # Select the maximizing action with the local network, then
            # evaluate that action with the slower-moving target network
            max_actions = self.qnetwork_local(next_states).detach().argmax(
                1).unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, max_actions)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        #------------------------------------------------------------------------------
        #Huber Loss provides better results than MSE
        if is_weights is None:
            loss = F.smooth_l1_loss(Q_expected, Q_targets)

        #Compute Huber Loss manually to utilize is_weights with Prioritization
        else:
            loss, td_errors = self.criterion.huber(Q_expected, Q_targets,
                                                   is_weights)
            self.memory.batch_update(idxs, td_errors)

        # Perform gradient descent
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Ejemplo n.º 15
0
    def learn(self, timesteps=10000, verbose=0, seed=None):
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.eps_range = self._eps_range(timesteps)
        replay_buffer = ReplayBuffer(self.buffer_size)

        self._init_model()

        obs = self.env.reset()
        for step in range(timesteps):
            # while not done:
            cur_eps = next(self.eps_range, None)
            if cur_eps is None:
                cur_eps = self.final_eps

            action = self._select_action(obs, cur_eps)

            new_obs, rewards, done, info = self.env.step(action)
            if done:
                new_obs = [
                    np.nan
                ] * self.obs_shape[0]  # hacky way to keep dimensions correct
            replay_buffer.add(obs, action, rewards, new_obs)

            obs = new_obs

            # learn gradient
            if step > self.learning_starts:
                if len(replay_buffer.buffer
                       ) < self.batch_size:  # buffer too small
                    continue
                samples = replay_buffer.sample(self.batch_size, self.device)
                obs_batch, actions_batch, rewards_batch, new_obs_batch = samples

                predicted_q_values = self._predictQValue(
                    self.step_model, obs_batch, actions_batch)
                ys = self._expectedLabels(self.target_model, new_obs_batch,
                                          rewards_batch)

                loss = F.smooth_l1_loss(predicted_q_values, ys)

                self.optim.zero_grad()
                loss.backward()
                for i in self.step_model.parameters():
                    i.grad.clamp_(min=-1, max=1)  # exploding gradient
                    # i.grad.clamp_(min=-10, max=10) # exploding gradient
                self.optim.step()

                # update target
                if step % self.target_network_update_freq == 0:
                    self.target_model.load_state_dict(
                        self.step_model.state_dict())

            if done:
                obs = self.env.reset()
            if verbose == 1:
                if step % (timesteps * 0.1) == 0:
                    perc = int(100 * step / timesteps)
                    print(f"At step {step}")
                    print(f"{perc}% done")
Ejemplo n.º 16
0
class DDPG(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DDPG, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)

    def init(self, args, env):
        names = ['s0', 'a', 's1', 'r', 't']
        metrics = ['loss_dqn', 'loss_actor']
        self.buffer = ReplayBuffer(limit=int(1e6),
                                   names=names.copy(),
                                   args=args)
        self.actorCritic = ActorCriticDDPG(args, env)
        for metric in metrics:
            self.metrics[metric] = 0

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            targets_dqn = self.actorCritic.get_targets_dqn(
                exp['r'], exp['t'], exp['s1'])
            inputs = [exp['s0'], exp['a'], targets_dqn]
            loss_dqn = self.actorCritic.trainQval(inputs)
            action, criticActionGrads, invertedCriticActionGrads = self.actorCritic.trainActor(
                [exp['s0']])
            self.metrics['loss_dqn'] += np.squeeze(loss_dqn)

            # a2 = self.actor.model.predict_on_batch(s0)
            # grads = self.critic.gradsModel.predict_on_batch([s0, a2])
            # low = self.env.action_space.low
            # high = self.env.action_space.high
            # for d in range(grads[0].shape[0]):
            #     width = high[d] - low[d]
            #     for k in range(self.batch_size):
            #         if grads[k][d] >= 0:
            #             grads[k][d] *= (high[d] - a2[k][d]) / width
            #         else:
            #             grads[k][d] *= (a2[k][d] - low[d]) / width
            # self.actor.train(s0, grads)

            self.actorCritic.target_train()

    def make_input(self, state, mode):
        input = [np.expand_dims(state, axis=0)]
        return input

    def reset(self):

        if self.trajectory:
            self.env.end_episode(self.trajectory)
            for expe in self.trajectory:
                self.buffer.append(expe.copy())

            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def act(self, state, mode='train'):
        input = self.make_input(state, mode)
        action = self.actorCritic.action(input)[0]
        if mode == 'train':
            noise = np.random.normal(0., 0.1, size=action[0].shape)
            action = noise + action
        action = np.clip(action, self.env.action_space.low,
                         self.env.action_space.high)
        action = action.squeeze()
        return action

    def save_model(self):
        self.actorCritic.actionModel.save(os.path.join(self.logger.get_dir(),
                                                       'actor_model'),
                                          overwrite=True)
        self.actorCritic.qvalModel.save(os.path.join(self.logger.get_dir(),
                                                     'qval_model'),
                                        overwrite=True)
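# The commented-out block in train() above sketches the "inverting gradients"
# trick for bounded action spaces: scale each positive gradient by the
# head-room left before the upper bound, and each negative gradient by the
# distance to the lower bound. A vectorised numpy version of that same idea
# (the action bounds here are made up):
import numpy as np

def invert_gradients(grads, actions, low, high):
    width = high - low
    scale_up = (high - actions) / width    # applied where the gradient pushes up
    scale_down = (actions - low) / width   # applied where the gradient pushes down
    return np.where(grads >= 0, grads * scale_up, grads * scale_down)

low, high = np.array([-1.0, -1.0]), np.array([1.0, 1.0])
actions = np.array([[0.9, -0.5], [0.0, 0.99]])
grads = np.array([[0.2, -0.3], [0.5, 0.4]])
print(invert_gradients(grads, actions, low, high))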
Ejemplo n.º 17
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
#         print("Rewards")
#         print(rewards.shape)
        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"

#         self.qnetwork_local.train()
        local_output = self.qnetwork_local(states).gather(1, actions)

        # Max predicted Q values for the next states from the target network
        max_next_q = torch.max(self.qnetwork_target(next_states).detach(),
                               1)[0].unsqueeze(1)
        # Zero out the bootstrap term for terminal transitions
        target_output = rewards + gamma * max_next_q * (1 - dones)

        loss = F.mse_loss(local_output, target_output)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
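# A small sketch of the soft_update() rule documented above,
# theta_target = tau * theta_local + (1 - tau) * theta_target, applied to two
# tiny stand-in layers; repeated calls move the target slowly toward the local
# network. The layer sizes and tau value are made up.
import torch

def soft_update(local_model, target_model, tau):
    for t_param, l_param in zip(target_model.parameters(), local_model.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

local = torch.nn.Linear(4, 2)
target = torch.nn.Linear(4, 2)
for _ in range(1000):
    soft_update(local, target, tau=1e-2)
# After many updates the target weights are close to the local weights.
print(torch.allclose(local.weight, target.weight, atol=1e-3))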
Ejemplo n.º 18
0
class Agent():
    def __init__(self, s_dim, num_actions, lr):
        self.step = 0
        self.epStep = 0
        self.ep = 0
        self.tutorListened = True
        self.tutorInput = ''
        self.sDim = s_dim
        self.num_actions = num_actions
        self.learning_rate = lr
        self.names = ['state0', 'action', 'feedback', 'fWeight']
        self.buffer = ReplayBuffer(limit=int(1e6), names=self.names)
        self.batchSize = 64
        self.episode = deque(maxlen=400)
        self.model = self.create_model()

    def create_model(self):
        state = Input(shape=self.sDim)
        action = Input(shape=(1,), dtype='uint8')
        l1 = Dense(400, activation="relu")(state)
        feedback = Dense(self.num_actions, activation=None, kernel_initializer='random_uniform')(l1)
        feedback = Reshape((1, self.num_actions))(feedback)
        mask = Lambda(K.one_hot, arguments={'num_classes': self.num_actions},
                      output_shape=(self.num_actions,))(action)
        feedback = multiply([feedback, mask])
        feedback = Lambda(K.sum, arguments={'axis': 2})(feedback)
        feedbackModel = Model(inputs=[state, action], outputs=feedback)
        feedbackModel.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return feedbackModel

    def train(self):
        loss = 0
        if self.buffer.nb_entries > self.batchSize:
            samples = self.buffer.sample(self.batchSize)
            s, a, targets, weights = [np.array(samples[name]) for name in self.names]
            loss = self.model.train_on_batch(x=[s,a], y=targets, sample_weight=weights)
        return loss

    def tutorListener(self):
        self.tutorInput = input("> ")
        print("maybe updating...the kbdInput variable is: {}".format(self.tutorInput))
        self.tutorListened = True

    def run(self):
        state0 = np.random.randint(0, 4, size=(5,))
        while self.step < 100000:

            if self.tutorInput != '':
                print("Received new keyboard Input. Setting playing ID to keyboard input value")
                for i in range(1,10):
                    self.episode[-i]['fWeight'] = 1
                    self.episode[-i]['feedback'] = self.tutorInput
                self.tutorInput = ''
            else:
                action = np.random.randint(self.num_actions)
                state1 = np.random.randint(0, 4, size=(5,))
                self.step += 1
                self.epStep += 1
                experience = {'state0': state0, 'action': action, 'fWeight': 0}
                self.episode.append(experience)
                self.loss = self.train()
                state0 = state1
                time.sleep(0.001)

            if self.tutorListened:
                self.tutorListened = False
                self.listener = Thread(target=self.tutorListener)
                self.listener.start()

            if self.epStep >= 200:
                if self.ep > 0:
                    for s in range(self.epStep):
                        exp = self.episode.popleft()
                        if exp['fWeight'] != 0:
                            self.buffer.append(exp)
                self.epStep = 0
                self.ep += 1
                state0 = np.random.randint(0, 4, size=(5,))
            if self.step % 1000 == 0:
                print(self.step, self.loss)

    def input(self):
        while True:
            key = input()
            if key == '+':
                inputStep = self.step
                time.sleep(2)
                print('input +1, step = ', inputStep)
            elif key == '-':
                inputStep = self.step
                time.sleep(2)
                print('input -1, step = ', inputStep)
            else:
                print('wrong input')
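# A numpy sketch of the masking trick in create_model() above: the network
# outputs one value per action, and a one-hot mask of the taken action keeps
# only that entry before summing. The array contents are made up.
import numpy as np

num_actions = 5
per_action_output = np.array([[0.2, -1.3, 0.7, 0.0, 2.1]])  # shape (batch, num_actions)
action = np.array([2])

mask = np.eye(num_actions)[action]                 # one-hot, shape (batch, num_actions)
selected = np.sum(per_action_output * mask, axis=1)
print(selected)  # [0.7] -- the value for the chosen action only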
Ejemplo n.º 19
0
class DQN(Agent):
    def __init__(self, args, env, env_test, logger):
        super(DQN, self).__init__(args, env, env_test, logger)
        self.args = args
        self.init(args, env)

    def init(self, args, env):
        names = ['state0', 'action', 'state1', 'reward', 'terminal']
        self.buffer = ReplayBuffer(limit=int(1e6), names=names.copy())
        self.critic = CriticDQN(args, env)
        for metric_name in ['loss_dqn', 'qval', 'val']:
            self.metrics[metric_name] = 0

    def train(self):

        if self.buffer.nb_entries > self.batch_size:
            exp = self.buffer.sample(self.batch_size)
            s0, a0, s1, r, t = [exp[name] for name in self.buffer.names]
            targets_dqn = self.critic.get_targets_dqn(r, t, s1)
            inputs = [s0, a0]
            loss = self.critic.criticModel.train_on_batch(inputs, targets_dqn)
            for i, metric in enumerate(self.critic.criticModel.metrics_names):
                self.metrics[metric] += loss[i]

            self.critic.target_train()

    def reset(self):

        if self.trajectory:
            R = np.sum([
                self.env.unshape(exp['reward'], exp['terminal'])
                for exp in self.trajectory
            ])
            self.env.processEp(R)
            for expe in reversed(self.trajectory):
                self.buffer.append(expe.copy())

            if self.args['--imit'] != '0':
                Es = [0]
                for i, expe in enumerate(reversed(self.trajectory)):
                    if self.trajectory[-1]['terminal']:
                        Es[0] = Es[0] * self.critic.gamma + expe['reward']
                        expe['expVal'] = Es[0]
                    else:
                        expe['expVal'] = -self.ep_steps
                    self.bufferImit.append(expe.copy())

            self.trajectory.clear()

        state = self.env.reset()
        self.episode_step = 0

        return state

    def make_input(self, state, mode):
        input = [np.reshape(state, (1, self.critic.s_dim[0]))]
        input.append(np.expand_dims([0.5], axis=0))
        return input

    def act(self, state, mode='train'):
        input = self.make_input(state, mode)
        actionProbs = self.critic.actionProbs(input)
        if mode == 'train':
            action = np.random.choice(range(self.env.action_dim),
                                      p=actionProbs[0].squeeze())
        else:
            action = np.argmax(actionProbs[0])
        return np.expand_dims(action, axis=1)