Example #1
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON_MAX

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Make sure the target networks start with the same weights as the local (source) networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0
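
Every agent on this page constructs an OUNoise helper that is not shown in the snippets themselves, and the constructor signatures vary between examples (some take a seed and expose sample()/reset(), others take only a size and expose noise()). A minimal Ornstein-Uhlenbeck process consistent with the call in this first example and with the sample()/reset() usage in Example #9 might look like the sketch below; it is illustrative only, not the original helper.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (illustrative sketch, not the original helper)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-running mean of the process
        self.theta = theta             # speed of mean reversion
        self.sigma = sigma             # volatility of the noise
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state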
Example #2
    def __init__(self, env, batchSize = 10, bufferSize = 100,
                 gamma = 0.98, actorLR = 1e-4, criticLR = 1e-3,
                 maxSteps = 200, targetUpdate = 1e-3, epsilon = 1,
                 decay = 0.99, rewardScale = 1e-3, logFile = 'run.log'):
        self.env = env
        self.gamma = gamma
        self.batchSize = batchSize
        self.bufferSize = bufferSize
        self.maxSteps = maxSteps + 1
        self.rewardScale = rewardScale
        self.epsilon = epsilon
        self.decay = decay

        # Useful helpers.
        self.actionDim = self.env.action_space.shape[0]
        self.stateDim = self.env.observation_space.shape[0]
        self.featureDim = self.actionDim + self.stateDim
        self.minAction = self.env.action_space.low
        self.maxAction = self.env.action_space.high

        # For scaling output action values.
        self.actionBiasZeroOne = self.minAction
        self.actionScaleZeroOne = self.maxAction - self.minAction
        self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
        self.actionScaleTanH = self.maxAction - self.actionBiasTanH 

        # Initialize noise process.
        self.noise = OUNoise(self.actionDim)

        # Initialize replay buffer.
        self.buffer = ReplayBuffer(self.bufferSize)

        # Initialize logging.
        logging.basicConfig(filename = logFile,
                            level = logging.INFO,
                            format = '[%(asctime)s] %(message)s',
                            datefmt = '%m/%d/%Y %I:%M:%S %p')
        logging.info('Initializing DRPG agent with passed settings.')

        # Tensorflow GPU optimization.
        config = tf.ConfigProto() # GPU fix?
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config = config)
        from keras import backend as K
        K.set_session(self.sess)

        # Make actor network (creates target model internally).
        self.actor = Actor(self.sess, self.maxSteps, self.featureDim,
                           self.actionDim, self.batchSize, targetUpdate,
                           actorLR, self.actionScaleTanH, self.actionBiasTanH)

        # Make critic network (creates target model internally).
        self.critic = Critic(self.sess, self.maxSteps, self.featureDim,
                             self.actionDim, self.batchSize, targetUpdate,
                             criticLR)  # use the critic's own learning rate (criticLR was otherwise unused)
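
Note that tf.ConfigProto and tf.Session are TensorFlow 1.x APIs. Running an example like this under TensorFlow 2.x requires routing the same GPU-memory-growth setup through the tf.compat.v1 shim; a sketch, assuming the rest of the code still expects graph-mode Keras:

import tensorflow as tf

# TF 2.x equivalent of the session block above (sketch).
tf.compat.v1.disable_eager_execution()        # this agent relies on graph-mode execution
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True        # claim GPU memory on demand instead of all at once
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)  # hand the session to Keras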
Example #3
    def __init__(self, policy_params):
        """ arch_parameters is a dictionary like:
        {'state_and_action_dims' : (num1, num2), layers : {'Linear_1' : layer_size_1,..,'Linear_n' : layer_size_n} }
        """
        super(Policy, self).__init__()
        self.policy_params = policy_params
        self.seed_as_int = policy_params['seed']
        torch.manual_seed(self.seed_as_int)
        self.arch_params = policy_params['arch_params']
        self.__state_dim = self.arch_params['state_and_action_dims'][0]
        self.__action_dim = self.arch_params['state_and_action_dims'][1]
        self.eps = policy_params['eps']
        self.min_eps = policy_params['min_eps']
        self.eps_decay = policy_params['eps_decay']
        self.__noise_type = policy_params['noise_type']

        keys = list(self.arch_params['layers'].keys())
        list_of_layers = []

        prev_layer_size = self.__state_dim
        for i in range(len(self.arch_params['layers'])):
            key = keys[i]
            layer_type = key.split('_')[0]
            if layer_type == 'Linear':
                layer_size = self.arch_params['layers'][key]
                list_of_layers.append(nn.Linear(prev_layer_size, layer_size))
                prev_layer_size = layer_size
            elif layer_type == 'LayerNorm':
                list_of_layers.append(nn.LayerNorm(prev_layer_size))
            elif layer_type == 'ReLU':
                list_of_layers.append(nn.ReLU())
            elif layer_type == 'Tanh':
                list_of_layers.append(nn.Tanh())
            else:
                print("Error: got unspecified layer type: '{}'. Check your layers!".format(layer_type))
                break

        self.layers = nn.ModuleList(list_of_layers)

        #noise
        if self.__noise_type == 'action':
            self.__rand_process = OUNoise((self.__action_dim,))

        elif self.__noise_type == 'parameter':
            self.network_params_perturbations = dict()
            for i in range(len(self.layers)):
                if 'Linear' in str(type(self.layers[i])):
                    self.network_params_perturbations[i] = OUNoise(tuple(self.layers[i].weight.shape))
        else:
            raise ValueError("Got an unsupported noise type. The only available options are 'parameter' and 'action'.")
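
For reference, a policy_params dictionary with the keys this constructor reads ('seed', 'arch_params', 'eps', 'min_eps', 'eps_decay', 'noise_type') could be assembled as in the sketch below; all sizes and values are placeholders rather than the original repository's settings, and it assumes the Policy class from this example is importable.

policy_params = {
    'seed': 0,
    'eps': 1.0,                 # initial noise scale
    'min_eps': 0.01,            # floor for the decayed noise scale
    'eps_decay': 0.995,         # multiplicative decay per episode
    'noise_type': 'parameter',  # or 'action'
    'arch_params': {
        'state_and_action_dims': (33, 4),  # placeholder state/action dimensions
        'layers': {
            'Linear_1': 128, 'ReLU_1': None,
            'Linear_2': 64,  'ReLU_2': None,
            'Linear_3': 4,   'Tanh_1': None,  # final size must equal the action dimension
        },
    },
}

policy = Policy(policy_params)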
Example #4
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize an Agent

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): simultaneous running agents
            random_seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0
Example #5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Example #6
    def __init__(self,
                 n_input=70 + 12,
                 n_hidden=256,
                 n_output=1,
                 activation=nn.LeakyReLU):  # input = state+action
        super(DiscriminatorModel, self).__init__()
        self.model = nn.Sequential(nn.Linear(n_input, n_hidden), activation(),
                                   nn.Linear(n_hidden, n_output),
                                   nn.Dropout(p=0.6), nn.Sigmoid())

        self.Noise = OUNoise(n_input)
        self.model.apply(init_weight)
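
A short smoke-test sketch for the discriminator above; it assumes init_weight and OUNoise from the same repository are importable and uses the 70 + 12 state/action split from the default arguments:

import torch

disc = DiscriminatorModel()            # n_input defaults to 70 + 12
state_action = torch.randn(32, 82)     # batch of 32 concatenated state-action vectors
scores = disc.model(state_action)      # Sigmoid head: values in (0, 1)
print(scores.shape)                    # torch.Size([32, 1])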
Example #7
    def __init__(self, env, seed):

        self.env = env
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)
        self.env.seed(seed)
        self.actor = self.createModel()
        self.target_actor = self.createModel()
        self.noise = OUNoise(
            self.env.action_space.shape[0], seed, theta=0.2, sigma=0.5
        )  # noise is actually OpenAI baselines OU Noise wrapped in another OUNoise function
        self.critic = self.createModel((self.env.observation_space.shape[0],
                                        self.env.action_space.shape[0]))
        self.target_critic = self.createModel(
            (self.env.observation_space.shape[0],
             self.env.action_space.shape[0]))
        self.target_critic.set_weights(self.critic.get_weights(
        ))  #ensure inital weights are equal for networks
        self.target_actor.set_weights(self.actor.get_weights())
        self.reset()
Example #8
    def run_n_episodes(self,
                       num_episodes,
                       max_ep_length,
                       minibatch_size,
                       explore=True,
                       num_updates=5,
                       summary_checkpoint=1,
                       eta=0.01,
                       num_updates_ac=1,
                       T=1):  #num_updates from article
        for i in range(num_episodes):
            noise = OUNoise(self.a_dim)
            x = self.env.reset()
            x = x.reshape(1, -1)
            u = np.zeros(self.s_dim)
            t = False
            episodes_reward = 0

            #for REINFORCE
            self.r_rs.append([])
            self.r_xs.append([])
            self.r_us.append([])
            self.episodes_ls.append(0)
            while True:
                self.episodes_ls[-1] = self.episodes_ls[-1] + 1
                if self.det:
                    u, V = self.sess.run((self.model.mu_det, self.model.V),
                                         feed_dict={self.model.inputs_x: x})
                    self.episodes_Vs.append(V)
                else:
                    u, P, sigma, V = self.sess.run(
                        (self.model.mu_norm, self.model.P, self.model.sigma,
                         self.model.V),
                        feed_dict={self.model.inputs_x: x})
                    self.episodes_Ps.append(P)
                    self.episodes_ss.append(sigma)
                    self.episodes_Vs.append(V)
                    if self.separate_V:
                        self.episodes_V_s.append(self.critic.predict_V_sep(x))

                if explore:
                    u += noise.noise()
                    u = np.clip(u, -1.0, 1.0)

                u = u.reshape(1, -1)
                x1, r, t, info = self.env.step(u.reshape(-1))
                self.r_xs[-1].append(x)
                self.r_us[-1].append(u)
                self.r_rs[-1].append(r)
                episodes_reward += r
                self.buffer.add(x.reshape(1, -1), u, r, t, x1.reshape(1, -1))
                self.episodes_xs.append(x)
                self.episodes_us.append(u)
                self.episodes_rs.append(r)
                #Actor-Critic
                x = x1.reshape(1, -1)

                if self.qNAF:
                    for k in range(num_updates):
                        x_batch, u_batch, r_batch, t_batch, x1_batch = \
                            self.buffer.sample_batch(minibatch_size)
                        x_batch, u_batch, r_batch, t_batch, x1_batch = \
                            x_batch.reshape(-1, self.s_dim), u_batch.reshape(-1, self.a_dim), r_batch.reshape(-1, 1),\
                             t_batch.reshape(-1), x1_batch.reshape(-1, self.s_dim)

                        if self.qNAF:
                            y_batch = self.gamma * self.target_model.predict_V(
                                x1_batch) + r_batch
                            self.model.update_Q(x_batch, u_batch, y_batch)

                        self.target_model.soft_update_from(self.model)
                if t:
                    break
            if self.ac:
                r_xs_l = np.array(self.r_xs[-1]).reshape(-1, self.s_dim)
                r_rs_l = np.array(self.r_rs[-1]).reshape(-1, 1)
                for idx in range(2, len(r_rs_l) + 1):
                    r_rs_l[-idx] += self.gamma * r_rs_l[-idx + 1]
                self.r_rs[-1] = r_rs_l
                r_rs_ = np.array(self.r_rs).reshape(-1, 1)
                r_xs_ = np.array(self.r_xs).reshape(-1, self.s_dim)
                r_us_ = np.array(self.r_us).reshape(-1, self.a_dim)
                for _ in range(num_updates_ac):
                    #update V every episode
                    if self.separate_V:
                        self.critic.update_V_sep(r_xs_l, r_rs_l)
                    if i % T == 0:
                        #Q_target = r_rs_[:-1] + self.gamma * self.critic.predict_V_sep(r_xs_[1:])
                        #Q_target = np.vstack((Q_target, (r_rs_[-1])))
                        deltas = r_rs_
                        if self.separate_V:
                            deltas = deltas - self.critic.predict_V_sep(r_xs_)
                        else:
                            deltas = deltas - self.target_model.predict_V(
                                r_xs_)
                        '''
                        loss = self.sess.run((self.model.loss_spg),
                                                              feed_dict={self.model.inputs_x: r_xs_,
                                                                         self.model.inputs_u: r_us_,
                                                                         self.model.inputs_Q: deltas})
                        print('loss before update', loss)                     
                        '''
                        self.model.update_mu(r_xs_, r_us_, deltas)
                        self.target_model.soft_update_from(self.model)
                        '''
                        loss = self.sess.run((self.model.loss_spg),
                                                              feed_dict={self.model.inputs_x: r_xs_,
                                                                         self.model.inputs_u: r_us_,
                                                                         self.model.inputs_Q: deltas})
                        print('loss after update', loss)                     
                        '''
                        #self.target_model.soft_update_from(self.model)
                        self.r_rs = []
                        self.r_xs = []
                        self.r_us = []

            if summary_checkpoint > 0 and i % summary_checkpoint == 0:
                print('| Reward: %.2i' % int(episodes_reward), " | Episode", i)
                self.plot_rewards(self.summary_dir)
            self.rewards.append(episodes_reward)
Example #9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 fc1_units,
                 fc2_units,
                 weighted=False,
                 individual=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
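            fc1_units (int): units in the first hidden layer of the actor
            fc2_units (int): units in the second hidden layer of the actor
            weighted (bool): if True, use the Weight_adapter actor architecture
            individual (bool): if True, use the IndividualModel actor architecture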
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON_MAX

        # Actor Network (w/ Target Network)
        if weighted:
            self.actor_local = Weight_adapter(state_size,
                                              action_size).to(device)
            self.actor_target = Weight_adapter(state_size,
                                               action_size).to(device)
        elif individual:
            self.actor_local = IndividualModel(state_size, action_size,
                                               random_seed,
                                               fc1_units).to(device)
            self.actor_target = IndividualModel(state_size, action_size,
                                                random_seed,
                                                fc1_units).to(device)
        else:
            self.actor_local = Actor(state_size, action_size, random_seed,
                                     fc1_units, fc2_units).to(device)
            self.actor_target = Actor(state_size, action_size, random_seed,
                                      fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size,
                             random_seed,
                             mu=0,
                             theta=0.15,
                             sigma=0.2)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Make sure the target networks start with the same weights as the local (source) networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > LEARN_START:
            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # Learn, if enough samples are available in memory
                if len(self.memory) > BATCH_SIZE:
                    for _ in range(UPDATES_PER_STEP):
                        experiences = self.memory.sample()
                        self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        #print(action)
        self.actor_local.train()

        if add_noise:
            tem_noise = self.noise.sample()
            action += self.epsilon * tem_noise
        # print(tem_noise, np.clip(action, -1, 1))
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        else:
            self.epsilon = EPSILON_MIN

        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
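
Example #1 and Example #9 refer to module-level constants (and a device object) that are defined elsewhere in their repository. The values below are only a hedged sketch of typical DDPG-style settings, not the original configuration:

import torch

# Placeholder hyperparameters; the original repository defines its own values.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 penalty for the critic optimizer
LEARN_START = 1000       # minimum buffer fill before learning begins
UPDATE_EVERY = 20        # environment steps between learning rounds
UPDATES_PER_STEP = 10    # gradient updates per learning round
EPSILON_MAX = 1.0        # initial scale of the exploration noise
EPSILON_MIN = 0.01       # final scale of the exploration noise
EPSILON_DECAY = 1e-6     # per-update decrement of the noise scale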
Example #10
class Policy(nn.Module):
    """Actor (Policy) Model."""
    def __init__(self, policy_params):
        """ arch_parameters is a dictionary like:
        {'state_and_action_dims' : (num1, num2), layers : {'Linear_1' : layer_size_1,..,'Linear_n' : layer_size_n} }
        """
        super(Policy, self).__init__()
        self.policy_params = policy_params
        self.seed_as_int = policy_params['seed']
        torch.manual_seed(self.seed_as_int)
        self.arch_params = policy_params['arch_params']
        self.__state_dim = self.arch_params['state_and_action_dims'][0]
        self.__action_dim = self.arch_params['state_and_action_dims'][1]
        self.eps = policy_params['eps']
        self.min_eps = policy_params['min_eps']
        self.eps_decay = policy_params['eps_decay']
        self.__noise_type = policy_params['noise_type']

        keys = list(self.arch_params['layers'].keys())
        list_of_layers = []

        prev_layer_size = self.__state_dim
        for i in range(len(self.arch_params['layers'])):
            key = keys[i]
            layer_type = key.split('_')[0]
            if layer_type == 'Linear':
                layer_size = self.arch_params['layers'][key]
                list_of_layers.append(nn.Linear(prev_layer_size, layer_size))
                prev_layer_size = layer_size
            elif layer_type == 'LayerNorm':
                list_of_layers.append(nn.LayerNorm(prev_layer_size))
            elif layer_type == 'ReLU':
                list_of_layers.append(nn.ReLU())
            elif layer_type == 'Tanh':
                list_of_layers.append(nn.Tanh())
            else:
                print("Error: got unspecified layer type: '{}'. Check your layers!".format(layer_type))
                break

        self.layers = nn.ModuleList(list_of_layers)

        #noise
        if self.__noise_type == 'action':
            self.__rand_process = OUNoise((self.__action_dim,))

        elif self.__noise_type == 'parameter':
            self.network_params_perturbations = dict()
            for i in range(len(self.layers)):
                if 'Linear' in str(type(self.layers[i])):
                    self.network_params_perturbations[i] = OUNoise(tuple(self.layers[i].weight.shape))
        else:
            raise ValueError("Got an unsupported noise type. The only available options are 'parameter' and 'action'.")

    def forward(self, state):  # get action values
        """Build a network that maps state -> action."""


        if self.__noise_type == 'action':
            y = state.float()
            for i in range(len(self.layers)):
                y = self.layers[i](y).float()
            y_perturbed = y + self.eps*torch.from_numpy(self.__rand_process.noise()).float()
            return y, torch.clamp(y_perturbed, min = -1.0, max = 1.0)

        elif self.__noise_type == 'parameter':
            y = state.float()
            y_perturbed = state.float()
            for i in range(len(self.layers)):
                if not ('Linear' in str(type(self.layers[i]))):
                    y = self.layers[i](y).float() #layernorm
                    y_perturbed = self.layers[i](y_perturbed).float()
                else:
                    #weights
                    if (self.layers[i].weight).shape[1] == y.shape[0]: #if there is a single state
                        y = (self.layers[i].weight).matmul(y)
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = ((self.layers[i].weight) + self.eps*n).matmul(y_perturbed)
                    else:                                              #if there is a batch of states
                        y = y.matmul((self.layers[i].weight).t())
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = y_perturbed.matmul(((self.layers[i].weight) + self.eps*n).t())
                    #biases
                    y = y + (self.layers[i].bias)
                    y_perturbed = y_perturbed + (self.layers[i].bias)
            return y, y_perturbed
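
A brief usage sketch for the forward pass above, assuming a policy built as in the sketch after Example #3 (the dimension is a placeholder); the clean and the perturbed outputs come back together:

import torch

state = torch.randn(33)                    # single state, placeholder dimension
action, action_perturbed = policy(state)   # forward() returns (y, y_perturbed)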
Example #11
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        #self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward

        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #12
class RDPGAgent:
    def __init__(self, env, batchSize = 10, bufferSize = 100,
                 gamma = 0.98, actorLR = 1e-4, criticLR = 1e-3,
                 maxSteps = 200, targetUpdate = 1e-3, epsilon = 1,
                 decay = 0.99, rewardScale = 1e-3, logFile = 'run.log'):
        self.env = env
        self.gamma = gamma
        self.batchSize = batchSize
        self.bufferSize = bufferSize
        self.maxSteps = maxSteps + 1
        self.rewardScale = rewardScale
        self.epsilon = epsilon
        self.decay = decay

        # Useful helpers.
        self.actionDim = self.env.action_space.shape[0]
        self.stateDim = self.env.observation_space.shape[0]
        self.featureDim = self.actionDim + self.stateDim
        self.minAction = self.env.action_space.low
        self.maxAction = self.env.action_space.high

        # For scaling output action values.
        self.actionBiasZeroOne = self.minAction
        self.actionScaleZeroOne = self.maxAction - self.minAction
        self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
        self.actionScaleTanH = self.maxAction - self.actionBiasTanH 

        # Initialize noise process.
        self.noise = OUNoise(self.actionDim)

        # Initialize replay buffer.
        self.buffer = ReplayBuffer(self.bufferSize)

        # Initialize logging.
        logging.basicConfig(filename = logFile,
                            level = logging.INFO,
                            format = '[%(asctime)s] %(message)s',
                            datefmt = '%m/%d/%Y %I:%M:%S %p')
        logging.info('Initializing DRPG agent with passed settings.')

        # Tensorflow GPU optimization.
        config = tf.ConfigProto() # GPU fix?
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config = config)
        from keras import backend as K
        K.set_session(self.sess)

        # Make actor network (creates target model internally).
        self.actor = Actor(self.sess, self.maxSteps, self.featureDim,
                           self.actionDim, self.batchSize, targetUpdate,
                           actorLR, self.actionScaleTanH, self.actionBiasTanH)

        # Make critic network (creates target model internally).
        self.critic = Critic(self.sess, self.maxSteps, self.featureDim,
                             self.actionDim, self.batchSize, targetUpdate,
                             criticLR)  # use the critic's own learning rate (criticLR was otherwise unused)

    # Train or run for some number of episodes.
    def run(self, numEpisodes, training = False, warmUp = 30):
        for i in range(numEpisodes):
            sequence = []
            totalReward = 0
            totalSteps = 0
            o = self.env.reset()

            # Stores (O1, A1, O2, A2, etc) for prediction.
            history = np.zeros((self.maxSteps * self.featureDim))
            history[:self.stateDim] = o
            for j in range(self.maxSteps - 1):
                # We do this reshaping to get history into (BatchSize, TimeSteps, Dims).
                batchedHistory = np.reshape(history, (self.maxSteps, self.featureDim))
                batchedHistory = np.expand_dims(batchedHistory, axis = 0)

                # Predict action or use random with e-greedy.
                # if (np.random.random_sample() < self.epsilon and training):
                #     a = np.random.random((self.actionDim))
                #     a = a * self.actionScaleZeroOne
                #     a = a + self.actionBiasZeroOne
                # else:
                #     a = self.actor.model.predict(batchedHistory)[0]

                # Predict an action and add noise to it for exploration purposes.
                a = self.actor.model.predict(batchedHistory)[0] + self.epsilon * self.noise.noise()
                a = np.clip(a, self.minAction, self.maxAction)

                # Take a single step.
                oPrime, r, d, _ = self.env.step(a)
                r *= self.rewardScale

                newTimeStart = (j + 1) * self.featureDim
                # Update agent state and ongoing agent history data. History is
                # passed to our actor for prediction, and sequence is for later.
                history[j * self.featureDim + self.stateDim:newTimeStart] = a
                history[newTimeStart:(j + 1) * self.featureDim + self.stateDim] = oPrime
                sequence.append({'o': o, 'a': a, 'r': r, 'd': d})
                totalReward += r
                totalSteps += 1
                o = oPrime

                # Quit early.
                if d: break

            # Anneal epsilon.
            if i > warmUp:
                self.epsilon *= self.decay

            # Print some episode debugging and reward information.
            print('Episode: %03d / Steps: %d / Reward: %f' % (i + 1, totalSteps, totalReward / self.rewardScale))
            logging.info('Episode: %03d / Steps: %d / Reward: %f' % (i + 1, totalSteps, totalReward / self.rewardScale))

            # Simulation only.
            if not training:
                continue

            # Add sequence to buffer.
            self.buffer.add(sequence)

            # Resample sequences from the buffer
            samples = self.buffer.getBatch(self.batchSize)
            numSamples = len(samples)

            # Do not train until we have
            # seen warmUp episodes.
            if self.buffer.getCount() < warmUp:
                continue

            # Some more debug info.
            print('Training on sampled sequences from all episodes.')
            logging.info('Training on sampled sequences from all episodes.')

            # All of these do not include time step t = T.
            # Used to store H[i, t] for each episode i and step t.
            sampleHistories = np.zeros((numSamples, self.maxSteps - 1,
                                        self.maxSteps * self.featureDim))
            # Used to store H[i, t + 1] for each episode i and step t.
            sampleHistoriesWithNext = np.zeros((numSamples, self.maxSteps - 1,
                                                self.maxSteps * self.featureDim))
            # Used to store R[i, t] for each episode i and step t.
            sampleRewards = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store NotDone[i, t] for each episode i and step t.
            sampleNotDoneMasks = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store action[i, t] taken for each episode i and step t.
            sampleActions = np.zeros((numSamples, self.maxSteps - 1, self.actionDim))

            # Compute info for each episode i.
            for i in range(numSamples):
                sample = samples[i]
                historySoFar = np.zeros((self.maxSteps * self.featureDim))
                # Iteratively build up historySoFar for each timestep t.
                for t in range(len(sample) - 1):
                    step, nextStep = sample[t], sample[t + 1]
                    # This is (oT, aT), which we are adding to running history.
                    history = np.concatenate([step['o'], step['a']], axis = 0)
                    historySoFar[t * self.featureDim:(t + 1) * self.featureDim] = history

                    # This is (o1, a1, o2, a2 ... ot).
                    sampleHistoryEnd = (t + 1) * self.featureDim - self.actionDim
                    sampleHistories[i, t, :sampleHistoryEnd] = historySoFar[:sampleHistoryEnd]

                    # This is (o1, a1, o2, a2 ... ot, at, ot+1).
                    sampleNextEnd = (t + 1) * self.featureDim
                    sampleHistoriesWithNext[i, t, :sampleNextEnd] = historySoFar[:sampleNextEnd]
                    sampleHistoriesWithNext[i, t, sampleNextEnd:sampleNextEnd + self.stateDim] = nextStep['o']

                    # Set rewards and not done masks.
                    sampleRewards[i, t] = step['r']
                    sampleActions[i, t] = step['a']
                    sampleNotDoneMasks[i, t] = 0 if step['d'] else 1

            # Separate out self.maxSteps since it is the timestep dimension for RNN.
            sampleHistories = np.reshape(sampleHistories, (numSamples, self.maxSteps - 1,
                                                           self.maxSteps, self.featureDim))

            # Separate out self.maxSteps since it is the timestep dimension for RNN.
            sampleHistoriesWithNext = np.reshape(sampleHistoriesWithNext, (numSamples, self.maxSteps - 1,
                                                                           self.maxSteps, self.featureDim))

            # Update models using samples, rewards, and masks.
            self.update(numSamples, sampleHistories, sampleHistoriesWithNext,
                        sampleRewards, sampleActions, sampleNotDoneMasks)

    # Given a bunch of experienced histories, update our models.
    def update(self, numSamples, histories, historiesNext, rewards, chosenActions, notDoneMasks):
        # Reshape [i, t] pairs to a single dimension, which will be the RNN batch dimension.
        historiesBatch = np.reshape(histories, (-1, self.maxSteps, self.featureDim))
        historiesNextBatch = np.reshape(historiesNext, (-1, self.maxSteps, self.featureDim))

        # Compute QSample targets [y] for updating the critic Q[S][A] outputs.
        targetActions = self.actor.target.predict(historiesNextBatch) # (B * (T - 1), F).
        targetQ = self.critic.target.predict([historiesNextBatch, targetActions]) # (B * (T - 1), 1).
        targetQ = np.reshape(targetQ, (numSamples, self.maxSteps - 1)) # (B, T - 1).
        y = rewards + notDoneMasks * (self.gamma * targetQ) # (B, T - 1)
        y = np.reshape(y, (numSamples * (self.maxSteps - 1), 1)) # (B * (T - 1), 1)

        # Train the critic model, passing in both the history and chosen actions.
        chosenActionsFlat = np.reshape(chosenActions, (numSamples * (self.maxSteps - 1), self.actionDim))
        # print (chosenActionsFlat.shape, historiesBatch.shape, historiesNextBatch.shape)
        self.critic.model.train_on_batch([historiesBatch, chosenActionsFlat], y)

        # Compute the gradient of the critic output WRT to its action input.
        # We cannot use chosenActions here since those were noisy predictions.
        currentActionsForGrad = self.actor.model.predict(historiesBatch)
        currentActionsGrad = self.critic.modelActionGradients(historiesBatch, currentActionsForGrad)

        # Train the actor model using the critic gradient WRT action input.
        self.actor.trainModel(historiesBatch, currentActionsGrad)

        # Update target models.
        self.actor.trainTarget()
        self.critic.trainTarget()
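
The ReplayBuffer used in this example stores whole episode sequences and exposes add, getBatch, and getCount. A minimal sketch consistent with those calls (not necessarily the repository's implementation):

import random
from collections import deque


class ReplayBuffer:
    """Episode-level replay buffer (sketch; the original implementation may differ)."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)  # oldest episodes fall off the end

    def add(self, sequence):
        """Store one full episode (a list of step dicts)."""
        self.buffer.append(sequence)

    def getBatch(self, batch_size):
        """Sample up to batch_size episodes uniformly at random."""
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))

    def getCount(self):
        """Number of stored episodes."""
        return len(self.buffer)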
Example #13
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 1e-5  #.0001
        self.critic_lr = 1e-4  #0.0000001

        self.network = [128, 256, 128]

        self.train = train
        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if (self.train):
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high,
                                     actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size,
                                       critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size,
                                        critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0  # Mean
            self.exploration_theta = 0.15  #.15 How fast variable reverts to mean
            self.exploration_sigma = 0.2  #.2 Degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta,
                                 self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01  # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback,
            # which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)

            self.tensorboard.set_model(self.critic_local.model)
            self.summary_writer = tf.summary.FileWriter("scores")

            self.batch_id = 0
Example #14
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 1e-5  #.0001
        self.critic_lr = 1e-4  #0.0000001

        self.network = [128, 256, 128]

        self.train = train
        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if (self.train):
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high,
                                     actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size,
                                       critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size,
                                        critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0  # Mean
            self.exploration_theta = 0.15  #.15 How fast variable reverts to mean
            self.exploration_sigma = 0.2  #.2 Degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta,
                                 self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01  # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback,
            # which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)

            self.tensorboard.set_model(self.critic_local.model)
            self.summary_writer = tf.summary.FileWriter("scores")

            self.batch_id = 0

    def reset_episode(self):
        if (self.train):
            self.noise.reset()
            self.noise_arr = []
            self.noise_matrix = [0., 0., 0., 0.]

        state = self.task.reset()
        self.last_state = state
        return state

    def save_initial_weights(self):
        self.actor_local.model.save_weights('actor_local.h5')
        self.actor_target.model.save_weights('actor_target.h5')
        self.critic_local.model.save_weights('critic_local.h5')
        self.critic_target.model.save_weights('critic_target.h5')

    def load_initial_weights(self):
        self.actor_local.model.load_weights('actor_local.h5')
        self.actor_target.model.load_weights('actor_target.h5')
        self.critic_local.model.load_weights('critic_local.h5')
        self.critic_target.model.load_weights('critic_target.h5')

    def save_model(self):
        # Save the weights
        self.actor_local.model.save_weights('model_weights.h5')

    def load_weights(self, option=None):
        if (option == None):
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('model_weights.h5')
        else:
            self.trained = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.network)
            self.trained.model.load_weights('weights-best.hdf5')
            print(self.trained.model.summary())

    def predict(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.trained.model.predict(state)[0]
        return action

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size * 2):
            experiences = self.memory.sample()
            self.learn(experiences)

        if (len(self.memory) == self.buffer_size):
            self.memory.memory.clear()
            print("buffer cleared")

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        noise = self.noise.sample()
        action = list(self.actor_local.model.predict(state)[0] + noise)

        return action, noise  # add some noise for exploration

    def learn(self, experiences):  #experiences
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        '''
        print("States", states.shape)
        print("actions", actions.shape)
        print("rewards", rewards.shape)
        print("dones", dones.shape)
        print("Next states", next_states.shape)
        '''
        # keep training actor local and critic local
        # use values from target model to update and train local
        # don't train target models, we soft update target

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))

        actions_next = self.actor_target.model.predict_on_batch(
            next_states)  #target

        # Q values predicted by the target critic
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])  #target

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients,
                                                1])  # custom training function

        self.tensorboard.on_epoch_end(
            self.batch_id, named_logs(self.critic_local.model, [critic_loss]))
        self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        # Blend layer by layer: theta_target <- tau*theta_local + (1 - tau)*theta_target
        new_weights = [self.tau * lw + (1 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
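
The soft update above blends every layer of the local model into the target model with the Polyak rule θ_target ← τ·θ_local + (1 − τ)·θ_target. As a minimal, framework-free sketch of the same rule (the helper name and the toy weight lists below are hypothetical, not part of the example):

import numpy as np

def polyak_update(local_weights, target_weights, tau=0.01):
    """Blend layer by layer: theta_target <- tau*theta_local + (1 - tau)*theta_target."""
    return [tau * lw + (1.0 - tau) * tw
            for lw, tw in zip(local_weights, target_weights)]

# Toy two-layer weight lists standing in for model.get_weights() output.
local = [np.ones((4, 4)), np.ones(4)]
target = [np.zeros((4, 4)), np.zeros(4)]
blended = polyak_update(local, target, tau=0.1)  # every entry becomes 0.1
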
class Agent:
    """
    Interacts with and learns from the environment.
    """
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize an Agent

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): number of simultaneously running agents
            random_seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        """
        Save experience in replay memory, and use prioritized sample from buffer to learn.
        """

        # Save each agent's experience to the replay memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY

        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # Learn from memory if enough samples exist
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, states, add_noise=True):
        """
        Returns actions for given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[i, :] = action

        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices = experiences

        # Update critic
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update replay priorities with the absolute TD error
        delta = (Q_targets - Q_expected).abs().detach().cpu().numpy()
        self.memory.update_priorities(delta, indices)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """

        for target_model_param, local_model_param in zip(
                target_model.parameters(), local_model.parameters()):
            target_model_param.data.copy_(tau * local_model_param.data +
                                          (1. - tau) * target_model_param.data)
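
In learn() above, the critic target is the one-step bootstrap Q_targets = r + γ·Q_targets_next·(1 − done), and the absolute TD error drives the prioritized replay buffer. A minimal PyTorch sketch of just that computation, with dummy tensors standing in for the network outputs (all values below are illustrative only):

import torch

gamma = 0.99
rewards = torch.tensor([[1.0], [0.0], [0.5], [1.0]])
dones = torch.tensor([[0.0], [0.0], [1.0], [0.0]])
q_next = torch.tensor([[2.0], [1.5], [3.0], [0.5]])      # critic_target(s', actor_target(s'))
q_expected = torch.tensor([[1.8], [1.2], [0.6], [1.4]])  # critic_local(s, a)

q_targets = rewards + gamma * q_next * (1 - dones)  # no bootstrap on terminal transitions
td_error = (q_targets - q_expected).abs()           # magnitudes become the replay priorities
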
Ejemplo n.º 16
0
class Agent:
    '''The most perfect DDPG Agent you have ever seen'''
    # Parameters taken from various sources
    epsilon = 0
    epsilon_min = 0
    decay = 0.9

    learn_start = 1000
    gamma = 0.99
    alpha = 0.002
    tau = 0.005

    mem_len = 1e5
    memory = deque(maxlen=int(mem_len))

    def __init__(self, env, seed):

        self.env = env
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)
        self.env.seed(seed)
        self.actor = self.createModel()
        self.target_actor = self.createModel()
        self.noise = OUNoise(
            self.env.action_space.shape[0], seed, theta=0.2, sigma=0.5
        )  # noise is actually OpenAI baselines OU Noise wrapped in another OUNoise function
        self.critic = self.createModel((self.env.observation_space.shape[0],
                                        self.env.action_space.shape[0]))
        self.target_critic = self.createModel(
            (self.env.observation_space.shape[0],
             self.env.action_space.shape[0]))
        self.target_critic.set_weights(
            self.critic.get_weights())  # ensure initial weights are equal for both networks
        self.target_actor.set_weights(self.actor.get_weights())
        self.reset()
        # return self.actor

    def createModel(self, input=None):
        '''Generate neural network models based on inputs, defaults to Actor model'''
        last_init = tf.random_uniform_initializer(
            minval=-0.003, maxval=0.003
        )  # To prevent actor network from causing steep gradients
        if input is None:
            input = self.env.observation_space.shape[0]  # Actor
            inputs = keras.layers.Input(shape=(input, ))
            hidden = keras.layers.Dense(256, activation="relu")(inputs)
            hidden = keras.layers.Dense(256, activation="relu")(hidden)
            outputs = keras.layers.Dense(1,
                                         activation="tanh",
                                         kernel_initializer=last_init)(hidden)
            model = Actor(inputs, outputs)
            lr_schedule = keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=self.alpha / 2,
                decay_steps=1e9,
                decay_rate=1
            )  #This could allow us to use decaying learning rate
            model.compile(
                loss="huber_loss", optimizer=Adam(learning_rate=lr_schedule)
            )  #Compile model with optimizer so we can apply tape.gradient later
        else:  # Critic
            input_o, input_a = input
            input1 = keras.layers.Input(shape=(input_o, ))
            input2 = keras.layers.Input(shape=(input_a, ))
            input11 = keras.layers.Dense(16, activation="relu")(input1)
            input11 = keras.layers.Dense(32, activation="relu")(input11)
            input21 = keras.layers.Dense(32, activation="relu")(input2)
            cat = keras.layers.Concatenate()([input11, input21])
            hidden = keras.layers.Dense(256, activation="relu")(cat)
            hidden = keras.layers.Dense(256, activation="relu")(hidden)
            outputs = keras.layers.Dense(1,
                                         activation="linear",
                                         kernel_initializer=last_init)(hidden)
            lr_schedule = keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=self.alpha / 1,
                decay_steps=1e9,
                decay_rate=1)
            model = Critic([input1, input2], outputs)
            model.compile(loss="mean_squared_error",
                          optimizer=Adam(
                              learning_rate=lr_schedule))  # mean_squared_error
        return model

    def replayBuffer(self, state, action, reward, next_state, terminal):
        ##TODO Implement prioritised buffer
        self.memory.append([state, action, reward, next_state, terminal])

    @tf.function  # compile to a static TF graph (rather than eager execution) for speeeed
    def replay(self, states, actions, rewards, next_states):
        '''tf function that replays sampled experience to update the actor and critic networks via gradients'''
        # Very much inspired by Keras tutorial: https://keras.io/examples/rl/ddpg_pendulum/
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_states, training=True)
            # Note: terminal transitions are not masked out of the bootstrap term here
            q_target = rewards + self.gamma * self.target_critic(
                [next_states, target_actions], training=True)
            q_current = self.critic([states, actions], training=True)
            critic_loss = tf.math.reduce_mean(
                tf.math.square(q_target - q_current))

        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions_pred = self.actor(states, training=True)
            q_current = self.critic([states, actions_pred], training=True)
            actor_loss = -tf.math.reduce_mean(q_current)

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

    @tf.function
    def update_weight(self, target_weights, weights, tau):
        '''tf function for updating the weights of selected target network'''
        for (a, b) in zip(target_weights, weights):
            a.assign(b * tau + a * (1 - tau))

    def trainTarget(self):
        '''Standard function to update target networks by tau'''
        self.update_weight(self.target_actor.variables, self.actor.variables,
                           self.tau)
        self.update_weight(self.target_critic.variables, self.critic.variables,
                           self.tau)

    def sample2batch(self, batch_size=64):
        '''Return a set of Tensor samples from the memory buffer of batch_size, default is 64'''
        # batch_size = 64

        if len(self.memory) < batch_size:  # return nothing if not enough experiences are available
            return
        # Generate batch and empty arrays
        samples = random.sample(self.memory, batch_size)
        next_states = np.zeros(
            (batch_size, self.env.observation_space.shape[0]))
        states = np.zeros((batch_size, self.env.observation_space.shape[0]))
        rewards = np.zeros((batch_size, 1))
        actions = np.zeros((batch_size, self.env.action_space.shape[0]))

        # Separate batch into arrays
        for idx, sample in enumerate(samples):
            state, action, reward, next_state, terminal = sample
            states[idx] = state
            actions[idx] = action
            rewards[idx] = reward
            next_states[idx] = next_state

        # Convert arrays to tensors so we can use replay as a callable TensorFlow graph
        states = tf.convert_to_tensor(states)
        rewards = tf.cast(tf.convert_to_tensor(rewards), dtype=tf.float32)
        actions = tf.convert_to_tensor(actions)
        next_states = tf.convert_to_tensor(next_states)

        return (states, actions, rewards, next_states)

    def train(self, state, action, reward, next_state, terminal, steps):
        '''Function call to update buffer and networks at predetermined intervals'''
        self.replayBuffer(state, action, reward, next_state,
                          terminal)  # Add new data to buffer
        if steps % 1 == 0 and len(self.memory) > self.learn_start:  # Sample every X steps (X = 1 here)
            samples = self.sample2batch()
            states, actions, rewards, next_states = samples
            self.replay(states, actions, rewards, next_states)
        if steps % 1 == 0:  # Update target networks every X steps (X = 1 here)
            self.trainTarget()

    def reset(self):
        self.epsilon *= self.decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

    def chooseAction(self, state, scale=False):
        '''Choose action based on policy and noise function. Scale option used to limit maximum actions'''
        # self.epsilon *= self.decay
        # self.epsilon = round(max(self.epsilon / 1000, self.epsilon), 5)
        # print(state[0])
        state = tf.expand_dims(tf.convert_to_tensor(state), 0)  # convert to tensor for speeeed
        if np.random.random() < self.epsilon:  # if using epsilon instead of exploration noise
            return random.uniform(-1, 1)
        if scale:
            return np.clip(0.33 * self.actor(state) + self.noise.sample(), -1, 1)
        return np.clip(tf.squeeze(self.actor(state)).numpy() + self.noise.sample(), -1, 1)
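
A minimal usage sketch for the agent above, assuming the older Gym API its constructor relies on (env.seed and a 4-tuple return from env.step) and a 1-D continuous-control task; the environment name and the episode/step counts are illustrative, not from the example:

import gym

env = gym.make("Pendulum-v0")          # 1-D action space; the actor outputs tanh in [-1, 1]
agent = Agent(env, seed=0)

for episode in range(100):
    state = env.reset()
    agent.reset()                      # decays epsilon (a no-op while epsilon is 0)
    for step in range(200):
        action = agent.chooseAction(state)
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, reward, next_state, done, step)
        state = next_state
        if done:
            break

Note that Pendulum's torque range is ±2 while the clipped policy output stays in [-1, 1], so in practice the action would be rescaled before env.step; the loop only illustrates the call order chooseAction, train, and a per-episode reset.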