Example #1
class Actor_Critic():
    def __init__(self, env, sess):
        self.env = env
        self.sess = sess
        self.memory_buffer = Replay_Buffer(BUFFER_SIZE, BATCH_SIZE)
        self.learning_rate = LR
        self.tau = TAU
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.discount = 0.99
        self.Actor = Actor(self.env, self.sess, self.learning_rate, self.tau,
                           self.discount)
        self.Critic = Critic(self.env, self.sess, self.learning_rate, self.tau,
                             self.discount)

    def update_target(self):
        self.Actor.actor_target_update()
        self.Critic.critic_target_update()

    #def train(self):

    def save(self, prefix):
        self.Actor.save(prefix)
        self.Critic.save(prefix)
        self.memory_buffer.save()
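In Example #1, update_target delegates to actor_target_update and critic_target_update, whose bodies are not shown. Below is a minimal sketch of the soft (Polyak) update such routines typically apply, assuming Keras-style models with get_weights/set_weights; the soft_update name and signature are illustrative, not part of the example.

def soft_update(source_model, target_model, tau):
    """Polyak-average: move each target weight a fraction tau toward the online weight."""
    blended = [
        tau * w + (1.0 - tau) * w_t
        for w, w_t in zip(source_model.get_weights(), target_model.get_weights())
    ]
    target_model.set_weights(blended)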
Example #2
class DDPG(object):
    """deep deterministic policy gradient
    """
    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 gamma=0.99,
                 tau=0.01,
                 actor_lr=0.0005,
                 critic_lr=0.001,
                 noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=10,
                 logger=None,
                 checkpoint_queen=None):
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.pointer = 0
        self.buffer_size = buffer_size
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state,
                           self.n_action,
                           gamma=gamma,
                           lr=actor_lr,
                           tau=tau,
                           l2_reg=0)
        self.critic = Critic(self.n_state,
                             self.n_action,
                             gamma=gamma,
                             lr=critic_lr,
                             tau=tau,
                             l2_reg=0)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen

        self.prefix = self.__class__.__name__.lower()

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def policy_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        q_target = np.zeros_like(
            rewards)  # np.asarray(copy=False) vs. np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.critic.gamma * q_nexts[i]
        return q_target

    def update_model(self, states, actions, q_values):
        # train critic
        loss_names, loss_values = self.critic.train_on_batch(
            states, actions, q_values)

        # train actor
        # p_actions = self.actor.predict(states)  #actions with no noise
        grad_ys = self.critic.gradients(
            states, self.actor.predict(states))  #(batch, n-action)
        actor_output = self.actor.train(states, self.actor.predict(states),
                                        grad_ys)

        # copy network
        self.actor.copy_weights()
        self.critic.copy_weights()

        # print(grad_ys, grad_ys.shape)
        # print(actor_output[0],actor_output[0].shape)
        # print(np.mean(grad_ys*actor_output[0]))

        return loss_names, loss_values, grad_ys, actor_output

    def save_weights(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic.model.save(
            os.path.join(path, self.prefix + '_critic_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4f}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            critic = os.path.join(
                path, self.prefix + '_critic_' + to_delete[1] + '.h5')
            os.remove(actor)
            os.remove(critic)
        if need_save:
            self.save_model(path, signature)

    def train(self,
              args,
              summary_writer,
              train_data=None,
              val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # if val_data is None this becomes array(None), not None
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs),
                      desc='score',
                      leave=True,
                      unit=" epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir,
                                args.reg_pattern,
                                chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
        # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, args.batch_size, shuffle=True)
        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            a = self.policy_action(states)  #(batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            # a = np.clip(np.random.normal(a, self.noise_std), self.a_bound[0], self.a_bound[1])
            # a = np.clip(a + noise.generate(time, a.shape[0]), self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            r = np.where(labels == 1, llr.ravel(), -llr.ravel())  #(batch,)
            # q_nexts = self.critic.target_predict(new_states, self.actor.target_predict(new_states))
            q_ = self.bellman_q_value(rewards=r,
                                      q_nexts=0,
                                      dones=[True] * r.shape[0])  #(batch,)
            loss_names, loss_values, grad_ys, actor_output = self.update_model(
                states, a, q_.reshape(-1, 1))

            score = r.mean()

            if (e + 1) % self.noise_decay_steps == 1:  # every noise_decay_steps epochs, starting at e == 0
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)
            if e % self.assess_interval == 0 or e == args.batchs - 1:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(
                        val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(
                        test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

                summary_writer.add_summary(tf_summary(['mean-reward'],
                                                      [score]),
                                           global_step=e)
                summary_writer.add_summary(tf_summary(loss_names,
                                                      [loss_values]),
                                           global_step=e)
                merge = keras.backend.get_session().run(
                    self.merge,
                    feed_dict={
                        self.critic.model.input[0]: states,
                        self.critic.model.input[1]: a,
                        self.actor.model.input: states
                    })
                summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            # print(grad_ys,grad_ys.shape)
            # print(actor_output)
            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' %
                (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch,act_dim)
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, r.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4f}'.format(self.noise_std),
                               max_val_rate='{:.4f}'.format(max_val_rate),
                               val_rate='{:.4f}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()

        return results
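The bellman_q_value loop in Example #2 can also be written as a single vectorized expression; below is a minimal equivalent sketch (the standalone function name and the gamma argument are illustrative).

import numpy as np

def bellman_q_value_vectorized(rewards, q_nexts, dones, gamma=0.99):
    """Terminal steps keep only the reward; others bootstrap with gamma * Q(s', a')."""
    rewards = np.asarray(rewards, dtype=np.float64)
    q_nexts = np.asarray(q_nexts, dtype=np.float64)
    dones = np.asarray(dones, dtype=bool)
    return np.where(dones, rewards, rewards + gamma * q_nexts)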
Example #3
class DDPG(object):
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """
    def __init__(self,
                 action_dim,
                 state_dim,
                 batch_size,
                 step,
                 buffer_size,
                 train_indicator,
                 episode,
                 gamma,
                 lra,
                 lrc,
                 tau,
                 load_weight=True):
        """ Initialization
        """
        # Environment and DDPG parameters
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.step = step
        self.gamma = gamma
        self.lra = lra
        self.lrc = lrc
        self.tau = tau
        self.episode = episode
        self.train_indicator = train_indicator
        # Create actor and critic networks
        self.actor = Actor(state_dim, action_dim, batch_size, lra, tau)
        self.critic = Critic(state_dim, action_dim, batch_size, lrc, tau)
        self.buffer = MemoryBuffer(buffer_size)
        # NOTE: the weights folder must be specified, and only one set of actor & critic weights should be in it
        self.weights_dir_path = os.getcwd() + r"\saved_model\*.h5"

        if load_weight:
            try:
                weights_actor_path = ""
                weights_critic_path = ""
                weights_file_path = glob.glob(self.weights_dir_path)

                for file_path in weights_file_path:
                    if file_path.find("actor") < 0:
                        weights_critic_path = file_path
                    if file_path.find("critic") < 0:
                        weights_actor_path = file_path

                self.load_weights(weights_actor_path, weights_critic_path)

                print("")
                print("Actor-Critic Models are loaded with weights...")
                print("")
            except Exception:
                print("")
                print(
                    "Failed to load weights, please check the weights loading path..."
                )
                print("")

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target (one action only)
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state_old, action, reward, done, state_new):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state_old, action, reward, done, state_new)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions,
                         np.array(grads).reshape((-1, self.action_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def run(self, env):
        # First, gather experience
        for e in range(self.episode):
            # Reset episode
            # set initial state
            loss, cumul_reward, cumul_loss = 0, 0, 0
            done = False
            state_old = env.get_vissim_state(
                1, 180 * 5, [45, 55, 60, 65, 70, 75, 80]
            )  # TODO: make sure states are received correctly
            actions, states, rewards = [], [], []

            print("Episode: ", e, " ========================:")

            for t in range(self.step):
                action_original = self.policy_action(state_old)

                #TODO: OU function params?
                noise = OrnsteinUhlenbeckProcess(x0=action_original,
                                                 size=self.action_dim)

                # action = action_orig + noise
                action = noise.apply_ou(t)

                # clip too-low or too-high action components into [-1, 1]
                adj_action = np.clip(action, -1, 1)

                #action_mapping function
                transformed_action = Transformation.convert_actions(adj_action)

                reward, state_new = env.get_vissim_reward(
                    180 * 5, transformed_action)

                # TODO: if we know the optimal discharging rate, we can treat reaching it as done
                if t == self.step - 1:  # we treat the manually set last step as done
                    done = True

                # ======================================================================================= Training section
                if (self.train_indicator):
                    # Add outputs to memory buffer
                    self.memorize(state_old, adj_action, reward, done,
                                  state_new)
                    # Sample experience from buffer
                    states_old, actions, rewards, dones, states_new = self.sample_batch(
                        self.batch_size)
                    # Predict target q-values using target networks
                    q_values = self.critic.target_predict(
                        [states_new,
                         self.actor.target_predict(states_new)])
                    # Compute critic target
                    critic_target = self.bellman(rewards, q_values, dones)
                    # Train both networks on sampled batch, update target networks
                    self.update_models(states_old, actions, critic_target)
                    # calculate loss (note: train_on_batch is called a second time here,
                    # so the critic takes an extra gradient step just to report the loss)
                    loss = self.critic.train_on_batch(states_old, actions,
                                                      critic_target)
                    state_old = state_new
                    cumul_reward += reward
                    cumul_loss += loss
                # =======================================================================================

                # ======================================================================================= report
                print("|---> Step: ", t, " | Action: ", transformed_action,
                      " | Reward: ", reward, " | Loss: ", loss)
                # =======================================================================================

            # ======================================================================================= save model
            if np.mod(e, 10) == 0:
                print("====================> Saving model...")
                self.save_weights("./saved_model/")
                """
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                """
            # ======================================================================================= save model

            print("")
            print("*-------------------------------------------------*")
            print("Average Accumulated Reward: " +
                  str(cumul_reward / self.step))
            print("Average Accumulated Loss: " + str(cumul_loss / self.step))
            print("*-------------------------------------------------*")
            print("")

            # garbage recycling
            gc.collect()

    def save_weights(self, path):
        t = datetime.datetime.now()
        time = "_" + str(t.date()) + "_" + str(t.hour) + "h-" + str(
            t.minute) + "m"
        path_actor = path + '_LR_{}'.format(self.lra) + time
        path_critic = path + '_LR_{}'.format(self.lrc) + time
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load(path_actor)
        self.critic.load(path_critic)
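Example #3 draws exploration noise from an OrnsteinUhlenbeckProcess whose implementation is not shown here. Below is a minimal sketch of such a process; the class name and the theta, sigma, dt parameters are assumptions for illustration, not the actual class used above.

import numpy as np

class SimpleOUNoise:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1.0):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.x = np.copy(self.mu)

    def sample(self):
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        self.x = self.x + dx
        return self.x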
Example #4
class TD3(object):
    """deep deterministic policy gradient
    """
    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 discount=0.99,
                 tau=0.05,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 policy_freq=2,
                 exp_noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 smooth_noise_std=0.1,
                 clip=0.2,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=20,
                 logger=None,
                 checkpoint_queen=None):
        #self.__dict__.update(locals())
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = exp_noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.policy_freq = policy_freq
        self.smooth_noise_std = smooth_noise_std
        self.clip = clip
        self.discount = discount

        self.pointer = 0
        self.buffer = MemoryBuffer(buffer_size, with_per=True)
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state,
                           self.n_action,
                           gamma=discount,
                           lr=actor_lr,
                           tau=tau)
        self.critic1 = Critic(self.n_state,
                              self.n_action,
                              gamma=discount,
                              lr=critic_lr,
                              tau=tau)
        self.critic2 = Critic(self.n_state,
                              self.n_action,
                              gamma=discount,
                              lr=critic_lr,
                              tau=tau)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic1.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic1.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic1.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def select_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        q_target = np.zeros_like(
            rewards)  # np.asarray(copy=False) vs. np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.discount * q_nexts[i]
        return q_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        if (self.buffer.with_per):
            q_val = reward
            q_val_t = self.critic1.target_predict(state, action)
            td_error = abs(q_val_t - q_val)[0]
            # print(td_error)
        else:
            td_error = 0
        state = state.reshape(-1)
        action = action.reshape(-1)
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_actor(self, states):
        actions = self.actor.predict(states)
        grad_ys = self.critic1.gradients(states, actions)
        actor_output = self.actor.train(states, actions, grad_ys)
        self.actor.copy_weights()
        self.critic1.copy_weights()
        self.critic2.copy_weights()
        return grad_ys, actor_output

    def update_critic(self, states, actions, q_values):
        loss_names, loss_values = self.critic1.train_on_batch(
            states, actions, q_values)
        self.critic2.train_on_batch(states, actions, q_values)
        return loss_names, loss_values

    def save_weights(self, path):
        self.actor.save(path)
        self.critic1.save(path)
        self.critic2.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic1.model.save(
            os.path.join(path, self.prefix + '_critic1_' + file + '.h5'))
        self.critic2.model.save(
            os.path.join(path, self.prefix + '_critic2_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            delete_actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            delete_critic1 = os.path.join(
                path, self.prefix + '_critic1_' + to_delete[1] + '.h5')
            delete_critic2 = os.path.join(
                path, self.prefix + '_critic2_' + to_delete[1] + '.h5')
            os.remove(delete_actor)
            os.remove(delete_critic1)
            os.remove(delete_critic2)
        if need_save:
            self.save_model(path, signature)

    def train(self,
              args,
              summary_writer,
              train_data=None,
              val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # if val_data is None this becomes array(None), not None
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs),
                      desc='score',
                      leave=True,
                      unit="epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir,
                                args.reg_pattern,
                                chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
        # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, 1, shuffle=True)

        warm_up = 20 * args.batch_size
        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            a = self.select_action(states)  #(batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            # rewards = np.where(labels==1, llr.ravel(), -llr.ravel())  #(batch,)
            rewards = np.where(labels == 1,
                               np.where(llr > 0, llr.ravel(), 2 * llr.ravel()),
                               np.where(llr < 0, -llr.ravel(),
                                        -2 * llr.ravel()))  #(batch,)
            # print(rewards)

            # a_ = self.actor.target_predict(next_states)
            # noise = np.clip(np.random.normal(0, self.smooth_noise_std), 0, self.clip)
            # a_ = a_ + noise
            # q_next1 = self.critic1.target_predict(new_states, a_)
            # q_next2 = self.critic2.target_predict(new_states,a_)
            # q_nexts = np.where(q_next1<q_next2, q_next1, q_next2)
            self.memorize(states, a, rewards, True, None)
            if e < warm_up:
                continue

            states, a, rewards, _, _, _ = self.sample_batch(args.batch_size)
            # print(states.shape, a.shape, rewards.shape)

            q_ = self.bellman_q_value(rewards=rewards,
                                      q_nexts=0,
                                      dones=[True] *
                                      rewards.shape[0])  #(batch,)

            loss_names, loss_values = self.update_critic(
                states, a, q_.reshape(-1, 1))

            if e % self.policy_freq == 0 or e == warm_up:
                grad_ys, actor_output = self.update_actor(states)

            if (e + 1) % self.noise_decay_steps == 1 or e == warm_up:
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)
            if e % self.assess_interval == 0 or e == args.batchs - 1 or e == warm_up:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    # print(val_pred.shape,val_pred[:10])
                    # print(val_y.shape, val_y[:10])
                    val_rate, top_k = top_ratio_hit_rate(
                        val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate
                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(
                        test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            score = rewards.mean()
            summary_writer.add_summary(tf_summary(['mean-reward'], [score]),
                                       global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]),
                                       global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic1.model.input[0]: states,
                    self.critic1.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)

            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' %
                (grad_ys.mean(), grad_ys.std()))  # grad_ys (batch,act_dim)
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward',
                                    '%.4f+%.4f' % (score, rewards.std()))
            self.logger.dump_tabular()
            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4}'.format(self.noise_std),
                               max_val_rate='{:.4}'.format(max_val_rate),
                               val_rate='{:.4}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()

        return results
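The commented-out block in Example #4 (the q_next1 / q_next2 lines) outlines TD3's clipped double-Q target with target-policy smoothing. Below is a standalone sketch of that computation, assuming NumPy arrays and callables for the target networks; every name in it is illustrative, not taken from the example.

import numpy as np

def td3_target(rewards, next_states, dones, actor_target, critic1_target, critic2_target,
               discount=0.99, smooth_noise_std=0.1, noise_clip=0.2, a_bound=(-1.0, 1.0)):
    """Clipped double-Q target: smooth the target action, then bootstrap from min(Q1, Q2)."""
    a_next = actor_target(next_states)
    noise = np.clip(np.random.normal(0.0, smooth_noise_std, size=a_next.shape),
                    -noise_clip, noise_clip)
    a_next = np.clip(a_next + noise, a_bound[0], a_bound[1])
    q_next = np.minimum(critic1_target(next_states, a_next),
                        critic2_target(next_states, a_next)).reshape(-1)
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    dones = np.asarray(dones, dtype=bool).reshape(-1)
    return np.where(dones, rewards, rewards + discount * q_next)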
Example #5
class DDPGAgent:
    def __init__(self,
                 state_space_dim,
                 action_space_dim,
                 min_action_val,
                 max_action_val,
                 hidden_layer_size=512,
                 gamma=0.99,
                 tau=0.0001,
                 path_to_load=None):
        self.gamma = gamma
        self.tau = tau
        self.min_action_val = min_action_val
        self.max_action_val = max_action_val
        self.buffer = Buffer(state_space_dim, action_space_dim)
        self.noise_generator = GaussianNoise(0., 0.2, action_space_dim)

        self.actor = Actor(state_space_dim, action_space_dim, max_action_val,
                           hidden_layer_size)
        self.critic = Critic(state_space_dim, action_space_dim,
                             hidden_layer_size)

        if path_to_load is not None:
            if os.path.exists(path_to_load + "_actor.h5") and \
                    os.path.exists(path_to_load + "_critic.h5"):
                self.load(path_to_load)

        self.target_actor = Actor(state_space_dim, action_space_dim,
                                  max_action_val, hidden_layer_size)
        self.target_critic = Critic(state_space_dim, action_space_dim,
                                    hidden_layer_size)

        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())

        critic_lr = 0.002
        actor_lr = 0.001

        self.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
        self.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

    @tf.function
    def _apply_gradients(self, states, actions, next_states, rewards):
        with tf.GradientTape() as tape:
            target_actions = self.target_actor.forward(next_states)
            y = tf.cast(rewards,
                        tf.float32) + self.gamma * self.target_critic.forward(
                            [next_states, target_actions])
            critic_value = self.critic.forward([states, actions])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss,
                                    self.critic.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.model.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor.forward(states)
            critic_value = self.critic.forward([states, actions])
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss,
                                   self.actor.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.model.trainable_variables))

    def learn(self):
        states, actions, next_states, rewards = self.buffer.sample()
        self._apply_gradients(states, actions, next_states, rewards)

    def remember_step(self, info):
        self.buffer.remember(info)

    def update_targets(self):
        new_weights = []
        target_variables = self.target_critic.model.weights
        for i, variable in enumerate(self.critic.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] *
                               (1 - self.tau))

        self.target_critic.model.set_weights(new_weights)

        new_weights = []
        target_variables = self.target_actor.model.weights
        for i, variable in enumerate(self.actor.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] *
                               (1 - self.tau))

        self.target_actor.model.set_weights(new_weights)

    def get_best_action(self, state):
        tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        return tf.squeeze(self.actor.forward(tf_state)).numpy()

    def get_action(self, state):
        actions = self.get_best_action(
            state) + self.noise_generator.get_noise()
        return np.clip(actions, self.min_action_val, self.max_action_val)

    def save(self, path):
        print(f"Model has been saved as '{path}'")
        self.actor.save(path)
        self.critic.save(path)

    def load(self, path):
        print(f"Model has been loaded from '{path}'")
        self.actor.load(path)
        self.critic.load(path)
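A hedged usage sketch for the DDPGAgent in Example #5, assuming a Gym-style environment and that remember_step accepts a (state, action, reward, next_state) tuple; the environment id and the tuple layout are assumptions, not shown in the example.

import gym

env = gym.make("Pendulum-v1")  # illustrative continuous-control task
agent = DDPGAgent(state_space_dim=env.observation_space.shape[0],
                  action_space_dim=env.action_space.shape[0],
                  min_action_val=float(env.action_space.low[0]),
                  max_action_val=float(env.action_space.high[0]))

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)                # noisy action for exploration
        next_state, reward, done, _ = env.step(action)  # classic Gym step signature
        agent.remember_step((state, action, reward, next_state))  # assumed tuple layout
        agent.learn()                                   # one gradient step on a sampled batch
        agent.update_targets()                          # Polyak-average the target networks
        state = next_state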
Example #6
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)

        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                                  actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        # add noise to action - for exploration
        mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(
            self.actor.device)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # not enough data in replay buffer
            return

        # select random events
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state,
                                 dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        # build the TD target per sample; `done` acts as a bootstrap mask
        # (the buffer is expected to store 0 for terminal transitions)
        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = torch.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                      (1-tau)*target_critic_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                      (1-tau)*target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")

        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic,
                    fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
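The per-sample loop that builds target in Example #6 can be collapsed into one tensor expression; below is a minimal equivalent sketch, keeping the example's convention that done acts as a multiplicative bootstrap mask (the standalone function name is illustrative).

import torch

def bellman_target(reward, critic_value_next, done_mask, gamma):
    """Vectorized form of the target loop: reward + gamma * Q'(s', a') * done_mask."""
    target = reward + gamma * critic_value_next.view(-1) * done_mask.float()
    return target.view(-1, 1)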