Example no. 1
def train_network(config: MuZeroConfig, storage: SharedStorage, replay_buffer: ReplayBuffer):
    
    network = storage.latest_network()  # fetch the latest checkpoint so training resumes from it
    learning_rate = config.lr_init * config.lr_decay_rate**(network.training_steps()/config.lr_decay_steps)
    network.optimiser.learning_rate = learning_rate
    
    for i in range(config.training_steps+1):
        
        if i % config.checkpoint_interval == 0:
            storage.save_network(network.training_steps(), network)

        batch = replay_buffer.sample_batch(config.num_unroll_steps, config.td_steps, config.prediction_interval) 

        loss = network.update_weights(batch, config.weight_decay, config.hidden_state_dampen)

        if i % 100 == 0:
            print((i, loss))
            
    storage.save_network(network.training_steps(), network)
    
    return i
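
A quick standalone sketch of the exponential learning-rate schedule applied above; the hyperparameter values below are illustrative assumptions, not taken from the example.

def exponential_lr(step, lr_init=0.05, lr_decay_rate=0.1, lr_decay_steps=350e3):
    # decays smoothly from lr_init by a factor of lr_decay_rate every lr_decay_steps steps
    return lr_init * lr_decay_rate ** (step / lr_decay_steps)

for step in (0, 100_000, 350_000):
    print(step, exponential_lr(step))  # 0.05 at step 0, 0.005 at step 350k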

Example no. 2
class DrlAgent:
    def __init__(self,
                 sess,
                 is_train,
                 dim_state,
                 dim_action,
                 num_paths,
                 actor_learn_rate,
                 critic_learn_rate,
                 tau,
                 buffer_size,
                 mini_batch,
                 ep_begin,
                 epsilon_end,
                 gamma,
                 max_epoch,
                 seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)

        self.__replay = ReplayBuffer(buffer_size, seed)

        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(
            np.ones(dim_action))

        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        # act on the current state; during training this also stores the previous
        # transition, triggers a learning step and tracks episode boundaries
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original

        action = self.__explorer.get_act(action_original)
        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action

        if len(self.__replay) > self.__mini_batch:
            self.train()

        self.__step += 1
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)
        return action

    def train(self):
        # sample a mini-batch of transitions (s, a, r, s') from the replay buffer
        batch_state, batch_action, batch_reward, batch_state_next = self.__replay.sample_batch(
            self.__mini_batch)
        weights = [1.0] * self.__mini_batch  # uniform importance weights
        weights = np.expand_dims(weights, axis=1)
        # TD targets from the target networks: y = r + gamma * Q'(s', mu'(s'))
        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)

        batch_y = []
        batch_error = []  # absolute TD errors (useful for prioritized replay; unused here)
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)

        # critic update towards the TD targets
        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)
        # actor update along the critic's action-gradient (deterministic policy gradient)
        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)
        # soft-update the target networks
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
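
DrlAgent above assumes a ReplayBuffer with the interface ReplayBuffer(buffer_size, seed), add(state, action, reward, next_state), __len__ and sample_batch(batch_size). A minimal uniform-sampling sketch of that interface (an assumption, not the original class):

import random
from collections import deque

import numpy as np


class SimpleReplayBuffer:
    def __init__(self, buffer_size, seed=None):
        self._buffer = deque(maxlen=buffer_size)
        self._rng = random.Random(seed)

    def add(self, state, action, reward, next_state):
        self._buffer.append((state, action, reward, next_state))

    def __len__(self):
        return len(self._buffer)

    def sample_batch(self, batch_size):
        # uniform sampling without replacement
        batch = self._rng.sample(list(self._buffer), batch_size)
        states, actions, rewards, next_states = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states
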
Example no. 3
class DDPG:
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=2,
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(
            learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)

    def update_target(self):
        # Two equivalent ways to soft-update the target networks
        # Method 1:
        self.target_actor.set_weights(
            np.array(self.actor.get_weights()) * self.TAU +
            np.array(self.target_actor.get_weights()) * (1 - self.TAU))
        self.target_critic.set_weights(
            np.array(self.critic.get_weights()) * self.TAU +
            np.array(self.target_critic.get_weights()) * (1 - self.TAU))
        """
        # Method 2:
        new_weights = []
        target_variables = self.target_critic.weights
        for i, variable in enumerate(self.critic.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))

        self.target_critic.set_weights(new_weights)
        new_weights = []
        target_variables = self.target_actor.weights
        for i, variable in enumerate(self.actor.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_actor.set_weights(new_weights)
        """

    def train_step(self):
        s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample_batch(
            self.minibatch_size)
        """
        mu_prime = self.target_actor(s2_batch)  # predictions by target actor
        Q_prime = self.target_critic([s2_batch, mu_prime])  # predictions by target critic
        y = np.zeros_like(Q_prime)
        for k in range(self.minibatch_size):
            if d_batch[k]:
                y[k] = r_batch[k]
            else:
                y[k] = r_batch[k] + self.GAMMA * Q_prime[k]
        # y = r_batch + gamma * Q_prime

        checkpoint_path = "training/cp_critic.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)
        # Create a callback that saves the model's weights
        cp_callback1 = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir,
                                                          save_weights_only=True,
                                                          verbose=1)
        self.critic.train_on_batch([s_batch, a_batch], y)
        # self.critic.fit([s_batch, a_batch], y, verbose=0, steps_per_epoch=8, callbacks=[cp_callback1])

        with tf.GradientTape(persistent=True) as tape:
            a = self.actor(s_batch)
            tape.watch(a)
            theta = self.actor.trainable_variables
            q = self.critic([s_batch, a])
        dq_da = tape.gradient(q, a)
        da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da)
        self.actor_opt.apply_gradients(zip(da_dtheta, self.actor.trainable_variables))
        """

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(s2_batch)
            y = r_batch + self.GAMMA * self.target_critic(
                [s2_batch, target_actions])
            critic_value = self.critic([s_batch, a_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_opt.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(s_batch)
            q = self.critic([s_batch, actions])  # critic_value
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(q)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        self.update_target()
        return np.mean(q)

    def policy(self, s):
        # the actor's output is normalized (batch normalization inside self.actor),
        # so scale it by upper_bound before adding exploration noise
        if s.ndim == 1:
            s = s[None, :]
        action = self.actor(s) * self.upper_bound + self.ou_noise()
        action = np.clip(action, self.lower_bound, self.upper_bound)
        return action

    def train(self):
        # To store reward history of each episode
        ep_reward_list = []
        # To store average reward history of last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler(
        ) as interruption:  # to properly save even if ctrl+C is pressed
            for eps in range(self.EPISODES):
                episode_reward = 0
                q = 0.0  # last critic estimate; stays 0 until training steps begin
                s = self.env.reset()
                """
                if an env is created using the "gym.make" method, it will terminate after 200 steps
                """
                for t in range(self.MAX_TIME_STEPS):
                    # done = False
                    # while not done:
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim, )),
                                           np.reshape(a, (self.a_dim, )),
                                           r, done,
                                           np.reshape(s_, (self.s_dim, )))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(
                    eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

            self.save_weights(
                save_name=self.save_name)  # if you want to save weights
            self.plot_results(avg_reward=avg_reward_list, train=True)

    def save_weights(self, save_name='final_weights'):
        self.actor.save_weights("training/%s_actor.h5" % save_name)
        self.critic.save_weights("training/%s_critic.h5" % save_name)
        self.target_actor.save_weights("training/%s_target_actor.h5" %
                                       save_name)
        self.target_critic.save_weights("training/%s_target_critic.h5" %
                                        save_name)

        # also save the target networks in TensorFlow checkpoint format
        self.target_actor.save_weights('training/%s_actor_weights' % save_name,
                                       save_format='tf')
        self.target_critic.save_weights('training/%s_critic_weights' %
                                        save_name,
                                        save_format='tf')
        print('Training completed and network weights saved')

    # For evaluation of the policy learned
    def collect_data(self, act_net, iterations=1000):
        a_all, states_all = [], []
        obs = self.env.reset()
        for t in range(iterations):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = self.env.step(a)
            states_all.append(obs)
            a_all.append(a)
            # self.env.render()  # Uncomment this to see the actor in action (But not in python notebook)
            # if done:
            #     break
        states = np.squeeze(
            np.array(states_all))  # cos(theta), sin(theta), theta_dot
        a_all = np.squeeze(np.array(a_all))
        return states, a_all

    def plot_results(self,
                     avg_reward=None,
                     actions=None,
                     states=None,
                     train=False,
                     title=None):
        # An additional way to visualize the avg episode rewards
        if train:
            plt.figure()
            plt.plot(avg_reward)
            plt.xlabel("Episode")
            plt.ylabel("Avg. Epsiodic Reward")
            plt.show()
        else:  # work only for Pendulum-v0 environment
            fig, ax = plt.subplots(3, sharex=True)
            theta = np.arctan2(states[:, 1], states[:, 0])
            ax[0].set_ylabel('u')
            ax[0].plot(np.squeeze(actions))
            ax[1].set_ylabel(u'$\\theta$')
            ax[1].plot(theta)
            # ax[1].plot(states[:, 0])
            ax[2].set_ylabel(u'$\\omega$')
            ax[2].plot(states[:, 2])  # ang velocity
            fig.canvas.set_window_title(title)
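
The DDPG class above relies on an external OUNoise object that is called with no arguments inside policy(). A minimal Ornstein-Uhlenbeck noise sketch with that callable interface (parameter defaults are assumptions, not the original class):

import numpy as np


class SimpleOUNoise:
    def __init__(self, mean, std_dev=0.2, theta=0.15, dt=1e-2):
        self.mean = np.asarray(mean, dtype=np.float64)
        self.std_dev = std_dev
        self.theta = theta
        self.dt = dt
        self.x_prev = np.zeros_like(self.mean)

    def __call__(self):
        # Euler-Maruyama step of the Ornstein-Uhlenbeck process
        x = (self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std_dev * np.sqrt(self.dt)
             * np.random.normal(size=self.mean.shape))
        self.x_prev = x
        return x
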
Example no. 4
def train(sess, env, actor, critic):
    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    # Initialize Tensorflow variables
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):

        s = env.reset()

        episode_reward = 0
        episode_ave_max_q = 0

        noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA,
                                          MAX_STEPS_EPISODE)
        noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME)

        for j in range(MAX_STEPS_EPISODE):

            if RENDER_ENV:
                env.render()

            # Add exploratory noise according to Ornstein-Uhlenbeck process to action
            # Decay exploration exponentially from 1 to 0 in EXPLORATION_TIME steps
            if i < EXPLORATION_TIME:
                a = actor.predict(
                    np.reshape(s,
                               (1, env.observation_space.shape[0]))) + noise[j]
            else:
                a = actor.predict(
                    np.reshape(s, (1, env.observation_space.shape[0])))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, actor.state_dim),
                              np.reshape(a, actor.action_dim), r, terminal,
                              np.reshape(s2, actor.state_dim))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    # If state is terminal assign reward only
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    # Else assign reward + discounted target Q
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = \
                    critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                episode_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                a_grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, a_grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            episode_reward += r

            if terminal or j == MAX_STEPS_EPISODE - 1:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward,
                                           summary_vars[1]: episode_ave_max_q
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('Reward: %.2i' % int(episode_reward), '| Episode', i,
                      '| Qmax: %.4f' % (episode_ave_max_q / float(j)))

                break
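
build_summaries() is not shown; below is one plausible definition consistent with how summary_vars is fed above (two scalars: episode reward and average max Q), written against the TF 1.x summary API. This is a sketch, not necessarily the original helper.

import tensorflow as tf  # TF 1.x, matching the rest of the example


def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)
    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars
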
Example no. 5
class DDPGagent(object):
    def __init__(self, env):

        self.sess = tf.Session()
        K.set_session(self.sess)

        ## hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # get state dimension
        self.state_dim = env.observation_space.shape[0]
        # get action dimension
        self.action_dim = env.action_space.shape[0]
        # get action bound
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.TAU,
                           self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.sess, self.state_dim, self.action_dim,
                             self.TAU, self.CRITIC_LEARNING_RATE)

        ## initialize for later gradient calculation
        self.sess.run(
            tf.global_variables_initializer())  # note: reportedly also runs without this; kept for safety

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # save the results
        self.save_epi_reward = []

    ## Ornstein Uhlenbeck Noise
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        return x + rho * (
            mu - x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)

    ## computing TD target: y_k = r_k + gamma*Q(s_k+1, a_k+1)
    def td_target(self, rewards, q_values, dones):
        y_k = np.asarray(q_values)
        for i in range(q_values.shape[0]):  # number of batch
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.GAMMA * q_values[i]
        return y_k

    ## train the agent
    def train(self, max_episode_num):

        # initial transfer model weights to target model network
        self.actor.update_target_network()
        self.critic.update_target_network()

        for ep in range(int(max_episode_num)):
            # reset OU noise
            pre_noise = np.zeros(self.action_dim)
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()
            while not done:
                # visualize the environment
                #self.env.render()
                # pick an action: shape = (1,)
                action = self.actor.predict(state)
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                # clip continuous action to be within action_bound
                action = np.clip(action + noise, -self.action_bound,
                                 self.action_bound)
                # observe reward, new_state
                next_state, reward, done, _ = self.env.step(action)
                # scale Pendulum's reward to roughly [-1, 1], then store the transition
                train_reward = (reward + 8) / 8
                self.buffer.add_buffer(state, action, train_reward, next_state,
                                       done)

                if self.buffer.buffer_size > 1000:  # start training once the buffer holds enough transitions

                    # sample transitions from replay buffer
                    states, actions, rewards, next_states, dones = self.buffer.sample_batch(
                        self.BATCH_SIZE)
                    # predict target Q-values
                    target_qs = self.critic.target_predict(
                        [next_states,
                         self.actor.target_predict(next_states)])
                    # compute TD targets
                    y_i = self.td_target(rewards, target_qs, dones)
                    # train critic using sampled batch
                    self.critic.train_on_batch(states, actions, y_i)
                    # Q gradient wrt current policy
                    s_actions = self.actor.model.predict(
                        states)  # shape=(batch, 1),
                    # caution: NOT self.actor.predict !
                    # self.actor.model.predict(state) -> shape=(1,1)
                    # self.actor.predict(state) -> shape=(1,) -> type of gym action
                    s_grads = self.critic.dq_da(states, s_actions)
                    dq_das = np.array(s_grads).reshape((-1, self.action_dim))
                    # train actor
                    self.actor.train(states, dq_das)
                    # update both target network
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                # update current state
                pre_noise = noise
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ',
                  episode_reward)

            self.save_epi_reward.append(episode_reward)

            ## save weights every episode
            #print('Now save')
            self.actor.save_weights("./save_weights/pendulum_actor.h5")
            self.critic.save_weights("./save_weights/pendulum_critic.h5")

        np.savetxt('./save_weights/pendulum_epi_reward.txt',
                   self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot the episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
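
The per-element loop in DDPGagent.td_target above is equivalent to a single vectorized expression; a small sketch (assumes rewards, q_values and dones can be reshaped to the same shape):

import numpy as np


def td_target_vectorized(rewards, q_values, dones, gamma=0.95):
    rewards = np.asarray(rewards).reshape(q_values.shape)
    dones = np.asarray(dones, dtype=np.float32).reshape(q_values.shape)
    # bootstrap only for non-terminal transitions
    return rewards + gamma * q_values * (1.0 - dones)
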
Example no. 6
class DDPG:
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001  #TODO move these to configs
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 1e-6
        self.gamma = 0.99
        self.tau = 0.001
        self.buffer_size = 1000000
        self.batch_size = 128
        self.theta = 0.15
        self.ou = 0
        self.sigma = 0.3

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(self.low_action_bound_list)  # TODO derive this from self.env.action_space / make it an input
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Creating ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)

        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Creating CRITIC model
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = critic_.create_critic_model(
        )
        _, _, self.target_critic_model = critic_.create_critic_model()

        self.critic_grads = tf.gradients(self.critic_model.output,
                                         self.critic_action_input)

        self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
        self.noise.reset()

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'DDPG_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self, samples):
        current_states, actions, rewards, next_states, dones = samples

        predicted_actions = self.actor_model.predict(current_states)

        grads = self.sess.run(self.critic_grads,
                              feed_dict={
                                  self.critic_state_input: current_states,
                                  self.critic_action_input: predicted_actions
                              })[0]

        self.sess.run(self.optimize,
                      feed_dict={
                          self.actor_state_input: current_states,
                          self.actor_critic_grad: grads
                      })

        if self.epsilon - self.epsilon_decay > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        self.noise.reset()

    def train_critic(self, samples):
        current_states, actions, rewards, next_states, dones = samples

        target_actions = self.target_actor_model.predict(next_states)
        target_q_values = self.target_critic_model.predict(
            [next_states, target_actions])

        # TD targets: y = r + gamma * Q'(s', mu'(s')) for non-terminal transitions
        target_q = rewards + self.gamma * target_q_values * (1 - dones)

        self.critic_model.fit([current_states, actions],
                              target_q,
                              verbose=0)

    def train(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            self.train_actor(samples)
            self.train_critic(samples)

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()

        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = actor_model_weights[
                i] * self.tau + target_actor_model_weights[i] * (1.0 -
                                                                 self.tau)
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()

        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = critic_model_weights[
                i] * self.tau + target_critic_model_weights[i] * (1.0 -
                                                                  self.tau)
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION
    def act(self, current_episode, current_state):
        noise = self.epsilon * self.noise.generate()
        action = self.actor_model.predict(
            current_state
        ) * self.high_action_bound_list + noise  #TODO add linear mapping for affine space
        return np.clip(action, self.low_action_bound_list,
                       self.high_action_bound_list)
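
update_actor_target and update_critic_target above apply the same Polyak (soft) update; a generic helper covering both, assuming Keras-style get_weights/set_weights:

def soft_update(target_model, source_model, tau):
    # blend each weight array: w_target <- tau * w_source + (1 - tau) * w_target
    blended = [
        tau * sw + (1.0 - tau) * tw
        for sw, tw in zip(source_model.get_weights(), target_model.get_weights())
    ]
    target_model.set_weights(blended)
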
Example no. 7
class Agent:
    def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], tau)
        self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)

        self.buffer = ReplayBuffer(self.buffer_size)
        self.save_epi_reward = []

    def ou_noise(self, x, rho=0.15, mu=0., dt=1e-1, sigma=0.2, dim=1):
        rho = torch.FloatTensor([rho])
        mu = torch.FloatTensor([mu])
        dt = torch.FloatTensor([dt])
        return x + rho * (mu - x) * dt + torch.sqrt(dt) * torch.normal(
            0., sigma, size=(dim, ))

    def td_target(self, rewards, q_values, dones):
        y_k = torch.zeros(q_values.shape)

        for i in range(q_values.shape[0]):
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.gamma * q_values[i]
        return y_k

    def train(self, max_episode_num, save_path, save_names):
        self.actor.update_target_network()
        self.critic.update_target_network()

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)

            pre_noise = torch.zeros(self.action_dim)

            while not done:
                #env.render()
                action = self.actor.predict(state)[0]
                noise = self.ou_noise(pre_noise, dim=self.action_dim)

                action = np.array([action.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(
                    torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                train_reward = torch.FloatTensor([(reward + 8) / 8])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                train_reward = train_reward.view(1, 1)

                self.buffer.add_buffer(state, action, train_reward, next_state,
                                       done)
                if self.buffer.buffer_size > 1000:
                    states, actions, rewards, next_states, dones = self.buffer.sample_batch(
                        self.batch_size)

                    actions_ = self.actor.target_predict(next_states)
                    actions_ = actions_.view(next_states.shape[0],
                                             self.action_dim)
                    target_qs = self.critic.target_predict(
                        next_states, actions_)
                    y_i = self.td_target(rewards, target_qs, dones)
                    self.critic.train(states, actions, y_i)

                    s_actions = self.actor.predict(states)
                    policy_loss = self.critic.predict(states, s_actions)
                    self.actor.train(policy_loss)

                    self.actor.update_target_network()
                    self.critic.update_target_network()

                pre_noise = noise
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())

            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:',
                      time, 'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
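
A hypothetical usage sketch for the PyTorch Agent above; the environment and hyperparameter values are illustrative assumptions, not taken from the example.

import gym

env = gym.make('Pendulum-v0')
agent = Agent(env,
              gamma=0.99,
              batch_size=64,
              buffer_size=100_000,
              lr_rate=(1e-4, 1e-3),  # (actor_lr, critic_lr), per the constructor
              tau=0.001)
agent.train(max_episode_num=200,
            save_path='./save_weights',
            save_names=('pendulum_actor', 'pendulum_critic'))
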
Example no. 8
class TD3:
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001
        self.exploration_noise = 0.1
        self.gamma = 0.90
        self.tau = 0.01
        self.buffer_size = 10000
        self.batch_size = 128
        self.policy_noise = 0.1
        self.noise_clip = 0.05
        self.exploration_episodes = 10
        # self.policy_freq = 2

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = len(self.low_action_bound_list)  # TODO derive this from self.env.action_space / make it an input
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Creating ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)

        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Creating FIRST CRITIC model, this is the one we train/optimize against
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = critic_.create_critic_model(
        )
        # assumption: the critic's training loss is defined inside the model
        # (e.g. via add_loss on the target-Q input), so no external loss is passed
        self.critic_model.compile(optimizer=Adam(lr=critic_.learning_rate),
                                  loss=None)

        _, _, self.target_critic_model = critic_.create_critic_model()
        self.target_critic_model.compile(
            optimizer=Adam(lr=critic_.learning_rate), loss=None)

        self.critic_grads = tf.gradients(self.critic_model.output[0],
                                         self.critic_action_input)

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'TD3_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)

            current_states, actions, rewards, next_states, dones = samples

            predicted_actions = self.actor_model.predict(
                current_states
            ) * self.high_action_bound_list  #TODO create linear mapping for affine space

            grads = self.sess.run(self.critic_grads,
                                  feed_dict={
                                      self.critic_state_input: current_states,
                                      self.critic_action_input:
                                      predicted_actions
                                  })[0]

            self.sess.run(self.optimize,
                          feed_dict={
                              self.actor_state_input: current_states,
                              self.actor_critic_grad: grads
                          })

    def train_critic(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)

            current_states, actions, rewards, next_states, dones = samples

            target_actions = self.target_actor_model.predict(
                next_states) * self.high_action_bound_list

            # COMPUTING FIRST CRITIC
            # add noise to the target action for target-policy smoothing
            noise = np.random.normal(
                size=len(self.action_range_bound)) * self.policy_noise
            clipped_noise = np.clip(noise, -self.noise_clip, self.noise_clip)

            # add the clipped noise to target_actions and keep the result within the valid action range
            target_actions = np.clip((target_actions + clipped_noise),
                                     self.low_action_bound_list,
                                     self.high_action_bound_list)
            # the third input is a dummy array for the critic model's target-Q input
            # (presumably unused during prediction)
            target_q1_values, target_q2_values = self.target_critic_model.predict(
                [
                    next_states, target_actions,
                    np.random.rand(self.batch_size, 1)
                ])

            target_q_values = np.minimum(target_q1_values, target_q2_values)

            target_q = rewards + self.gamma * target_q_values * (1 - dones)

            # current_q1, current_q2 = self.critic_model.predict([current_states, actions, np.random.rand(self.batch_size, 1)])

            # target_q is fed as a model input; the loss is presumably computed inside the model
            history = self.critic_model.fit(
                [current_states, actions, target_q], verbose=0)
            # print('Loss: ', history.history['loss'])

    def train(self):
        if self.replay_buffer.size() > self.batch_size:
            # note: train_actor and train_critic each sample their own mini-batch
            self.train_actor()
            self.train_critic()

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()

        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = actor_model_weights[
                i] * self.tau + target_actor_model_weights[i] * (1.0 -
                                                                 self.tau)
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()

        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = critic_model_weights[
                i] * self.tau + target_critic_model_weights[i] * (1.0 -
                                                                  self.tau)
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION: uniform random actions during warm-up episodes, then noisy policy actions
    def act(self, current_episode, current_state):
        if current_episode < self.exploration_episodes:
            return np.random.uniform(
                self.low_action_bound_list,
                self.high_action_bound_list) * self.high_action_bound_list
        else:
            action = self.actor_model.predict(
                current_state) * self.high_action_bound_list + np.random.normal(
                    0, [
                        self.exploration_noise * hi
                        for hi in self.high_action_bound_list
                    ])
            return np.clip(action, self.low_action_bound_list,
                           self.high_action_bound_list)
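
The core of train_critic above is TD3's clipped double-Q target; a minimal standalone sketch, assuming the two target-critic outputs have already been evaluated on the smoothed target actions (the gamma default mirrors the value used in the class):

import numpy as np


def clipped_double_q_target(rewards, dones, q1, q2, gamma=0.90):
    # take the element-wise minimum of the two target critics to curb overestimation,
    # then bootstrap only for non-terminal transitions
    target_q = np.minimum(q1, q2)
    return rewards + gamma * target_q * (1 - dones)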