Example #1
class DDPG():
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)

        self.critic = CriticNetwork(self.sess, self.state_size,
                                    self.action_size, self.critic_lr, self.tau,
                                    self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

        self.noise = OUNoise(self.action_size, self.mu)

    def reset_episode(self):
        #self.actor_noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        #self.memory.add(self.last_state, action, reward, next_state, done)
        #summary_ops, summary_vars = self.build_summaries()
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim, )),
                               np.reshape(a, (self.actor.a_dim, )), r,
                               terminal, np.reshape(s2, (self.actor.s_dim, )))
        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:

            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch(
                self.minibatch_size)
            #self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)
            target_q = self.critic.predict_target(
                s2_batch, self.actor.predict_target(s2_batch))

            y_i = []
            for k in range(self.minibatch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

            # Update the critic given the targets
            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

            #self.ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()

        # Roll over last state and action
        self.last_state = s2
        '''
        self.ep_reward +=r
        
        if terminal:
            
            summary_str = self.sess.run(
            , feed_dict={summary_vars[0]: self.ep_reward, summary_vars[1]: self.ep_ave_max_q / float(j)})

            writer.add_summary(summary_str, i)
            #writer.flush()
            
            print('| Reward: {:d} |Qmax: {:.4f}'.format(int(self.ep_reward), \
                             (self.ep_ave_max_q / float(j))))
             '''

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])

        actions = self.actor.predict(states)[0]
        #actornoises = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))
        #print(actions)

        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):

        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))

        #self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)

        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()

        return summary_ops, summary_vars
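
The class above only defines the agent; a minimal driver loop could look like the sketch below. This is an assumption-laden illustration: `task` stands for a gym-style environment wrapper with a four-value step(), `num_episodes` is arbitrary, and the supporting classes (ActorNetwork, CriticNetwork, ReplayBuffer, OUNoise) are taken to be defined elsewhere.

import tensorflow as tf

# Hypothetical driver for the DDPG class in Example #1.
with tf.Session() as sess:
    agent = DDPG(task, sess)            # `task` must expose state_size, action_size, action_low, action_high
    num_episodes = 1000                 # illustrative value
    for _ in range(num_episodes):
        state = agent.reset_episode()   # resets the environment and episode statistics
        done = False
        while not done:
            action = agent.act(state)                        # noisy action from the current policy
            next_state, reward, done, _ = task.step(action)  # gym-style step assumed
            agent.step(state, action, reward, done, next_state)
            state = next_state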
Example #2
class DrlAgent:
    def __init__(self,
                 sess,
                 is_train,
                 dim_state,
                 dim_action,
                 num_paths,
                 actor_learn_rate,
                 critic_learn_rate,
                 tau,
                 buffer_size,
                 mini_batch,
                 ep_begin,
                 epsilon_end,
                 gamma,
                 max_epoch,
                 seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)

        self.__replay = ReplayBuffer(buffer_size, seed)

        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(
            np.ones(dim_action))

        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original

        action = self.__explorer.get_act(action_original)
        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action

        if len(self.__replay) > self.__mini_batch:
            self.train()

        self.__step += 1
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)
        return action

    def train(self):
        batch_state, batch_action, batch_reward, batch_state_next = self.__replay.sample_batch(
            self.__mini_batch)
        weights = [1.0] * self.__mini_batch
        weights = np.expand_dims(weights, axis=1)
        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)

        batch_y = []
        batch_error = []
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)
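            # batch_error holds per-sample absolute TD errors; presumably meant as
            # priorities for a prioritized replay buffer, it is not used further in this snippet.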

        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)
        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
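
A rough usage sketch for this agent follows; the dimensions, hyperparameters, and the environment_step() helper are assumptions for illustration, not part of the original example.

import numpy as np
import tensorflow as tf

# Hypothetical driver for the DrlAgent class in Example #2.
with tf.Session() as sess:
    agent = DrlAgent(sess, is_train=True,
                     dim_state=16, dim_action=4, num_paths=4,        # illustrative sizes
                     actor_learn_rate=1e-4, critic_learn_rate=1e-3,
                     tau=0.001, buffer_size=100000, mini_batch=32,
                     ep_begin=1.0, epsilon_end=0.05, gamma=0.99, max_epoch=1000)
    sess.run(tf.global_variables_initializer())
    agent.target_paras_init()

    state = np.zeros(16)                          # placeholder initial observation
    reward = 0.0
    for _ in range(10000):                        # illustrative number of control steps
        action = agent.predict(state, reward)     # stores the transition, trains, and returns the next action
        state, reward = environment_step(action)  # hypothetical helper that applies the action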
Example #3
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA=0.99, epsilon=1.0, min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True, render=False):
    with tf.Session() as sess:


        # configuring environment
        env = gym.make(ENV_NAME)
        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)
        # info of the environment to pass to the agent
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = np.float64(10)  # I chose this number since the continuous mountain car does not have a boundary
        # Creating agent
        ruido = OUNoise(action_dim, mu=0.4)  # this is the Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU, DEVICE)
        critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars(), DEVICE)


        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        goal = 0
        max_state = -1.
        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except:
            pass
#            print('********************************')
#            print('Failed to restore models')
#            print('********************************')


        for i in range(epochs):

            state = env.reset()
            state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= (epsilon/EXPLORE)
            epsilon = np.maximum(min_epsilon,epsilon)


            while (not done):

                if render:
                    env.render()

                #print('step', step)
                # 1. get action with actor, and add noise
                action_original = actor.predict(np.reshape(state,(1,state_dim))) # + (10. / (10. + i))* np.random.randn(1)
                action = action_original + max(epsilon,0)*ruido.noise()


                # remove comment if you want to see a step by step update
                # print(step,'a',action_original, action,'s', state[0], 'max state', max_state_episode)

                # 2. take action, see next state and reward :
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)), reward,
                                      done, np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # Calculate targets

                        # 5. Train critic Network (states,actions, R + gamma* V(s', a')):
                        # 5.1 Get critic prediction = V(s', a')
                        # the a' is obtained using the actor prediction! or in other words : a' = actor(s')
                        target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                        # 5.2 get y_t where:
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])


                        # 5.3 Train Critic!
                        predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6 Compute Critic gradient (depends on states and actions)
                        # 6.1 therefore I first need to calculate the actions the current actor would take.
                        a_outs = actor.predict(s_batch)
                        # 6.2 I calculate the gradients
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()


                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]

                ep_reward = ep_reward + reward
                step += 1

            if done:
                ruido.reset()
                if state[0] > 0.45:
                    #print('****************************************')
                    #print('got it!')
                    #print('****************************************')
                    goal += 1

            if max_state_episode > max_state:
                max_state = max_state_episode
            print('th', i + 1, 'n steps', step, 'R:', round(ep_reward, 3), 'epsilon', round(epsilon, 3), 'Efficiency', round(100. * (goal / (i + 1.)), 3))


            # print('Efficiency', 100.*((goal)/(i+1.)))


        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
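
The OUNoise object used here (and in several other examples) is not shown; below is a minimal Ornstein-Uhlenbeck noise sketch compatible with the noise()/reset() calls in this example. The theta and sigma defaults are assumptions, and the sample() alias covers Example #1, which calls that name instead.

import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck process sketch; parameter defaults are illustrative."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting, temporally correlated noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    sample = noise  # Example #1 calls .sample() instead of .noise()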
Example #4
def actor_critic(epochs=1000,
                 GAMMA=0.99,
                 load_file=False,
                 render=False,
                 temp=False,
                 verbose=False):
    with tf.Session() as sess:

        # define objects
        # The gym environment is wrapped in a class; this keeps the main very clear and allows portability to other robots in the lab.
        #robot = gym_pendulum(render, temp)
        robot = gym_mountaincar(render, temp)
        actor = ActorNetwork(sess,
                             robot.state_dim,
                             robot.action_dim,
                             ACTOR_LEARNING_RATE,
                             ACTION_BOUND,
                             device=DEVICE)
        critic = CriticNetwork(sess,
                               robot.state_dim,
                               CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars(),
                               device=DEVICE)
        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        if load_file:
            actor.recover_actor()
            critic.recover_critic()

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while (not done):
                # Choose and take action, and observe reward
                action, mu, sigma = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))

                new_action = action + 0.2 * (np.random.rand(1)[0])
                action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND)
                # print(round(action,3), round(new_action,3), round(action_noise,3),  round(mu,3), round(sigma,3))
                next_state, reward, done, step = robot.update(action_noise)

                # Train
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))
                if done:
                    td_target = reward
                    td_error = reward - V_minib  # not - V_minib[k] ?
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                #critic.train(np.reshape(state, (1, robot.state_dim)), np.reshape(td_target, (1, 1)))
                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward
                # this print is useful for debugging
                if verbose:
                    print(step, 'action', round(action, 3), 'state',
                          round(robot.state[0], 3), round(robot.state[1], 3),
                          'r', round(reward, 3))

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * ((robot.goal) / (i + 1.)), 0), '%')
            #time.sleep(1)

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
Example #5
class DDPG:
    def __init__(self, pretrain=False):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        # self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            # self.visualize_input = VISUALIZE_BUFFER
            # if self.visualize_input:
            #     self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 662
            self.width = 1
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth),
                                      dtype='float32')
            self.old_action = np.ones(2, dtype='float32')
            self.network_action = np.zeros(2, dtype='float32')
            self.noise_action = np.zeros(2, dtype='float32')
            self.action = np.zeros(2, dtype='float32')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.summary.merge_all()
            self.summary_writer = tf.summary.FileWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim,
                                              self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim,
                                                self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH,
                                            self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):

        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():

            # start_ = time.time()

            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 10.0)
            next_state_batch = np.divide(next_state_batch, 10.0)

            # Are we visualizing the first state batch for debugging?
            # If so: We have to scale up the values for grey scale before plotting
            # if self.visualize_input:
            #     state_batch_np = np.asarray(state_batch)
            #     state_batch_np = np.multiply(state_batch_np, -100.0)
            #     state_batch_np = np.add(state_batch_np, 100.0)
            #     self.viewer.set_data(state_batch_np)
            #     self.viewer.run()
            #     self.visualize_input = False

            # Calculate y for the td_error of the critic

            # start = time.time()
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(
                next_state_batch, action_batch)
            q_value_batch = self.critic_network.target_evaluate(
                next_state_batch, next_action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "forward actor and critic time is: ", elapsed

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch lets train the critic
            # start = time.time()
            self.critic_network.train(y_batch, state_batch, action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "train critic time is: ", elapsed

            # self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it
            # Then get the action gradient batch and adapt the gradient with the gradient inverting method
            # start = time.time()
            action_batch_for_gradients = self.actor_network.evaluate(
                state_batch, action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "forward action after critic training time is: ", elapsed

            q_gradient_batch = self.critic_network.get_action_gradient(
                state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(
                q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            # start = time.time()
            self.actor_network.train(q_gradient_batch, state_batch,
                                     action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "train actor time is: ", elapsed

            # done = time.time()
            # elapsed = done - start_
            # print "====== total time is: ", elapsed

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session,
                                NET_SAVE_PATH,
                                global_step=self.training_step)

            # Update time step
            self.training_step += 1

            if self.training_step % 400 == 0:
                print("iter:", self.training_step)

        # start_ = time.time()
        self.data_manager.check_for_enqueue()
        # done = time.time()
        # elapsed = done - start_
        # print "############ check enqueue time is: ", elapsed

    def get_action(self, state, old_action):

        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 10.0)

        # Get the action
        self.action = self.actor_network.get_action(state, old_action)
        self.action = self.action.reshape((2, ))

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            self.action = 0.8 * self.exploration_noise.noise()
#            if self.training_step < MAX_NOISE_STEP:
#                self.action += (MAX_NOISE_STEP - self.training_step) / \
#                    MAX_NOISE_STEP * self.exploration_noise.noise()
# if action value lies outside of action bounds, rescale the action vector
#            if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
#                self.action *= np.fabs(A0_BOUNDS[0] / self.action[0])
#            if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
#                self.action *= np.fabs(A1_BOUNDS[0] / self.action[1])

        # Live Q-value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):

        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            state.astype('float32')
            self.old_action.astype('float32')
            self.old_action.astype('float32')
            self.data_manager.store_experience_to_file(self.old_state,
                                                       self.old_action, reward,
                                                       state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state, is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):

        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = 30 * q_value[0][0] + 30
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
        # text gauge of the Q-value (same output format as Example #7's version of this method)
        print('[' + int(stroke_pos) * string + '|' + int(60 - stroke_pos) * string + ']',
              "Q:", q_value[0][0], "\tt:", self.training_step)
Example #6
class DDPG:
    """docstring for DDPG"""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.Session()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.sess.run(tf.global_variables_initializer())
        #target_param <- eval_param
        self.actor_network.update_target()
        self.critic_network.update_target()

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.sample(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions to replay start size then start training
        if self.replay_buffer.size > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
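
The update_target() calls above (and update_target_network() / update_target_paras() in the other examples) are presumably the standard DDPG soft update, target <- tau * online + (1 - tau) * target. A minimal TF1 sketch, assuming matched lists of online and target variables:

import tensorflow as tf

def make_soft_update_op(online_vars, target_vars, tau=0.001):
    # Build (once, at graph-construction time) a single op that nudges every
    # target variable toward its online counterpart by a factor tau.
    updates = [target.assign(tau * online + (1.0 - tau) * target)
               for online, target in zip(online_vars, target_vars)]
    return tf.group(*updates)

# usage sketch: soft_update_op = make_soft_update_op(actor_vars, actor_target_vars)
#               sess.run(soft_update_op)   # run after each training step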
Example #7
class DDPG:

    def __init__(self):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else:  Initialize all variables, then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):

        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():

            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 100.0)
            next_state_batch = np.divide(next_state_batch, 100.0)

            # Are we visualizing the first state batch for debugging?
            # If so: We have to scale up the values for grey scale before plotting
            if self.visualize_input:
                state_batch_np = np.asarray(state_batch)
                state_batch_np = np.multiply(state_batch_np, -100.0)
                state_batch_np = np.add(state_batch_np, 100.0)
                self.viewer.set_data(state_batch_np)
                self.viewer.run()
                self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(next_state_batch)
            q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch)

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch lets train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it
            # Then get the action gradient batch and adapt the gradient with the gradient inverting method
            action_batch_for_gradients = self.actor_network.evaluate(state_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step)

            # Update time step
            self.training_step += 1

        self.data_manager.check_for_enqueue()

    def get_action(self, state):

        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 100.0)

        # Get the action
        self.action = self.actor_network.get_action(state)

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            if self.training_step < MAX_NOISE_STEP:
                self.action += (MAX_NOISE_STEP - self.training_step) / MAX_NOISE_STEP * self.exploration_noise.noise()
            # if action value lies outside of action bounds, rescale the action vector
            if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
                self.action *= np.fabs(A0_BOUNDS[0]/self.action[0])
            if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
                self.action *= np.fabs(A1_BOUNDS[0]/self.action[1])

        # Live Q-value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):

        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state, is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):

        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = 30 * q_value[0][0] + 30
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
        print '[' + stroke_pos * string + '|' + (60-stroke_pos) * string + ']', "Q: ", q_value[0][0], \
            "\tt: ", self.training_step
Example #8
def actor_critic(epochs=1000,
                 GAMMA=0.99,
                 train_indicator=True,
                 render=False,
                 temp=False):
    with tf.Session() as sess:

        # define objects
        # The gym environment is wrapped in a class; this keeps the main very clear and allows portability to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())
        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while (not done):
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)
                # Train
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))
                if done:
                    td_target = reward
                    td_error = reward - V_minib  # not - V_minib[k] ?
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward
                # this print is useful for debugging
                #print(step,'action', action, 'state', robot.uncodedstate,'r', round(reward,3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * ((robot.goal) / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
Example #9
def actor_critic(epochs=1000,
                 GAMMA=0.99,
                 train_indicator=True,
                 render=False,
                 temp=False,
                 baseline=True):
    with tf.Session() as sess:

        # define objects
        # The gym environment is wrapped in a class; this keeps the main very clear and allows portability to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())
        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0
            total_reward = np.zeros(max_episode)
            total_state = deque()
            total_action = deque()
            k = 0
            while (not done) and k < max_episode:
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)
                # store episode information
                total_reward[k] = reward
                total_state.append(state)
                total_action.append(action)
                state = next_state
                ep_reward = ep_reward + reward
                k = k + 1

            # Train
            # get G
            for l in range(k):

                G = np.sum(total_reward[l:k + 1])
                #print(l,G) # print for debug
                state = np.reshape(total_state[l], (1, robot.state_dim))
                action = np.reshape(total_action[l], (1, 1))

                if baseline:
                    delta = G - critic.predict(state)
                    critic.train(state, delta)
                    actor.train(state, action, delta)
                else:
                    actor.train(state, action, G)

            # this print is useful for debugging
            #print(step,'action', action, 'state', robot.uncodedstate,'r', round(reward,3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * ((robot.goal) / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
Example #10
def trainer(env,
            outdir,
            epochs=100,
            MINIBATCH_SIZE=64,
            GAMMA=0.99,
            epsilon=0.01,
            min_epsilon=0.01,
            BUFFER_SIZE=10000,
            train_indicator=False,
            render=False):
    tf.reset_default_graph()
    with tf.Session(config=config) as sess:

        # configuring environment
        #env = gym.make(ENV_NAME)
        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)
        # info of the environment to pass to the agent
        state_dim = env.observation_space
        action_dim = env.action_space
        action_bound = np.float64(1)  # I chose this number since the continuous mountain car does not have a boundary
        # Creating agent

        # FOR the RNN
        #tf.contrib.rnn.core_rnn_cell.BasicLSTMCell from https://github.com/tensorflow/tensorflow/issues/8771
        #cell = tf.contrib.rnn.BasicLSTMCell(num_units=300,state_is_tuple=True, reuse = None)
        #cell_target = tf.contrib.rnn.BasicLSTMCell(num_units=300,state_is_tuple=True, reuse = None)
        ruido = OUNoise(action_dim,
                        mu=0.4)  # this is the Ornstein-Uhlenbeck Noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, outdir)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), outdir)

        #sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()

        #goal = 0
        max_state = -1.
        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception as e:
            print('********************************')
            print(e)
            print('********************************')
        #critic.recover_critic()
        #actor.recover_actor()

        for i in range(epochs):
            state = env.reset()
            #state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= epsilon / EXPLORE
            if epsilon < min_epsilon:
                epsilon = min_epsilon
            while (not done):

                if render:
                    env.render()

                #print('step', step)
                # 1. get action with actor, and add noise

                np.set_printoptions(precision=4)
                # remove comment if you want to see a step by step update
                #print(step,'a',action_original, action,'s', state[0], 'max state', max_state_episode)

                # 2. take action, see next state and reward :
                action_original = actor.predict(
                    np.reshape(state, (1, actor.s_dim)))  # + (10. / (10. + i)) * np.random.randn(1)
                action = action_original  #+ max(epsilon, 0) * ruido.noise()
                '''
                for j in range(action.shape[1]):
                    if abs(action[0,j]) > 1:
                        act=action[0,j]
                        action[0,j]=act/abs(act)
                    else:
                        continue
                '''
                action = np.reshape(action, (actor.a_dim, ))
                next_state, reward, done, info = env.step(action)
                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim, )),
                                      np.reshape(action, (actor.a_dim, )),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim, )))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                            MINIBATCH_SIZE)

                        # Calculate targets

                        # 5. Train critic Network (states,actions, R + gamma* V(s', a')):
                        # 5.1 Get critic prediction = V(s', a')
                        # the a' is obtained using the actor prediction! or in other words : a' = actor(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch), 20)

                        # 5.2 get y_t where:
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        # 5.3 Train Critic!
                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch,
                            np.reshape(y_i, (MINIBATCH_SIZE, 1)), 20)

                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6 Compute Critic gradient (depends on states and actions)
                        # 6.1 therefore I first need to calculate the actions the current actor would take.
                        a_outs = actor.predict(s_batch)
                        # 6.2 I calculate the gradients
                        grads = critic.action_gradients(s_batch, a_outs, 20)
                        c = np.array(grads)
                        #print(c.shape)
                        #print('...')
                        #print('...',c[0].shape)
                        #print('...')
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()
                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]

                ep_reward = ep_reward + reward
                step += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'Step', step, 'Reward:', ep_reward, 'Pos',
                  next_state[0], next_state[1], 'epsilon', epsilon)
            print('*************************')
            print('now we save the model')
            critic.save_critic()
            actor.save_actor()
            print('model saved successfully')
            print('*************************')
            replay_buffer.save()
            #proc = Popen(['rosclean','purge'],stdout=PIPE, stdin=PIPE, stderr=PIPE,universal_newlines=True)
            #out,err = proc.communicate(input="{}\n".format("y"))
            #print('maxmimum state reach', max_state)
            #print('the reward at the end of the episode,', reward)
            #print('Efficiency', 100.*((goal)/(i+1.)))
        '''
        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved succesfuly')
        print('*************************')
        replay_buffer.save()
        #env.close()
        '''
        sess.close()
    return 0