class DDPG:
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Hyperparameters
        self.actor_lr = 0.0001
        self.critic_lr = 0.001
        self.tau = 0.001
        self.minibatch_size = 64
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        # self.max_episode = 100
        # self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)
        self.critic = CriticNetwork(self.sess, self.state_size, self.action_size,
                                    self.critic_lr, self.tau, self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)

        # Ornstein-Uhlenbeck process for exploration noise
        self.noise = OUNoise(self.action_size, self.mu)

        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

    def reset_episode(self):
        # self.noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim,)),
                               np.reshape(a, (self.actor.a_dim,)),
                               r, terminal,
                               np.reshape(s2, (self.actor.s_dim,)))

        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = \
                self.replay_buffer.sample_batch(self.minibatch_size)
            self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)

        # Roll over last state
        self.last_state = s2

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor.predict(states)[0]
        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):
        # Critic targets: y = r + gamma * Q'(s2, mu'(s2)), or y = r at terminal states
        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))
        # self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)
        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars
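# Hypothetical usage sketch (not part of the original code above): one way the
# DDPG agent defined here could be driven from an episode loop. It assumes a
# `task.step(action)` method returning (next_state, reward, done), which is not
# shown in this file.
def run_episodes(agent, task, num_episodes):
    for _ in range(num_episodes):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)                      # policy action + OU exploration noise
            next_state, reward, done = task.step(action)   # assumed task API
            agent.step(state, action, reward, done, next_state)
            state = next_state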
class DrlAgent:
    def __init__(self, sess, is_train, dim_state, dim_action, num_paths,
                 actor_learn_rate, critic_learn_rate, tau, buffer_size,
                 mini_batch, ep_begin, epsilon_end, gamma, max_epoch, seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)
        self.__replay = ReplayBuffer(buffer_size, seed)
        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(np.ones(dim_action))

        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original

        action = self.__explorer.get_act(action_original)

        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action

        if len(self.__replay) > self.__mini_batch:
            self.train()

        self.__step += 1
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)

        return action

    def train(self):
        batch_state, batch_action, batch_reward, batch_state_next = \
            self.__replay.sample_batch(self.__mini_batch)
        weights = [1.0] * self.__mini_batch
        weights = np.expand_dims(weights, axis=1)

        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)

        batch_y = []
        batch_error = []
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)

        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)

        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)

        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA=0.99, epsilon=1.0,
            min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True,
            render=False):
    with tf.Session() as sess:
        # configuring environment
        env = gym.make(ENV_NAME)

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # info of the environment to pass to the agent
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = np.float64(10)  # chosen because MountainCarContinuous has no natural bound here

        # Creating agent
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, DEVICE)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), DEVICE)

        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception:
            pass  # failed to restore models; start from scratch

        for i in range(epochs):
            state = env.reset()
            state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1

            epsilon -= (epsilon / EXPLORE)
            epsilon = np.maximum(min_epsilon, epsilon)

            while not done:
                if render:
                    env.render()

                # 1. get action with actor, and add exploration noise
                action_original = actor.predict(np.reshape(state, (1, state_dim)))
                action = action_original + max(epsilon, 0) * ruido.noise()

                # uncomment for a step-by-step update
                # print(step, 'a', action_original, action, 's', state[0],
                #       'max state', max_state_episode)

                # 2. take action, observe next state and reward
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. Train critic network on targets R + gamma * Q'(s', a'),
                        #    where a' is obtained from the target actor: a' = actor(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch))

                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch,
                            np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Update the actor with the sampled policy gradient:
                        #    first compute the actions the current actor would take,
                        #    then the critic's gradient w.r.t. those actions.
                        a_outs = actor.predict(s_batch)
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if done:
                ruido.reset()
                if state[0] > 0.45:
                    goal += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'n steps', step, 'R:', round(ep_reward, 3),
                  'epsilon', round(epsilon, 3),
                  'Efficiency', round(100. * (goal / (i + 1.)), 3))

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
def actor_critic(epochs=1000, GAMMA=0.99, load_file=False, render=False,
                 temp=False, verbose=False):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes it portable to other robots in the lab.
        # robot = gym_pendulum(render, temp)
        robot = gym_mountaincar(render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE, ACTION_BOUND, device=DEVICE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars(), device=DEVICE)

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        if load_file:
            actor.recover_actor()
            critic.recover_critic()

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:
                # Choose and take action, and observe reward
                action, mu, sigma = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                new_action = action + 0.2 * np.random.rand(1)[0]
                action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND)
                # print(round(action, 3), round(new_action, 3),
                #       round(action_noise, 3), round(mu, 3), round(sigma, 3))

                next_state, reward, done, step = robot.update(action_noise)

                # Train: one-step TD target and TD error for the critic
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging
                if verbose:
                    print(step, 'action', round(action, 3),
                          'state', round(robot.state[0], 3),
                          round(robot.state[1], 3), 'r', round(reward, 3))

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
class DDPG:
    def __init__(self, pretrain=False):
        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        self.graph = self.session.graph

        with self.graph.as_default():
            # View the state batches
            # self.visualize_input = VISUALIZE_BUFFER
            # if self.visualize_input:
            #     self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 662
            self.width = 1
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state
            # for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth),
                                      dtype='float32')
            self.old_action = np.ones(2, dtype='float32')
            self.network_action = np.zeros(2, dtype='float32')
            self.noise_action = np.zeros(2, dtype='float32')
            self.action = np.zeros(2, dtype='float32')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.summary.merge_all()
            self.summary_writer = tf.summary.FileWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim,
                                              self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim,
                                                self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # Initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH,
                                            self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: load the full pre-trained net.
            # Else: initialize all variables, then overwrite the conv layers
            # with the pre-trained filters.
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize an Ornstein-Uhlenbeck random process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled, add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):
        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():
            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 10.0)
            next_state_batch = np.divide(next_state_batch, 10.0)

            # Are we visualizing the first state batch for debugging?
            # If so: we have to scale up the values for grey scale before plotting
            # if self.visualize_input:
            #     state_batch_np = np.asarray(state_batch)
            #     state_batch_np = np.multiply(state_batch_np, -100.0)
            #     state_batch_np = np.add(state_batch_np, 100.0)
            #     self.viewer.set_data(state_batch_np)
            #     self.viewer.run()
            #     self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(
                next_state_batch, action_batch)
            q_value_batch = self.critic_network.target_evaluate(
                next_state_batch, next_action_batch)

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch, train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it.
            # Then get the action gradient batch and adapt the gradient with the
            # gradient inverting method.
            action_batch_for_gradients = self.actor_network.evaluate(
                state_batch, action_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(
                state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(
                q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch, action_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH,
                                global_step=self.training_step)

            # Update time step
            self.training_step += 1
            if self.training_step % 400 == 0:
                print "iter: ", self.training_step

        self.data_manager.check_for_enqueue()

    def get_action(self, state, old_action):
        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 10.0)

        # Get the action
        self.action = self.actor_network.get_action(state, old_action)
        self.action = self.action.reshape((2,))

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            self.action = 0.8 * self.exploration_noise.noise()
            # if self.training_step < MAX_NOISE_STEP:
            #     self.action += (MAX_NOISE_STEP - self.training_step) / \
            #         MAX_NOISE_STEP * self.exploration_noise.noise()

            # if the action value lies outside the action bounds, rescale the action vector
            # if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
            #     self.action *= np.fabs(A0_BOUNDS[0] / self.action[0])
            # if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
            #     self.action *= np.fabs(A1_BOUNDS[0] / self.action[1])

        # Live q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):
        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            state.astype('float32')
            self.old_action.astype('float32')
            self.data_manager.store_experience_to_file(self.old_state,
                                                       self.old_action,
                                                       reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state,
            #               is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):
        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = 30 * q_value[0][0] + 30
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
class DDPG:
    """docstring for DDPG"""

    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env

        # Randomly initialize actor network and critic network
        # along with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.Session()
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.sess.run(tf.global_variables_initializer())

        # target_param <- eval_param
        self.actor_network.update_target()
        self.critic_network.update_target()

    def train(self):
        # print "train step", self.time_step
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.sample(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay start size is reached, then start training
        if self.replay_buffer.size > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
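# Hypothetical usage sketch (not part of the original code above): a minimal
# gym-style training loop for this agent. It assumes `env` follows the classic
# gym API (reset() -> state, step(action) -> (next_state, reward, done, info))
# and that the module-level constants used by DDPG are already defined.
def run_ddpg(env, num_episodes, max_steps):
    agent = DDPG(env)
    for _ in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.noise_action(state)                        # exploratory action
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)   # store transition, train once buffer is warm
            state = next_state
            if done:
                break
    return agent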
class DDPG:
    def __init__(self):
        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():
            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state
            # for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth),
                                      dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim,
                                              self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim,
                                                self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # Initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH,
                                            self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: load the full pre-trained net.
            # Else: initialize all variables, then overwrite the conv layers
            # with the pre-trained filters.
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize an Ornstein-Uhlenbeck random process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled, add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):
        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():
            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 100.0)
            next_state_batch = np.divide(next_state_batch, 100.0)

            # Are we visualizing the first state batch for debugging?
            # If so: we have to scale up the values for grey scale before plotting
            if self.visualize_input:
                state_batch_np = np.asarray(state_batch)
                state_batch_np = np.multiply(state_batch_np, -100.0)
                state_batch_np = np.add(state_batch_np, 100.0)
                self.viewer.set_data(state_batch_np)
                self.viewer.run()
                self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(next_state_batch)
            q_value_batch = self.critic_network.target_evaluate(next_state_batch,
                                                                next_action_batch)
            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch, train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it.
            # Then get the action gradient batch and adapt the gradient with the
            # gradient inverting method.
            action_batch_for_gradients = self.actor_network.evaluate(state_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(
                state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(q_gradient_batch,
                                                    action_batch_for_gradients)

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH,
                                global_step=self.training_step)

            # Update time step
            self.training_step += 1

        self.data_manager.check_for_enqueue()

    def get_action(self, state):
        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 100.0)

        # Get the action
        self.action = self.actor_network.get_action(state)

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            if self.training_step < MAX_NOISE_STEP:
                self.action += (MAX_NOISE_STEP - self.training_step) / \
                    MAX_NOISE_STEP * self.exploration_noise.noise()

            # if the action value lies outside the action bounds, rescale the action vector
            if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
                self.action *= np.fabs(A0_BOUNDS[0] / self.action[0])
            if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
                self.action *= np.fabs(A1_BOUNDS[0] / self.action[1])

        # Live q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):
        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            self.data_manager.store_experience_to_file(self.old_state,
                                                       self.old_action,
                                                       reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state,
            #               is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):
        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        # clamp the marker position to the [0, 60] character bar
        stroke_pos = int(30 * q_value[0][0] + 30)
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
        print '[' + stroke_pos * string + '|' + (60 - stroke_pos) * string + ']', \
            "Q: ", q_value[0][0], "\tt: ", self.training_step
def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False,
                 temp=False):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes it portable to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # Train: one-step TD target and TD error for the critic
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging
                # print(step, 'action', action, 'state', robot.uncodedstate,
                #       'r', round(reward, 3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False,
                 temp=False, baseline=True):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes it portable to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0
            total_reward = np.zeros(max_episode)
            total_state = deque()
            total_action = deque()
            k = 0

            while (not done) and k < max_episode:
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # store episode information
                total_reward[k] = reward
                total_state.append(state)
                total_action.append(action)

                state = next_state
                ep_reward = ep_reward + reward
                k = k + 1

            # Train: for each visited state compute the return G and,
            # when using a baseline, subtract the critic's value estimate
            for l in range(k):
                G = np.sum(total_reward[l:k + 1])
                # print(l, G)  # print for debug
                state = np.reshape(total_state[l], (1, robot.state_dim))
                action = np.reshape(total_action[l], (1, 1))
                if baseline:
                    delta = G - critic.predict(state)
                    critic.train(state, delta)
                    actor.train(state, action, delta)
                else:
                    actor.train(state, action, G)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
def trainer(env, outdir, epochs=100, MINIBATCH_SIZE=64, GAMMA=0.99,
            epsilon=0.01, min_epsilon=0.01, BUFFER_SIZE=10000,
            train_indicator=False, render=False):
    tf.reset_default_graph()
    with tf.Session(config=config) as sess:
        # configuring environment
        # env = gym.make(ENV_NAME)

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # info of the environment to pass to the agent
        state_dim = env.observation_space
        action_dim = env.action_space
        action_bound = np.float64(1)  # chosen because this action space has no natural bound

        # Creating agent
        # For the RNN variant, see tf.contrib.rnn.BasicLSTMCell and
        # https://github.com/tensorflow/tensorflow/issues/8771
        # cell = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        # cell_target = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, outdir)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), outdir)
        # sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()

        # goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception as e:
            print('********************************')
            print(e)
            print('********************************')

        for i in range(epochs):
            state = env.reset()
            # state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1

            epsilon -= epsilon / EXPLORE
            if epsilon < min_epsilon:
                epsilon = min_epsilon

            while not done:
                if render:
                    env.render()

                np.set_printoptions(precision=4)

                # 1. get action with actor (exploration noise currently disabled)
                action_original = actor.predict(
                    np.reshape(state, (1, actor.s_dim)))
                action = action_original  # + max(epsilon, 0) * ruido.noise()
                action = np.reshape(action, (actor.a_dim,))

                # 2. take action, observe next state and reward
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. Train critic network on targets R + gamma * Q'(s', a'),
                        #    where a' is obtained from the target actor: a' = actor(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch), 20)

                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch,
                            np.reshape(y_i, (MINIBATCH_SIZE, 1)), 20)
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Update the actor with the sampled policy gradient:
                        #    first compute the actions the current actor would take,
                        #    then the critic's gradient w.r.t. those actions.
                        a_outs = actor.predict(s_batch)
                        grads = critic.action_gradients(s_batch, a_outs, 20)
                        c = np.array(grads)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'Step', step, 'Reward:', ep_reward,
                  'Pos', next_state[0], next_state[1], 'epsilon', epsilon)

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
        replay_buffer.save()

        sess.close()
        return 0