Example 1
import os
import time

import gym
import numpy as np
import tensorflow as tf

# Project-specific helpers assumed to be importable alongside this snippet:
# Policy_network, Value_network, ReplayMemory, OrnsteinUhlenbeckActionNoise.


class Ddpg_Agent():
    def __init__(self, params):

        self.env = gym.make('CartPole-v0')
        self.params = params
        self.graph = tf.Graph()

        with self.graph.as_default():

            self.main_actor = Policy_network(params, "primary")
            tvars = tf.trainable_variables()
            tact_start_index = int(len(tvars))

            self.target_actor = Policy_network(params, "target")
            tvars = tf.trainable_variables()
            mcri_start_index = int(len(tvars))

            self.main_critic = Value_network(params, "primary")
            tvars = tf.trainable_variables()
            tcri_start_index = int(len(tvars))

            self.target_critic = Value_network(params, "target")

            self.tvars = tf.trainable_variables()

            self.main_actor_tvars = self.tvars[:tact_start_index]
            self.target_actor_tvars = self.tvars[
                tact_start_index:mcri_start_index]
            self.main_critic_tvars = self.tvars[
                mcri_start_index:tcri_start_index]
            self.target_critic_tvars = self.tvars[tcri_start_index:]

            self.main_actor.backprop(self.main_actor_tvars)

            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()

        if not os.path.exists(self.params.logdir):
            os.mkdir(self.params.logdir)

        self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)
        self.running_reward = None
        self.reward_sum = 0
        self.global_step = 0

        self.actor_targetOps = self.update_TargetGraph(self.main_actor_tvars,
                                                       self.target_actor_tvars,
                                                       self.params.tau)
        self.critic_targetOps = self.update_TargetGraph(
            self.main_critic_tvars, self.target_critic_tvars, self.params.tau)

    def update_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Builds the ops that softly copy main-network variables into the target network.
        Args:
            main_tfVar - trainable variables of the main network
            target_tfVar - trainable variables of the target network
            tau - update rate (a small tau gives slow target updates)
        Return:
            op_holder - list of tf.assign() ops, run later by update_Target()'''

        assert len(main_tfVar) == len(target_tfVar)
        total_vars = len(main_tfVar)
        op_holder = []

        # soft (Polyak) update: blend each main-network variable into its target counterpart
        for idx, var in enumerate(main_tfVar[0:total_vars]):
            # assigning tau*new_value+(1-tau)*old_values
            op_holder.append(target_tfVar[idx].assign((var.value() * tau) + (
                (1 - tau) * target_tfVar[idx].value())))

        return op_holder

    def update_Target(self, op_holder, sess):
        '''Runs the assign ops built by update_TargetGraph.'''

        for op in op_holder:
            sess.run(op)

    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # initialize global variables
            print('Initialize variables...')
            sess.run(self.init)

    def train(self):

        with tf.Session(graph=self.graph) as sess:

            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes

            # Obtain an initial observation of the environment
            state = self.env.reset()
            state_input = state.reshape([1, self.params.input_dim])

            for episode_number in range(self.params.total_episodes):

                done = False
                score = 0

                while not done:

                    if self.global_step > self.params.preTrainStep:

                        # Value network update
                        trainBatch = self.myBuffer.sample(
                            self.params.batch_size)

                        batch_state = np.array(trainBatch[0]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape(
                            [self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])

                        # 1 for non-terminal transitions, 0 for terminal ones (drops the bootstrap term)
                        end_multiplier = -(batch_done - 1)

                        target_action = sess.run(self.target_actor.det_prob,
                                                 feed_dict={
                                                     self.target_actor.input_x:
                                                     batch_next_state
                                                 })
                        target_action = np.array([[1, 0] if i == 0 else [0, 1]
                                                  for i in target_action])
                        targetQ_all = sess.run(self.target_critic.Qout,
                                               feed_dict={
                                                   self.target_critic.input_x:
                                                   batch_next_state,
                                                   self.target_critic.actions:
                                                   target_action
                                               })
                        nextQ = np.sum(np.multiply(targetQ_all, target_action),
                                       axis=-1)
                        targetQ = batch_rewards + (self.params.gamma * nextQ *
                                                   end_multiplier)

                        pred_actions = sess.run(
                            self.main_actor.det_prob,
                            feed_dict={self.main_actor.input_x: batch_state})
                        pred_actions = np.array([[1, 0] if i == 0 else [0, 1]
                                                 for i in pred_actions])

                        # Update the network with our target values.
                        sess.run(self.main_critic.update_value_model,
                                 feed_dict={
                                     self.main_critic.input_x: batch_state,
                                     self.main_critic.target_Q: targetQ,
                                     self.main_critic.actions: batch_actions
                                 })
                        self.update_Target(self.critic_targetOps, sess)

                        gradients = sess.run(self.main_critic.action_grads,
                                             feed_dict={
                                                 self.main_critic.input_x:
                                                 batch_state,
                                                 self.main_critic.actions:
                                                 pred_actions
                                             })

                        gradients = np.array(gradients).reshape(
                            self.params.batch_size, self.params.num_actions)
                        sess.run(self.main_actor.optimize,
                                 feed_dict={
                                     self.main_actor.input_x: batch_state,
                                     self.main_actor.action_gradient: gradients
                                 })

                        self.update_Target(self.actor_targetOps, sess)

                    # per-step buffers for the transition about to be stored
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []

                    # exploration noise (note: a fresh OU process is instantiated every step here)
                    actor_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(self.params.num_actions))

                    action = sess.run(self.main_actor.logits,
                                      feed_dict={
                                          self.main_actor.input_x: state_input
                                      }) + actor_noise()
                    action = np.argmax(action)

                    # step the environment and get new measurements
                    next_state, reward, done, _ = self.env.step(action)

                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    # penalize early termination; otherwise keep the raw reward
                    reward_buffer.append(
                        reward if not done or score == 299 else -100)
                    #reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)

                    # move to next state
                    state_input = next_state

                    # add up reward
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer,
                                         reward_buffer, next_state_buffer,
                                         done_buffer)

                if episode_number % self.params.update_freq == 0:
                    self.running_reward = self.reward_sum if self.running_reward is None else self.running_reward * 0.99 + self.reward_sum * 0.01
                    print(
                        'Current Episode {} Average reward for episode {:.2f}.  Total average reward {:.2f}.'
                        .format(episode_number,
                                self.reward_sum // self.params.update_freq,
                                self.running_reward //
                                self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)

                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                self.global_step += 1
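
A minimal driver sketch for the agent above. Only the attribute names come from what Ddpg_Agent actually reads; the SimpleNamespace container and every value below are assumptions for illustration, not part of the original project (the network classes may read further fields of their own).

from types import SimpleNamespace

# Hypothetical hyper-parameter container; values are placeholders.
params = SimpleNamespace(
    logdir='./logs',          # checkpoint / log directory
    load_model=False,         # start from freshly initialized variables
    input_dim=4,              # CartPole-v0 observation size
    num_actions=2,            # CartPole-v0 action count
    total_episodes=1000,
    preTrainStep=1000,        # steps of pure experience collection before updates
    batch_size=32,
    max_buffer_size=50000,
    gamma=0.99,
    tau=0.001,                # soft-update rate for the target networks
    update_freq=10)           # episodes between progress printouts

agent = Ddpg_Agent(params)
agent.train()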
Example 2
class Worker():
    def __init__(self, params, num, global_episodes, tvars, global_network):

        self.params = params
        self.name = "worker_" + str(num)
        self.number = num
        self.model_path = self.params.logdir
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" +
                                                    str(self.number))
        self.global_network = global_network

        # Create the local copy of the network and the TensorFlow op that copies the global parameters into it
        self.local_AC = AC_network(params, num, tvars, name=self.name)
        self.update_local_ops = self.update_target_graph(
            tvars, self.local_AC.local_vars)

        # Environment setup (the Doom-specific setup from the original A3C example is not used here)
        self.actions = None

        # load CartPole
        self.env = gym.make('CartPole-v0')
        self.myBuffer = ReplayMemory(max_size=self.params.max_ep_length)

    def train(self, sess):
        trainBatch = self.myBuffer.sample(self.total_steps)
        batch_state = np.array(trainBatch[0]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_actions = np.array(trainBatch[1]).reshape(
            [self.total_steps, self.params.num_actions])
        batch_rewards = np.array(trainBatch[2])
        batch_next_state = np.array(trainBatch[3]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_done = np.array(trainBatch[4])

        end_multiplier = -(batch_done - 1)

        # Here we take the rewards and values from the buffer, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        #self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        #discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        #self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        #advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        #advantages = discount(advantages,gamma)

        # per-sample max-Q estimates; without axis=-1, np.max would collapse the whole batch to a single scalar
        next_Q = np.max(
            sess.run(self.local_AC.Qout,
                     feed_dict={self.local_AC.input_x: batch_next_state}),
            axis=-1)
        state_value = np.max(
            sess.run(self.local_AC.Qout,
                     feed_dict={self.local_AC.input_x: batch_state}),
            axis=-1)

        batch_target_Q = batch_rewards + (self.params.gamma * next_Q *
                                          end_multiplier)
        batch_advantages = batch_target_Q - state_value

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {
            self.local_AC.input_x: batch_state,
            self.local_AC.target_Q: batch_target_Q,
            self.local_AC.actions: batch_actions,
            self.local_AC.advantages:
            batch_advantages.reshape(self.total_steps, 1)
        }

        v_l, p_l, e_l, _ = sess.run([
            self.local_AC.value_loss, self.local_AC.policy_loss,
            self.local_AC.entropy, self.local_AC.apply_grads
        ],
                                    feed_dict=feed_dict)

        #return v_l/self.total_steps , p_l/self.total_steps , e_l/self.total_steps

    def work(self, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        self.total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = []
                episode_step_count = []
                score = 0
                d = False
                state_input = self.env.reset()
                state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []

                while not d:

                    state_input = state_input.reshape(
                        [1, self.params.input_dim])
                    # Run the policy network and get an action to take.
                    curr_policy = sess.run(
                        self.local_AC.probability,
                        feed_dict={self.local_AC.input_x: state_input})

                    # get the action from predicted policy
                    action = np.random.choice(np.arange(len(curr_policy)),
                                              p=curr_policy)

                    # step the environment and get new measurements
                    next_state, reward, d, _ = self.env.step(action)

                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(
                        reward if not d or score == 399 else -200)
                    # reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(d)
                    score += reward
                    self.total_steps += 1

                    state_input = next_state

                self.myBuffer.append(state_buffer, action_buffer,
                                     reward_buffer, next_state_buffer,
                                     done_buffer)

                #state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                episode_reward.append(score)
                #print(score)

                episode_step_count.append(self.total_steps)

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the episode buffer at the end of the episode.
                if self.myBuffer is not None:
                    #v_l,p_l,e_l = self.train(sess)
                    self.train(sess)
                    #     #print(v_l, p_l, e_l)
                    self.update_Target(self.update_local_ops, sess)
                    #print(myBuffer._memory)
                    self.myBuffer.reset()
                    self.total_steps = 0

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 10 == 0 and episode_count != 0:
                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        saver.save(
                            sess, self.model_path + '/model-' +
                            str(episode_count) + '.cptk')
                        print("Saved Model")

                    if self.name == "worker_0":

                        curr_reward = 0

                        for i in range(5):
                            test_done = False
                            state = self.env.reset()
                            while not test_done:
                                state = state.reshape(1, self.params.input_dim)
                                curr_policy = sess.run(
                                    self.global_network.probability,
                                    feed_dict={
                                        self.global_network.input_x: state
                                    })

                                # get the action from predicted policy
                                action = np.random.choice(np.arange(
                                    len(curr_policy)),
                                                          p=curr_policy)

                                # step the environment and get new measurements
                                next_state, reward, test_done, _ = self.env.step(
                                    action)
                                curr_reward += 1
                                state = next_state

                        print("Episode: {}, Current global reward: {:.1f}".
                              format(episode_count, curr_reward / 5))
                        time.sleep(0.5)

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1

                if episode_count > self.params.total_episodes and self.name == "worker_0":
                    coord.request_stop()

    def update_target_graph(self, from_vars, to_vars):
        op_holder = []
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    def update_Target(self, op_holder, sess):
        '''run operation defined in updateTargetGraph function'''
        for op in op_holder:
            sess.run(op)
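
The Worker above follows the usual A3C layout: one shared global network plus per-thread workers coordinated through tf.train.Coordinator. The launcher below is only a sketch of that pattern; the AC_network constructor call, the num_workers value and the params object (the same kind of namespace as in the sketch after Example 1, plus max_ep_length, which the Worker's replay memory uses) are assumptions, not code from the original project.

import threading

import tensorflow as tf

num_workers = 4  # assumption: number of parallel worker threads

with tf.Graph().as_default():
    global_episodes = tf.Variable(0, dtype=tf.int32, trainable=False,
                                  name='global_episodes')
    # shared network; the (params, num, tvars, name=...) signature is assumed
    # from how Worker builds its local copy
    global_network = AC_network(params, 'global', None, name='global')
    global_vars = tf.trainable_variables()

    workers = [Worker(params, i, global_episodes, global_vars, global_network)
               for i in range(num_workers)]
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()

        threads = []
        for worker in workers:
            t = threading.Thread(target=worker.work, args=(sess, coord, saver))
            t.start()
            threads.append(t)
        coord.join(threads)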
Example 3
class AC_Agent():

    def __init__(self, params):

        self.env = gym.make('CartPole-v0')
        #self.env = gym.make('Pong-v0')

        self.params = params
        self.graph = tf.Graph()

        with self.graph.as_default():

            self.actor = Policy_network(params)

            self.main_critic = Value_network(params, "primary")
            self.target_critic = Value_network(params, "target")

            self.init = tf.global_variables_initializer()

            if not os.path.exists(self.params.logdir):
                os.mkdir(self.params.logdir)

            self.saver = tf.train.Saver()
            self.tvars = tf.trainable_variables()
            # the actor, main critic and target critic are assumed to contribute equal
            # numbers of trainable variables, so the list is split into thirds
            main_start_index = int(len(self.tvars)/3)
            target_start_index = int(2*len(self.tvars)/3)
            self.actor_tvars = self.tvars[:main_start_index]
            self.main_critic_tvars = self.tvars[main_start_index:target_start_index]
            self.target_critic_tvars = self.tvars[target_start_index:]
            #self.actor.backprop(tvars=None)

        self.running_reward = None
        self.reward_sum = 0
        self.episode_number = 0
        rendering = False
        self.global_step = 0

        self.critic_targetOps = self.update_critic_TargetGraph(self.main_critic_tvars, self.target_critic_tvars, self.params.tau)

        self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)


    def update_critic_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Holds operation node for assigning Target values to Target network
        Args:
            tfVars - Variables for training(weights, bias...)
            Tau - rate for updating (low Tau value for slow updates)
        Return:
            op_holder - tf.assign() operation. input for updateTarget Function'''
        assert len(main_tfVar) == len(target_tfVar)
        total_vars = len(main_tfVar)
        op_holder = []

        # for latter-half part of trainable variables (= for Target network variables)
        for idx, var in enumerate(main_tfVar[0:total_vars]):
            # assigning tau*new_value+(1-tau)*old_values
            op_holder.append(target_tfVar[idx].assign(
                (var.value() * tau) + ((1 - tau) * target_tfVar[idx].value())))
        return op_holder

    def update_critic_Target(self, op_holder, sess):
        '''run operation defined in updateTargetGraph function'''
        for op in op_holder:
            sess.run(op)


    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # initialize global variables
            print('Initialize variables...')
            sess.run(self.init)

    def rendering(self, rendering):
        # start rendering once the recent average reward is high enough, then keep it on
        if self.reward_sum / self.params.update_freq >= 180 or rendering:
            self.env.render()
            rendering = True
        return rendering


    def train(self):

        with tf.Session(graph=self.graph) as sess:

            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes

            # Obtain an initial observation of the environment
            self.state = self.env.reset()
            #state_input = self.prepro(self.state)
            state_input = self.state.reshape([1, self.params.input_dim])

            for self.episode_number in range(self.params.total_episodes):

                done = False
                score = 0

                while not done:

                    if self.global_step > self.params.preTrainStep:

                        #print(self.myBuffer)

                        # Value network update
                        trainBatch = self.myBuffer.sample(self.params.batch_size)

                        #print(trainBatch)
                        batch_state = np.array(trainBatch[0]).reshape([self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape([self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape([self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])

                        end_multiplier = -(batch_done - 1)

                        targetQ_all = sess.run(self.target_critic.Qout, feed_dict={self.target_critic.input_x: batch_next_state})
                        targetQ = batch_rewards + (self.params.gamma * np.max(targetQ_all, axis=-1) * end_multiplier)

                        predictedQ_all = sess.run(self.main_critic.Qout, feed_dict={self.main_critic.input_x: batch_state})

                        # Update the network with our target values.
                        sess.run(self.main_critic.update_value_model,
                                                           feed_dict={self.main_critic.input_x : batch_state,
                                                                      self.main_critic.target_Q : targetQ,
                                                                      self.main_critic.actions : batch_actions})
                        self.update_critic_Target(self.critic_targetOps, sess)

                        # advantage = bootstrapped target minus the value estimate (axis=-1 keeps it per sample)
                        batch_advantage = batch_rewards + (self.params.gamma * np.max(targetQ_all, axis=-1) * end_multiplier) - np.max(predictedQ_all, axis=-1)
                        # Policy network update
                        batch_advantage = batch_advantage.reshape([self.params.batch_size, 1])
                        sess.run(self.actor.optimize, feed_dict={self.actor.input_x: batch_state,
                                                                 self.actor.input_y: batch_actions,
                                                                 self.actor.advantages: batch_advantage})


                    # per-step buffers for the transition about to be stored
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []

                    #print(state_input.shape)
                    #prev_state = state_input

                    # Run the policy network and get an action to take.
                    curr_policy = sess.run(self.actor.probability, feed_dict={self.actor.input_x: state_input})

                    # get the action from predicted policy
                    action = np.random.choice(np.arange(len(curr_policy)), p=curr_policy)

                    # step the environment and get new measurements
                    next_state, reward, done, _ = self.env.step(action)

                    next_state = next_state.reshape([1, self.params.input_dim])
                    #next_state = self.prepro(next_state)
                    #next_state = next_state - prev_state

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(reward if not done or score == 299 else -100)
                    #reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)

                    state_input = next_state

                    # move to next state

                    # add up reward
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer, reward_buffer, next_state_buffer, done_buffer)

                if self.episode_number % self.params.update_freq == 0:
                    self.running_reward = self.reward_sum if self.running_reward is None else self.running_reward * 0.99 + self.reward_sum * 0.01
                    print('Current Episode {} Average reward for episode {:.2f}.  Total average reward {:.2f}.'
                          .format(self.episode_number,
                                  self.reward_sum // self.params.update_freq,
                                  self.running_reward // self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)


                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                #state_input = self.prepro(self.state)
                self.global_step += 1
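
All three examples build their bootstrapped targets the same way: end_multiplier = -(batch_done - 1) turns the done flags into a 0/1 mask so the discounted next-state value is dropped on terminal transitions, and the advantage is that target minus the current value estimate. A toy NumPy illustration of the arithmetic follows; every number is made up, only the formulas mirror the code above.

import numpy as np

gamma = 0.99
batch_rewards = np.array([1.0, 1.0, -100.0])      # last transition is terminal (penalized)
batch_done = np.array([0, 0, 1])                  # 1 marks a terminal transition
next_state_value = np.array([20.0, 18.0, 15.0])   # e.g. max_a' Q_target(s', a')
state_value = np.array([19.5, 18.2, 14.0])        # current per-state estimate

# 0 where done, 1 otherwise: the bootstrap term vanishes at episode ends
end_multiplier = -(batch_done - 1)                # -> [1, 1, 0]

target_Q = batch_rewards + gamma * next_state_value * end_multiplier
# -> [20.8, 18.82, -100.0]
advantage = target_Q - state_value
# -> [1.3, 0.62, -114.0]
print(target_Q, advantage)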