Code example #1
    def test(self, render=False):
        # re-initialize game for evaluation
        episode_buffer = []
        self.game_state.reset(random_restart=False,
                              terminate_loss_of_life=False)
        observation = self._reset(testing=True)
        episode_buffer.append(self.game_state.screen_buffer)

        max_steps = self.eval_max_steps
        total_reward = 0.0
        total_steps = 0
        sub_total_reward = 0.0
        sub_steps = 0
        n_episodes = 0
        time.sleep(0.5)
        while max_steps > 0:
            self._update_state_input(observation)
            readout_t = self.net.evaluate(self.state_input)[0]
            action = get_action_index(readout_t,
                                      is_random=(random.random() <= 0.05),
                                      n_actions=self.game_state.n_actions)
            observation, reward, terminal = self.game_state.step(action,
                                                                 render=render)
            if n_episodes == 0:
                episode_buffer.append(observation)
            observation = process_frame(observation, self.resized_h,
                                        self.resized_w)
            sub_total_reward += reward
            sub_steps += 1
            max_steps -= 1
            if terminal:
                if n_episodes == 0:
                    time_per_step = 0.05
                    images = np.array(episode_buffer)
                    make_gif(images,
                             self.folder + '/frames/image{ep:010d}.gif'.format(
                                 ep=(self.t - self.observe)),
                             duration=len(images) * time_per_step,
                             true_image=True,
                             salience=False)
                    episode_buffer = []
                n_episodes += 1
                print("\tTRIAL", n_episodes, "/ REWARD", sub_total_reward,
                      "/ STEPS", sub_steps, "/ TOTAL STEPS", total_steps)
                self.game_state.reset(random_restart=True,
                                      terminate_loss_of_life=False)
                observation = self._reset(testing=True)
                total_reward += sub_total_reward
                total_steps += sub_steps
                sub_total_reward = 0.0
                sub_steps = 0
                time.sleep(0.5)
        # record (timestep, mean episode reward, mean episode steps)
        total_reward = total_reward / max(1, n_episodes)
        total_steps = total_steps / max(1, n_episodes)
        total_reward = round(total_reward, 4)
        self.rewards['eval'].append(
            ((self.t - self.observe), total_reward, total_steps))
        return total_reward, total_steps, n_episodes
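The epsilon-greedy helper get_action_index used above is not listed on this page. A minimal sketch of what it likely does, assuming it simply trades off a random action against the greedy argmax of the Q-values (the exact implementation may differ):

import random
import numpy as np

def get_action_index(readout_t, is_random=False, n_actions=None):
    # Hypothetical sketch: return a uniformly random action index when
    # is_random is set, otherwise the index of the largest Q-value.
    if is_random:
        return random.randrange(n_actions)
    return int(np.argmax(readout_t))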
Code example #2
    def get_meta_state(self, s, g):
        """Compute the 4-channel meta-state from the meta-action (goal / option).
        Parameters
        ==========
        s: the raw state
        g: the goal mask
        """
        return np.dstack([process_frame(s), g])  # stack processed state and goal
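Note that the examples call process_frame with different signatures (with and without explicit resize arguments), so each project ships its own variant. A minimal sketch of a typical implementation, assuming OpenCV grayscale conversion and resizing (details such as cropping or normalization vary per project):

import cv2
import numpy as np

def process_frame(frame, resized_h=84, resized_w=84):
    # Hypothetical sketch: convert the RGB screen to grayscale and resize it
    # to the network's input resolution.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (resized_w, resized_h),
                         interpolation=cv2.INTER_AREA)
    return resized.astype(np.uint8)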
Code example #3
    def _reset(self):
        self.state_input.fill(0)
        observation, r_0, terminal = self.game_state.step(0, render=True)
        observation = process_frame(
            observation, self.resized_h, self.resized_w)
        # Pad the replay memory with empty frames so that a full phi_length
        # history exists before real samples are added.
        for _ in range(self.phi_length - 1):
            empty_img = np.zeros(
                (self.resized_w, self.resized_h), dtype=np.uint8)
            self.D.add_sample(empty_img, 0, 0, 0)
        return observation
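_reset above pads the replay memory D with phi_length - 1 empty frames; the companion _update_state_input (not shown on this page) presumably keeps a rolling stack of the last phi_length processed frames as the network input. A hedged sketch of that idea, assuming state_input has shape (1, height, width, phi_length):

import numpy as np

def _update_state_input(self, observation):
    # Hypothetical sketch: shift the frame stack by one and write the newest
    # processed frame into the last channel.
    self.state_input = np.roll(self.state_input, shift=-1, axis=-1)
    self.state_input[..., -1] = observation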
Code example #4
    def rolloutPCL(self,
                   sess,
                   initial_state,
                   rnn_state_init,
                   max_path_length=None,
                   episode_count=1):

        # ToDo: Do not loop over "episode_count" but perform only one sess.run per step
        # Perform rollout of given environment
        if max_path_length is None:
            max_path_length = self.env.envs[0].spec.tags.get(
                'wrapper_config.TimeLimit.max_episode_steps')

        # Reset rnn_state for every iteration
        s = initial_state
        rnn_state = rnn_state_init
        path_length = np.zeros(len(self.env))

        # Sample one episode: continue until any environment reaches the step
        # limit or every environment is done
        while np.all(path_length < max_path_length) and not self.env.all_done():
            dummy_lengths = np.ones(len(self.env))
            a, v, rnn_state, _ = self.act(s, rnn_state, dummy_lengths, sess)
            # Get action for every environment
            act_ = [np.argmax(a_) for a_ in a]
            # Sample new state and reward from environment
            s2, r, terminal, info = self.env.step(act_)
            if self.preprocessing_state:
                s2 = U.process_frame(s2, self.preprocessing_config)
            # Add states, rewards, actions, values and terminal information to PCL episode batch
            self.add_to_batch(s, r, a, v, terminal)
            for i in range(len(self.env)):
                if not self.env.dones[i]:
                    path_length[i] = path_length[i] + 1
            s = s2

        episodes = []
        for i in range(len(self.env)):
            path_length_temp = int(path_length[i]) + 1
            episodes.append(
                dict(states=np.expand_dims(
                    self.episode_states_train[i][:path_length_temp], 0),
                     actions=np.expand_dims(
                         self.episode_actions_train[i][:path_length_temp], 0),
                     rewards=np.expand_dims(
                         self.episode_reward_train[i][:path_length_temp], 0),
                     values=np.expand_dims(
                         self.episode_values_train[i][:path_length_temp], 0),
                     path_length=path_length_temp))

        return episodes
Code example #5
    def step(self,m_a):
        """Take a step in this meta_environment
        This single meta_step involves possibly many steps in the environment
        Parameters
        ==========
        m_a: an action of the meta_agent, which is also a goal of this sub agent
           Current this is an input to the get_mask() function
        """



        if self.sess is None: # I cannot init before the sess exists
            self.sess = tf.get_default_session()
            self.summary_writer.add_graph(self.sess.graph)
        
            
        self.sess.run(self.agent.update_local_ops)
        episode_buffer = []
        episode_values = []
        episode_frames = []
        episode_reward = 0
        episode_step_count = 0
        d = False
        i_r = 0
        m_r = 0
        s = self.get_last_obs() # The meta-agent is responsible for resetting
        s = process_frame(s)

        self.subgoal.set_meta_action(m_a)
        s = self.subgoal.augment_obs(s)
        episode_frames.append(
            (self.subgoal.visualize(s), ['i_r  =  0', 'm_r  =  0', 'step = 0']))

        self.agent.start_trial()
        while d == False:
            # Take an action using probabilities from policy
            # network output.
            a,v = self.agent.sample_av(s, self.sess, i_r)
            s1,f,m_d = self.env.step(a)
            self.last_obs = s1.copy()
            s1 = process_frame(s1)
            s1 = self.subgoal.augment_obs(s1)

            # ARA - todo: make into an internal critic or provide an env. wrapper
            i_r, m_r_step, i_d = self.subgoal.intrinsic_reward(s, a, s1, f, m_d)
            m_r += m_r_step
            # if(self.flags['verbose']):
            #     print('i_r: ' + str(i_r))

            d = m_d or i_d or episode_step_count == self.max_ep_len - 1

            data = ['i_r  = ' + str(i_r),
                    'm_r  = ' + str(m_r_step),
                    'd    = ' + str(d),
                    'step = ' + str(episode_step_count),
                    'a    = ' + str(np.round(a,2)),
                    'm_a  = ' + str(np.round(m_a,2)),
                    'v    = ' + str(v[0,0])]

            episode_frames.append((self.subgoal.visualize(s1), data))

            episode_buffer.append([s, a, i_r, s1, d, v[0, 0]])
            episode_values.append(v[0,0])
            episode_reward += i_r
            s = s1
            episode_step_count += 1
            self.total_step_count += 1

        self.episode_count += 1
        if self.flags['verbose']:
            print('\ttotal intrinsic episode reward: ' + str(episode_reward))
            print('\tsubagent length: ' + str(episode_step_count))

        # Update the network using the experience buffer at the
        # end of the episode.
        if len(episode_buffer) != 0 and \
           self.flags['train']:
            v_l, p_l, e_l, g_n, v_n = self.agent.train(
                episode_buffer, self.sess, self.gamma, self.lam, 0.0)

            if self.episode_count % 50 == 0:
                global_ep_count = self.sess.run(self.global_episodes)

                data = {'Perf/Intrinsic Reward' : episode_reward,
                        'Perf/Length'           : episode_step_count,
                        'Perf/Value'            : np.mean(episode_values),
                        'Perf/Total Step Count' : self.total_step_count,
                        'Perf/Global Ep Count'  : global_ep_count,
                        'Losses/Value Loss'     : v_l,
                        'Losses/Policy Loss'    : p_l,
                        'Losses/Entropy'        : e_l,
                        'Losses/Grad Norm'      : g_n,
                        'Losses/Var Norm'       : v_n}

                self.agent.write_summary(data, self.episode_count)

        # ARA - todo: check if max meta-episodes is reached in meta-agent
        #       only send a done (m_d) signal if inner env. needs resetting.
        self.frames = episode_frames
        return self.last_obs, m_r, m_d 
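For context, a meta-agent treats one call to this step as a single temporally extended action. A hedged sketch of such an outer loop, where meta_env is an instance of the class above and meta_policy / n_meta_steps are hypothetical names:

# Hypothetical outer loop over goals emitted by a meta-policy.
s_meta = meta_env.get_last_obs()
for _ in range(n_meta_steps):
    m_a = meta_policy.sample_goal(s_meta)   # choose a goal / option
    s_meta, m_r, m_d = meta_env.step(m_a)   # one meta-step = many env steps
    if m_d:
        break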
Code example #6
    def evaluate(self, sess, n=0):
        episode_count = sess.run(self.global_episodes)
        s = self.env.reset()
        self.reset_agent()
        self.start_trial()

        step = 0

        s = process_frame(s)
        d = False
        r = 0
        episode_r = 0

        is_meta = hasattr(self.env, 'flags')

        if is_meta:
            self.env.flags['train'] = False
            self.env.flags['verbose'] = True

        printing = True

        frames = []

        while d == False and step < self.max_ep:
            a, v = self.sample_av(s, sess, r)

            s1, r, d = self.env.step(a)
            episode_r += r
            s = process_frame(s1)
            step += 1

            if is_meta:
                frames += self.env.get_frames()
            else:
                data = [
                    'r = ' + str(r), 'd = ' + str(d), 'v = ' + str(v),
                    'a = ' + str(a), 'step = ' + str(step),
                    'cum_r = ' + str(episode_r)
                ]
                frames.append((s1, data))

        print('episode reward: ' + str(episode_r))

        if not printing:
            return

        fig = plt.figure()
        f, d = frames[0]
        lf_sp = fig.add_subplot(121)
        l = plt.imshow(f)
        data_plot = fig.add_subplot(122)

        plt.imshow(np.ones(f.shape))
        plt.axis('off')

        FFMpegWriter = manimation.writers['ffmpeg']
        metadata = dict(title='Episode ' + str(n),
                        artist='Matplotlib',
                        comment='Movie support!')
        writer = FFMpegWriter(fps=15, metadata=metadata)

        movie_path = self.movie_path + "episode_" + str(n) + ".mp4"
        with writer.saving(fig, movie_path, 100):
            for f, data in frames:
                l.set_data(f)

                data_plot.cla()
                data_plot.axis('off')

                h = 3
                for text in data:
                    data_plot.text(1, h, text)
                    h += 8

                writer.grab_frame()
        plt.close()

        if is_meta:
            self.env.flags['train'] = True
            self.env.flags['verbose'] = False
Code example #7
    def run(self):
        # get the first state by doing nothing and preprocess the image to 80x80x4
        observation = self._reset()
        self.t, self.epsilon, self.rewards = self._load()

        # only executed at the very beginning of training and never again
        if self.t == 0 and self.train_with_demo_steps > 0:
            self.train_with_demo_memory_only()

        # set start time
        self.start_time = time.time() - self.wall_t

        print("D size: ", self.D.size)
        total_reward = 0.0
        sub_steps = 0

        while (self.t - self.observe) < self.train_max_steps:
            # Evaluation of policy
            if (self.t - self.observe) >= 0 and (
                    self.t - self.observe) % self.eval_freq == 0:
                terminal = 0
                total_reward, total_steps, n_episodes = self.test()
                self.net.add_accuracy(total_reward, total_steps, n_episodes,
                                      (self.t - self.observe))
                print("TIMESTEP", (self.t - self.observe), "/ AVE REWARD",
                      total_reward, "/ AVE TOTAL STEPS", total_steps,
                      "/ # EPISODES", n_episodes)
                # re-initialize game for training
                self.game_state.reset(random_restart=True)
                observation = self._reset()
                sub_steps = 0
                time.sleep(0.5)

            # choose an action epsilon greedily
            self._update_state_input(observation)
            readout_t = self.net.evaluate(self.state_input)[0]
            action = get_action_index(
                readout_t,
                is_random=(random.random() <= self.epsilon
                           or self.t <= self.observe),
                n_actions=self.game_state.n_actions)

            # scale down epsilon
            if self.epsilon > self.final_epsilon and self.t > self.observe:
                self.epsilon -= (self.init_epsilon -
                                 self.final_epsilon) / self.explore

            ##### HUMAN ADVICE OVERRIDE ACTION #####
            use_advice = False
            if self.use_human_advice and self.psi > self.final_epsilon:
                # After n exploration steps, decay psi
                if (self.t - self.observe) >= self.explore:
                    self.psi *= self.init_psi

                if random.random() > self.final_epsilon:
                    psi_cond = True if self.psi == self.init_psi else (
                        self.psi > random.random())
                    if psi_cond:
                        action_advice = self.human_net.evaluate(
                            self.state_input)[0]
                        action_human = np.argmax(action_advice)
                        if action_advice[action_human] >= self.confidence:
                            action = action_human
                            use_advice = True
            ##### HUMAN ADVICE OVERRIDE ACTION #####

            # Training
            # run the selected action and observe next state and reward
            next_observation, reward, terminal = self.game_state.step(
                action, random_restart=True)
            next_observation = process_frame(next_observation, self.resized_h,
                                             self.resized_w)
            terminal_ = terminal or (
                (self.t + 1 - self.observe) >= 0 and
                (self.t + 1 - self.observe) % self.eval_freq == 0)

            # store the transition in D
            self.D.add_sample(observation, action, reward,
                              (1 if terminal_ else 0))

            # only train if done observing
            if self.t > self.observe and self.t % self.update_freq == 0:
                s_j_batch, a_batch, r_batch, s_j1_batch, terminals = self.D.random_batch(
                    self.batch)
                # perform gradient step
                summary = self.net.train(s_j_batch, a_batch, r_batch,
                                         s_j1_batch, terminals)
                self.net.add_summary(summary, self.t - self.observe)

                self.rewards['train'].append(round(reward, 4))

            # update the old values
            sub_steps += 1
            self.t += 1
            observation = next_observation

            if terminal:
                observation = self._reset()
                sub_steps = 0

            # save progress every SAVE_FREQ iterations
            if (self.t - self.observe) % self.save_freq == 0:
                self.net.save(self.t)

                data = {
                    'D.width': self.D.width,
                    'D.height': self.D.height,
                    'D.max_steps': self.D.max_steps,
                    'D.phi_length': self.D.phi_length,
                    'D.num_actions': self.D.num_actions,
                    'D.actions': self.D.actions,
                    'D.rewards': self.D.rewards,
                    'D.terminal': self.D.terminal,
                    'D.bottom': self.D.bottom,
                    'D.top': self.D.top,
                    'D.size': self.D.size,
                    'epsilon': self.epsilon,
                    't': self.t
                }
                print(colored('Saving data...', 'blue'))
                pickle.dump(
                    data, open(self.folder + '/' + self.name + '-dqn.pkl',
                               'wb'), pickle.HIGHEST_PROTOCOL)
                pickle.dump(
                    self.rewards,
                    open(self.folder + '/' + self.name + '-dqn-rewards.pkl',
                         'wb'), pickle.HIGHEST_PROTOCOL)
                print(colored('Successfully saved data!', 'green'))
                print(
                    colored('Compressing and saving replay memory...', 'blue'))
                save_compressed_images(
                    self.folder + '/' + self.name + '-dqn-images.h5',
                    self.D.imgs)
                print(colored('Compressed and saved replay memory', 'green'))

                # write wall time
                self.wall_t = time.time() - self.start_time
                print('Total time: {} seconds'.format(self.wall_t))

            # print info
            state = ""
            if self.t <= self.observe:
                state = "observe"
            elif self.t > self.observe and self.t <= self.observe + self.explore:
                state = "explore"
            else:
                state = "train"

            if self.t % 1000 == 0:
                if self.use_human_advice:
                    print("T:", self.t, "/ STATE", state, "/ EPSILON",
                          round(self.epsilon, 4), "/ PSI", round(self.psi, 4),
                          "/ ADVICE", use_advice, "/ ACTION", action,
                          "/ REWARD", reward, "/ Q_MAX %e" % np.max(readout_t))
                else:
                    print("T:", self.t, "/ STATE", state, "/ EPSILON",
                          round(self.epsilon, 4), "/ ACTION", action,
                          "/ REWARD", reward, "/ Q_MAX %e" % np.max(readout_t))
Code example #8
    def run(self, minutes_limit=5, demo_type=0, model_net=None):
        imgs = []
        acts = []
        rews = []
        terms = []

        rewards = {'train': [], 'eval': []}

        # regular game
        start_time = datetime.now()
        timeout_start = time.time()
        timeout = 60 * minutes_limit
        t = 0
        terminal = False
        terminal_force = False
        is_reset = True
        total_reward = 0.0
        score1 = score2 = 0
        sub_t = 0
        sub_r = 0.
        rewards = []
        sub_steps = []
        total_episodes = 0

        # re-initialize game for evaluation
        self.game_state.reset(
            render=True,
            random_restart=True,
            terminate_loss_of_life=self.terminate_loss_of_life)
        observation = self._reset()

        while True:
            if demo_type == 1:  # RANDOM AGENT
                action = np.random.randint(self.game_state.n_actions)
            elif demo_type == 2:  # MODEL AGENT
                if sub_t % self._skip == 0:
                    self._update_state_input(observation)
                    readout_t = model_net.evaluate(self.state_input)[0]
                    action = get_action_index(
                        readout_t,
                        is_random=False,
                        n_actions=self.game_state.n_actions)
            else:  # HUMAN
                action = self.game_state.human_agent_action

            next_observation, reward, terminal = self.game_state.step(
                action, render=True, random_restart=True)
            next_observation = process_frame(next_observation, self.resized_h,
                                             self.resized_w)
            terminal = True if terminal or (
                time.time() > timeout_start + timeout) else False

            # store the transition in D
            # when using frameskip=1, should store every four steps
            if sub_t % self._skip == 0:
                self.D.add_sample(observation, action, reward, terminal)
            observation = next_observation
            sub_r += reward
            total_reward += reward

            #time.sleep(0.0166666)
            sub_t += 1
            t += 1

            # Ensure that D does not reach max memory, which mitigates
            # problems when combining different human demo files
            if (self.D.size + 3) == self.D.max_steps:
                terminal_force = True
                terminal = True

            if terminal:
                total_episodes += 1
                rewards.append(sub_r)
                sub_steps.append(sub_t)
                sub_r = 0.
                sub_t = 0
                self.game_state.reset(
                    render=True,
                    random_restart=True,
                    terminate_loss_of_life=self.terminate_loss_of_life)
                observation = self._reset()
                is_reset = True
                time.sleep(0.5)

                if terminal_force or time.time() > timeout_start + timeout:
                    break

        if demo_type == 0:  # HUMAN
            self.game_state.stop_thread = True

        print("Duration: {}".format(datetime.now() - start_time))
        print("Total # of episodes: {}".format(total_episodes))
        print("Mean steps: {} / Mean reward: {}".format(
            t / total_episodes, total_reward / total_episodes))
        print("\tsteps / episode:", sub_steps)
        print("\treward / episode:", rewards)
        print("Total Replay memory saved: {}".format(self.D.size))

        # Resize replay memory to exact memory size
        self.D.resize()
        data = {
            'D.width': self.D.width,
            'D.height': self.D.height,
            'D.max_steps': self.D.max_steps,
            'D.phi_length': self.D.phi_length,
            'D.num_actions': self.D.num_actions,
            'D.actions': self.D.actions,
            'D.rewards': self.D.rewards,
            'D.terminal': self.D.terminal,
            'D.bottom': self.D.bottom,
            'D.top': self.D.top,
            'D.size': self.D.size
        }
        images = self.D.imgs
        pkl_file = '{name}-dqn.pkl'.format(name=self.name)
        h5_file = '{name}-dqn-images.h5'.format(name=self.name)
        pickle.dump(data, open(self.folder + pkl_file, 'wb'),
                    pickle.HIGHEST_PROTOCOL)
        print(colored('Compressing and saving replay memory...', 'blue'))
        save_compressed_images(self.folder + h5_file, images)
        print(colored('Compressed and saved replay memory', 'green'))
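save_compressed_images is referenced above but not listed on this page. A minimal sketch of what such a helper could look like, assuming the replay frames are written to a gzip-compressed HDF5 dataset via h5py (the project's actual helper may differ):

import h5py

def save_compressed_images(h5_path, images):
    # Hypothetical sketch: store the raw replay frames as one compressed dataset.
    with h5py.File(h5_path, 'w') as f:
        f.create_dataset('images', data=images, compression='gzip')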
Code example #9
    def work(self, sess, coord, saver):
        gamma = self.gamma
        lam = self.lam

        t0 = time.time()
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting worker " + str(self.name))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0
                action_mag = []
                d = False
                r = 0

                s = self.env.reset()
                self.reset_agent()
                episode_frames.append((s, ['']))
                s = process_frame(s)

                self.start_trial()

                while d == False:
                    # Take an action using probabilities from policy
                    # network output.
                    a, v = self.sample_av(s, sess, r)
                    s1, r, d = self.env.step(a)

                    # if episode_count == 50:
                    #     coord.request_stop()

                    s1 = process_frame(s1)
                    if episode_step_count == self.max_ep - 1:
                        d = True

                    data = ['r = ' + str(r), 'd = ' + str(d), 'a = ' + str(a)]
                    episode_frames.append((s1, data))

                    episode_buffer.append([s, a, r, s1, d, v[0, 0]])
                    episode_values.append(v[0, 0])

                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1

                    # If the episode hasn't ended, but the experience
                    # buffer is full, then we make an update step using
                    # that experience rollout.
                    if len(episode_buffer) == self.update_ival and d != True and \
                       episode_step_count != self.max_ep - 1:
                        # Since we don't know what the true final return
                        # is, we "bootstrap" from our current value
                        # estimation.
                        v1 = sess.run(self.local_AC.value,
                                      feed_dict={
                                          self.local_AC.inputs: [s],
                                          self.local_AC.prev_actions: [a],
                                          self.local_AC.prev_rewards: [[r]],
                                          self.local_AC.is_training_ph:
                                          False,
                                          self.local_AC.state_in[0]:
                                          self.rnn_state[0],
                                          self.local_AC.state_in[1]:
                                          self.rnn_state[1]
                                      })[0, 0]
                        v_l, p_l, e_l, g_n, v_n = self.train(
                            episode_buffer, sess, gamma, lam, v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                # Update the network using the experience buffer at the
                # end of the episode.
                if len(episode_buffer) != 0:
                    v_l, p_l, e_l, g_n, v_n = self.train(
                        episode_buffer, sess, gamma, lam, 0.0)

                # Periodically save model parameters, and summary statistics.
                if episode_count % 100 == 0 and self.is_writer:
                    saver.save(
                        sess, self.model_path + '/model-' +
                        str(episode_count) + '.cptk')
                    s_dt = str(timedelta(seconds=time.time() - t0))
                    self.evaluate(sess, episode_count)
                    print("Saved Model " + str(episode_count) + '\tat time ' +
                          s_dt)

                if episode_count % 5 == 0 and episode_count != 0:

                    data = {
                        'Perf/Reward': episode_reward,
                        'Perf/Length': episode_step_count,
                        'Perf/Value': np.mean(episode_values),
                        'Losses/Value Loss': v_l,
                        'Losses/Policy Loss': p_l,
                        'Losses/Entropy': e_l,
                        'Losses/Grad Norm': g_n,
                        'Losses/Var Norm': v_n
                    }
                    self.write_summary(data, episode_count)

                if self.is_writer:
                    sess.run(self.increment)
                episode_count += 1
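The bootstrap value v1 computed mid-episode above is used to complete a truncated rollout before training. A sketch of the computation such a train method typically performs with gamma, lam and the bootstrap value, written from the standard A3C/GAE formulation (not taken from this project's train implementation):

import numpy as np

def discounted_targets(rewards, values, bootstrap, gamma, lam):
    # Append the bootstrap estimate so the truncated tail has a value target.
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.append(np.asarray(values, dtype=np.float32), bootstrap)
    deltas = rewards + gamma * values[1:] - values[:-1]
    returns = np.zeros_like(rewards)
    advantages = np.zeros_like(rewards)
    ret, gae = bootstrap, 0.0
    for t in reversed(range(len(rewards))):
        ret = rewards[t] + gamma * ret          # n-step discounted return
        gae = deltas[t] + gamma * lam * gae     # generalized advantage estimate
        returns[t] = ret
        advantages[t] = gae
    return returns, advantages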
Code example #10
def main(job, task, worker_num, ps_num, initport, ps_hosts, worker_hosts):

    PS_HOST = ps_hosts.split(",")
    WORKER_HOSTS = worker_hosts.split(",")

    INITPORT = initport
    CLUSTER = dict()
    """
    workers = []
    ps_ = []
    for i in range(ps_num):
        ps_.append('localhost:{}'.format(INITPORT + i))
    for i in range(worker_num):
        workers.append("localhost:{}".format(i + ps_num + INITPORT))

    CLUSTER['worker'] = workers
    CLUSTER['ps'] = ps_
        """
    # Infer the number of workers and ps servers

    cluster = tf.train.ClusterSpec({"ps": PS_HOST, "worker": WORKER_HOSTS})
    print({"ps": PS_HOST, "worker": WORKER_HOSTS})

    num_ps, num_workers = len(PS_HOST), len(WORKER_HOSTS)
    #  Get the Cluster Spec
    # cluster = tf.train.ClusterSpec(CLUSTER)

    # Get the current server element
    TASK_ID = task
    JOB = job
    server = tf.train.Server(cluster, job_name=JOB, task_index=TASK_ID)

    # Check if we have a worker or ps node running
    if JOB == 'ps':
        server.join()
    else:

        # Get all required Paramters

        # Running Paramters
        TOTAL_GLOBAL_EPISODES = 100000

        # Gym environment

        ENV_NAME = 'SpaceInvaders-v0'  # MsPacman CartPole
        NUM_ENVS = 3
        PREPROCESSING = True
        IMAGE_SIZE_PREPROCESSED = 80

        PREPROCESSING_CONFIG = [
            {
                "type": "image_resize",
                "width": IMAGE_SIZE_PREPROCESSED,
                "height": IMAGE_SIZE_PREPROCESSED
            }, {
                "type": "grayscale"
            }

            #     {
            #         "type": "sequence",         # TO-DO: sequence not supported
            #         "length": 2
            #     }
        ]

        # Get env parameters

        gw = GymWrapper(ENV_NAME)
        ACTION_DIM = gw.act_space.n

        if PREPROCESSING:
            STATE_DIM = IMAGE_SIZE_PREPROCESSED * IMAGE_SIZE_PREPROCESSED

            types_of_preprocess = []
            for operation in PREPROCESSING_CONFIG:
                types_of_preprocess.append(operation['type'])
                if operation['type'] == "sequence":
                    length_sequence = operation['length']

            print("Do following preprocessing steps: {0}".format(
                types_of_preprocess))

        else:
            PREPROCESSING_CONFIG = None
            STATE_DIM = gw.obs_space.shape[0]

        # Network configuration
        network_config = dict(shared=True,
                              shared_config=dict(
                                  kind=["CNN"],
                                  cnn_input_size=IMAGE_SIZE_PREPROCESSED,
                                  cnn_output_size=256,
                                  lstm_cell_units=16),
                              policy_config=dict(layers=[ACTION_DIM],
                                                 noise_dist=None),
                              value_config=dict(layers=[1], noise_dist=None))

        # Learning rate
        LEARNING_RATE = 0.01
        UPDATE_LEARNING_RATE = False
        # Discount rate for advantage estimation and reward discounting
        GAMMA = 0.99

        # Summary LOGDIR
        # LOG_DIR = '~/A3C/MyDistTest/'
        LOG_DIR = os.getcwd() + '_tensorflowlogs'
        LOG_DIR_CHECKPOINT = os.getcwd() + "_modelcheckpoints"

        # Print latest checkpoint
        checkpoint_sync = True

        # Choose RL method (A3C, PCL)
        METHOD = "A3C"
        print("Run method: " + METHOD)

        # PCL variables
        TAU = 0.2
        ROLLOUT = 5
        # Define the global network and get relevant worker_device
        worker_device = '/job:worker/task:{}/cpu:0'.format(TASK_ID)

        # replica_device_setter ensures that global variables defined in this
        # context are synced across processes
        with tf.device(
                tf.train.replica_device_setter(
                    cluster=cluster,
                    worker_device=worker_device,
                    ps_strategy=U.greedy_ps_strategy(ps_tasks=num_ps))):

            global_episodes = tf.train.get_or_create_global_step()
            master_network = AC_Network(
                STATE_DIM,
                ACTION_DIM,
                'global',
                network_config,
                learning_rate=None,
                tau=TAU,
                rollout=ROLLOUT,
                method=METHOD)  # Generate global network

            with tf.device(worker_device):
                worker = Worker(TASK_ID,
                                STATE_DIM,
                                ACTION_DIM,
                                network_config,
                                LEARNING_RATE,
                                global_episodes,
                                ENV_NAME,
                                number_envs=NUM_ENVS,
                                tau=TAU,
                                rollout=ROLLOUT,
                                method=METHOD,
                                update_learning_rate_=UPDATE_LEARNING_RATE,
                                preprocessing_config=PREPROCESSING_CONFIG)

        # Get summary information
        if worker.name == "worker_0":
            merged_summary = tf.summary.merge_all()
            writer = tf.summary.FileWriter(LOG_DIR,
                                           graph=tf.get_default_graph())
        else:
            merged_summary = None

        local_init_op = tf.global_variables_initializer()

        with tf.Session(server.target) as sess:
            sess.run(local_init_op)

        # Setup monitoring
        is_chief = (TASK_ID == 0)

        # Setup hooks required to coordinate training
        stopHook = tf.train.StopAtStepHook(num_steps=TOTAL_GLOBAL_EPISODES)
        saver = tf.train.Saver(max_to_keep=3,
                               var_list=tf.get_collection(
                                   tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope='global'))
        saverHook = tf.train.CheckpointSaverHook(
            checkpoint_dir=LOG_DIR_CHECKPOINT,
            save_steps=200,
            checkpoint_basename=worker.method,
            saver=saver)

        # Start Training
        with tf.train.MonitoredTrainingSession(master=server.target,
                                               is_chief=is_chief,
                                               chief_only_hooks=[saverHook],
                                               hooks=[stopHook]) as sess:

            # Reload global model from chief
            if is_chief:
                try:
                    saver.restore(
                        sess,
                        tf.train.latest_checkpoint(LOG_DIR_CHECKPOINT,
                                                   latest_filename=None))
                except ValueError:
                    print("No Model Checkpoint available")

            # If Checkpoint is loaded make sure all workers start after
            # the variables have been reloaded in order to avoid "bad" updates
            if checkpoint_sync:
                while sess.run(worker.global_episodes) == 0:
                    print(worker.name +
                          " Waiting for Sync of Checkpoint loaded by worker_0")
                    time.sleep(1)

            # Update from global
            sess.run(worker.update_local_ops)

            # Define input to worker.work(gamma, sess, coord, merged_summary, writer_summary)
            gamma = GAMMA

            MINI_BATCH = 40
            REWARD_FACTOR = 0.001
            EPISODE_RUNS = 1000

            episode_count = 0
            total_steps = 0
            train_steps = 0
            print("Starting worker " + str(TASK_ID))

            while not sess.should_stop():

                worker.episode_values = []
                worker.episode_reward = []

                # Objects to hold the batch used to update the Agent
                worker.reset_batch()

                # Used by PCL: hold the mean reward and value of episodes
                # sampled from the replay buffer
                episode_reward_offline = 0
                episode_value_offline = 0
                episode_step_count = 0

                # Restart environment
                s = worker.env.reset()
                if worker.preprocessing_state:
                    s = U.process_frame(s, worker.preprocessing_config)

                if worker.rnn_network:
                    # Set initial rnn state based on number of episodes
                    c_init = np.zeros(
                        (len(worker.env), worker.local_AC.cell_units),
                        np.float32)
                    h_init = np.zeros(
                        (len(worker.env), worker.local_AC.cell_units),
                        np.float32)
                    rnn_state = np.array([c_init, h_init])
                else:
                    rnn_state = None

                # sample new noisy parameters in fully connected layers if
                # noisy net is used
                # if episode_count % 15 == 0:
                if worker.noisy_policy is not None or worker.noisy_value is not None:
                    sess.run(worker.local_AC.noisy_sampling)

                if worker.method == "PCL":

                    # Perform a rollout of the chosen environment
                    episodes = worker.rolloutPCL(sess,
                                                 s,
                                                 rnn_state,
                                                 max_path_length=1000,
                                                 episode_count=len(worker.env))

                    # Add sampled episode to replay buffer
                    worker.replay_buffer.add(episodes)

                    # Get rewards and value estimates of current sample
                    _, _, r_ep, v_ep, _, _ = unpack_episode(episodes)

                    episode_values = np.mean(np.sum(v_ep, axis=1))
                    episode_reward = np.mean(np.sum(r_ep, axis=1))

                    # Train on online episode if applicable
                    train_online = False
                    train_offline = True

                    if train_online:
                        # Train PCL agent
                        _, _, summary = worker.train_pcl(
                            episodes, gamma, sess, merged_summary)

                        # Update summary information
                        train_steps = train_steps + 1
                        # if worker.name == "worker_0":
                        #    writer_summary.add_summary(summary, train_steps)

                    if train_offline:

                        # Sample len(envs) many episodes from the replay buffer
                        sampled_episodes = worker.replay_buffer.sample(
                            episode_count=len(worker.env))

                        # Train PCL agent
                        r_ep, v_ep, summary, logits = worker.train_pcl(
                            sampled_episodes, gamma, sess, merged_summary)
                        # Update global network
                        sess.run(worker.update_local_ops)

                        # Update learning rate based on calculated KL Divergence
                        if worker.update_learning_rate_:
                            # Calculate KL-Divergence of updated policy and policy before update
                            kl_divergence = worker.calculate_kl_divergence(
                                logits, sampled_episodes, sess)
                            # Perform learning rate update based on KL-Divergence
                            worker.update_learning_rate(kl_divergence, sess)

                        # Update summary information
                        train_steps = train_steps + 1
                        if worker.name == "worker_0":
                            writer.add_summary(summary, train_steps)

                        # Write add. summary information
                        episode_reward_offline = np.mean(np.sum(r_ep, axis=1))
                        episode_value_offline = np.mean(np.sum(v_ep, axis=1))

                elif worker.method == "A3C":
                    # Run an episode
                    while not worker.env.all_done():

                        # Get preferred action distribution
                        dummy_lengths = np.ones(len(worker.env))
                        a, v, rnn_state, _ = worker.act(
                            s, rnn_state, dummy_lengths, sess)

                        # Get action for every environment
                        act_ = [np.argmax(a_) for a_ in a]
                        # Sample new state and reward from environment
                        s2, r, terminal, info = worker.env.step(act_)
                        if worker.preprocessing_state:
                            s2 = U.process_frame(s2,
                                                 worker.preprocessing_config)

                        # Add states, rewards, actions, values and terminal information to A3C minibatch
                        worker.add_to_batch(s, r, a, v, terminal)

                        # Get episode information for tracking the training process
                        worker.episode_values.append(v)
                        worker.episode_reward.append(r)

                        # Train on mini batches from episode
                        if (episode_step_count % MINI_BATCH == 0
                                and episode_step_count > 0
                            ) or worker.env.all_done():

                            feed_dict_ = {
                                worker.local_AC.inputs: s2,
                                worker.local_AC.lengths_episodes: dummy_lengths
                            }

                            if worker.rnn_network:
                                feed_dict_[
                                    worker.local_AC.state_in[0]] = rnn_state[0]
                                feed_dict_[
                                    worker.local_AC.state_in[1]] = rnn_state[1]

                            v1 = sess.run([worker.local_AC.value], feed_dict_)

                            v_l, p_l, e_l, g_n, v_n, summary, logits = worker.train(
                                worker.episode_states_train,
                                worker.episode_reward_train,
                                worker.episode_actions_train,
                                worker.episode_values_train,
                                worker.episode_done_train, sess, gamma,
                                np.squeeze(v1), merged_summary)

                            if worker.env.all_done():
                                # Update global network
                                sess.run(worker.update_local_ops)

                                # Update learning rate based on calculated KL Divergence
                                if worker.update_learning_rate_:
                                    # Calculate KL-Divergence of updated policy and policy before update
                                    kl_divergence = worker.calculate_kl_divergence(
                                        logits, worker.episode_states_train,
                                        sess, worker.episode_done_train)
                                    # Perform learning rate update based on KL-Divergence
                                    if not np.isnan(kl_divergence):
                                        worker.update_learning_rate(
                                            kl_divergence, sess)

                            train_steps = train_steps + 1

                            # Update summary information
                            if worker.name == "worker_0":
                                writer.add_summary(summary, train_steps)

                            # Reset A3C minibatch after it has been used to update the model
                            worker.reset_batch()

                        # Set previous state for next step
                        s = s2
                        total_steps += 1
                        episode_step_count += 1

                    episode_values = np.mean(
                        np.sum(worker.episode_values, axis=0))
                    episode_reward = np.mean(
                        np.sum(worker.episode_reward, axis=0))

                if episode_count % 20 == 0:
                    print("Reward: " + str(episode_reward), " | Episode",
                          episode_count, " of " + worker.name,
                          " | Global Episode",
                          str(sess.run(worker.global_episodes)))
                    if worker.method == "PCL":
                        print("Reward Offline: " + str(episode_reward_offline),
                              " | Episode", episode_count,
                              " of " + worker.name)

                worker.episode_rewards.append(episode_reward)
                worker.episode_lengths.append(episode_step_count)
                worker.episode_mean_values.append(episode_values)

                sess.run(worker.increment)  # Next global episode

                episode_count += 1

        # Ask for all the services to stop.
        print("Worker stops because max episode runs are reached")
Code example #11
File: simulate_a3c.py / Project: alexansari101/deep-rl
    trainer = tf.train.AdamOptimizer(learning_rate=1e-5)
    ac_net = AC_rnn_ra_Network(s_shape, a_size, 'global_0', None)
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(model_path)
    saver.restore(sess, ckpt.model_checkpoint_path)

    rnn_state = ac_net.state_init
    i = 0
    d = False
    r = 0
    a = np.array([0, 0])
    while i < max_episode_length and d == False:
        s_p = process_frame(s)
        # Take an action using probabilities from policy network output.
        a, v, rnn_state = sess.run(
            [ac_net.sample_a, ac_net.value, ac_net.state_out],
            feed_dict={
                ac_net.inputs: [s_p],
                ac_net.prev_actions: [a],
                ac_net.prev_rewards: [[r]],
                ac_net.is_training_ph: False,
                ac_net.state_in[0]: rnn_state[0],
                ac_net.state_in[1]: rnn_state[1]
            })

        s, r, d = env_g.step(a)
        sarray.append(s)
        rarray.append(r)