# NOTE: the imports below are assumed for this excerpt; in the original module
# they (and a module-level `random_state`, an np.random.RandomState instance
# used by make_subset_buffer) are defined elsewhere.
import os
import sys
import time
import pickle
from copy import deepcopy

import cv2
import numpy as np
import matplotlib.pyplot as plt
from IPython import embed
from skimage.transform import resize
from skvideo.io import vwrite

# ReplayMemory lives in the repo's ../agents directory (see
# make_random_subset_buffers below, which appends it to sys.path)
from replay import ReplayMemory


def make_subset_buffer(buffer_path,
                       max_examples=100000,
                       frame_height=40,
                       frame_width=40):
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to be 80x80 of the given image
    # try to get an even number of each type of reward

    small_path = buffer_path.replace('.npz', '_%06d.npz' % max_examples)
    if os.path.exists(small_path):
        print('loading small buffer path')
        print(small_path)
        load_buffer = ReplayMemory(load_file=small_path)
    else:
        load_buffer = ReplayMemory(load_file=buffer_path)
        print('loading prescribed buffer path')
        print(buffer_path)

    # TODO if the frame size is wrong we aren't handling it here
    if load_buffer.count > max_examples:
        print('creating small buffer')
        # actions for breakout:
        # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        sbuffer = ReplayMemory(
            max_examples,
            frame_height=frame_height,
            frame_width=frame_width,
            agent_history_length=load_buffer.agent_history_length)

        # drop the first and last terminal indices to avoid the buffer edges
        ends = np.where(load_buffer.terminal_flags == 1)[0][1:-1]
        random_state.shuffle(ends)
        for tidx in ends:
            if sbuffer.count >= max_examples:
                print('stopping after %s examples' % sbuffer.count)
                break
            else:
                # start after the last terminal
                i = tidx + 1
                # while there isnt a new terminal flag
                while not load_buffer.terminal_flags[i + 1]:
                    frame = cv2.resize(load_buffer.frames[i][:, :, None],
                                       (frame_height, frame_width))
                    sbuffer.add_experience(
                        action=load_buffer.actions[i],
                        frame=frame,
                        reward=load_buffer.rewards[i],
                        terminal=load_buffer.terminal_flags[i])
                    i += 1
                    if not i % 100:
                        print(sbuffer.count)

        sbuffer.save_buffer(small_path)
        load_buffer = sbuffer
    assert load_buffer.count > 10
    return load_buffer, small_path
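

# A minimal usage sketch, assuming a ReplayMemory .npz written by
# ReplayMemory.save_buffer exists at the (hypothetical) path below.
def _demo_make_subset_buffer(full_buffer_path='breakout_training_buffer.npz'):
    # build (or load, if already cached) a 50k-frame subset resized to 40x40
    small_buffer, small_path = make_subset_buffer(full_buffer_path,
                                                  max_examples=50000,
                                                  frame_height=40,
                                                  frame_width=40)
    print('subset buffer %s holds %s frames' % (small_path, small_buffer.count))
    return small_buffer
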
class StateManager():
    def __init__(self):
        self.reward_space = [-1, 0, 1]
        self.latent_representation_function = None

    def create_new_state_instance(self, config_handler, phase):
        self.ch = config_handler
        # start save_time well in the past (presumably so the first checkpoint
        # is written without waiting)
        self.save_time = time.time() - 100000
        self.phase = phase
        self.step_number = 0
        self.end_step_number = -1
        self.episode_number = 0
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.random_state = np.random.RandomState(self.seed)
        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.episodic_reward = []
        self.episodic_reward_avg = []
        self.episodic_step_count = []
        self.episodic_step_ends = []
        self.episodic_loss = []
        self.episodic_times = []
        self.episodic_eps = []

        self.env = self.ch.create_environment(self.seed)
        self.memory_buffer = self.ch.load_memory_buffer(self.phase)
        # TODO should you load the count from the memory buffer - ?
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def setup_eps(self):
        if self.phase == 'train':
            self.eps_init = self.ch.cfg['DQN']['eps_init']
            self.eps_final = self.ch.cfg['DQN']['eps_final']
            self.eps_annealing_steps = self.ch.cfg['DQN'][
                'eps_annealing_steps']
            self.last_annealing_step = self.eps_annealing_steps + self.ch.cfg[
                'DQN']['num_pure_random_steps_train']
            if self.eps_annealing_steps > 0:
                self.slope = -(self.eps_init -
                               self.eps_final) / self.eps_annealing_steps
                self.intercept = self.eps_init - self.slope * self.ch.cfg[
                    'DQN']['num_pure_random_steps_train']
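        # worked example (hypothetical config values): with eps_init=1.0,
        # eps_final=0.01, eps_annealing_steps=1000000 and
        # num_pure_random_steps_train=50000:
        #   slope     = -(1.0 - 0.01) / 1000000 = -9.9e-7
        #   intercept = 1.0 - slope * 50000 = 1.0495
        # so eps = slope * step + intercept anneals from 1.0 at step 50000
        # down to 0.01 at last_annealing_step = 1050000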

    def load_checkpoint(self, filepath, config_handler=''):
        # load a previously saved state file
        with open(filepath, 'rb') as fh:
            fdict = pickle.load(fh)
        if config_handler != '':
            # use given config handler
            del fdict['ch']
            self.ch = config_handler

        self.__dict__.update(fdict)

        self.heads = np.arange(self.ch.cfg['DQN']['n_ensemble'])
        self.random_state = np.random.RandomState()
        self.random_state.set_state(fdict['state_random_state'])
        # TODO NOTE this does not restart at same env state
        self.seed = self.ch.cfg['RUN']['%s_seed' % self.phase]
        self.env = self.ch.create_environment(self.seed)
        buffer_path = filepath.replace('.pkl', '.npz')
        self.memory_buffer = ReplayMemory(load_file=buffer_path)
        # TODO should you load the count from the memory buffer - ?
        # TODO what about episode number - it will be off now
        self.step_number = self.memory_buffer.count
        self.setup_eps()

    def save_checkpoint(self, checkpoint_basepath):
        # checkpoint_basepath is derived from the training step number so that
        # all checkpoint artifacts share the same reference step
        self.save_time = time.time()
        self.plot_progress(checkpoint_basepath)
        # TODO save this class - except for the random state, presumably
        self.memory_buffer.save_buffer(checkpoint_basepath + '.npz')
        # the full class is too big to pickle wholesale - save specific fields only
        # preserve the random state
        self.state_random_state = self.random_state.get_state()
        save_dict = {
            'episodic_reward': self.episodic_reward,
            'episodic_reward_avg': self.episodic_reward_avg,
            'episodic_step_count': self.episodic_step_count,
            'episodic_step_ends': self.episodic_step_ends,
            'episodic_loss': self.episodic_loss,
            'episodic_times': self.episodic_times,
            'state_random_state': self.state_random_state,
            'episode_number': self.episode_number,
            'step_number': self.step_number,
            'phase': self.phase,
            'save_time': self.save_time,
            'ch': self.ch,
            'episodic_eps': self.episodic_eps,
        }
        with open(checkpoint_basepath + '.pkl', 'wb') as fh:
            pickle.dump(save_dict, fh)
        print('finished pickle in', time.time() - self.save_time)

    def end_episode(self):
        # catalog
        self.end_time = time.time()
        self.end_step_number = deepcopy(self.step_number)
        # add to lists
        self.episodic_reward.append(np.sum(self.episode_rewards))
        self.episodic_step_count.append(self.end_step_number -
                                        self.start_step_number)
        self.episodic_step_ends.append(self.end_step_number)
        self.episodic_loss.append(np.mean(self.episode_losses))
        self.episodic_times.append(self.end_time - self.start_time)
        try:
            self.episodic_eps.append(self.eps)
        except AttributeError:
            # eps/episodic_eps may not have been set yet - backfill with 1.0
            self.episodic_eps = [1.0 for x in range(len(self.episodic_times))]
        # smoothed reward over the last num_prev_steps_avg episodes
        self.episodic_reward_avg.append(np.mean(
            self.episodic_reward[-self.ch.cfg['PLOT']['num_prev_steps_avg']:]))
        num_steps = self.episodic_step_count[-1]
        print("*** %s E%05d S%010d AH%s-R%s num random/total steps:%s/%s***" %
              (self.phase, self.episode_number, self.step_number,
               self.active_head, self.episodic_reward[-1],
               self.num_random_steps, num_steps))
        self.episode_active = False
        self.episode_number += 1

    def start_episode(self):
        self.start_time = time.time()
        self.random_state.shuffle(self.heads)
        self.active_head = self.heads[0]
        self.end_step_number = -1

        self.episode_losses = []
        self.episode_actions = []
        self.episode_rewards = []
        self.start_step_number = deepcopy(self.step_number)
        self.num_random_steps = 0

        # restart counters
        self.terminal = False
        self.life_lost = True
        self.episode_reward = 0

        state = self.env.reset()
        self.prev_action = 0
        self.prev_reward = 0
        for i in range(state.shape[0] + 1):
            # add enough memories to use the memory buffer
            # not sure if this is correct
            self.memory_buffer.add_experience(
                action=0,
                # use last frame in state because it is the only nonzero one
                frame=state[-1],
                reward=0,
                terminal=0,
                end=0,
            )
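        # NOTE (assumption): this loop appears to prime the buffer with
        # agent_history_length + 1 copies of the reset frame so that
        # get_history_minibatch(indices='last') below can assemble a full
        # history window right away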

        # get correctly formatted last state
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        # get state
        self.state = batch[0][0]
        if self.state.shape != (self.ch.num_prev_steps,
                                self.memory_buffer.agent_history_length,
                                self.memory_buffer.frame_height,
                                self.memory_buffer.frame_width):
            print("start shape wrong")
            embed()
        self.episode_active = True
        return self.state

    def plot_current_episode(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        plot_dict = {
            'mean loss': self.episode_losses,
            'actions': self.episode_actions,
            'rewards': self.episode_rewards,
        }
        suptitle = 'E%s S%s-%s R%s' % (
            self.episode_number, self.start_step_number, self.end_step_number,
            self.episodic_reward[-1])
        plot_path = plot_basepath + '_ep%06d.png' % self.episode_number
        #step_range = np.arange(self.start_step_number, self.end_step_number)
        #self.plot_data(plot_path, plot_dict, suptitle, xname='episode steps', xdata=step_range)
        self.plot_data(plot_path, plot_dict, suptitle,
                       xname='episode steps')  #, xdata=step_range)
        ep_steps = self.end_step_number - self.start_step_number
        self.plot_histogram(plot_basepath +
                            '_ep_histrewards_%06d.png' % self.episode_number,
                            data=self.episode_rewards,
                            bins=self.reward_space,
                            title='rewards TR%s' % self.episode_reward)
        self.plot_histogram(
            plot_basepath + '_ep_histactions_%06d.png' % self.episode_number,
            data=self.episode_actions,
            bins=self.env.action_space,
            title='actions acthead:%s nrand:%s/%s' %
            (self.active_head, self.num_random_steps, ep_steps))

    def plot_last_episode(self):
        ep_steps = self.end_step_number - self.start_step_number
        ep_states, ep_actions, ep_rewards, ep_next_states, ep_terminals, ep_masks, indexes = self.memory_buffer.get_last_n_states(
            ep_steps)
        plot_basepath = self.get_plot_basepath() + '_episode_states_frames'
        self.plot_episode_movie(plot_basepath, ep_states, ep_actions,
                                ep_rewards, ep_next_states, ep_terminals,
                                ep_masks, indexes)

    def plot_episode_movie(self, plot_basepath, states, actions, rewards,
                           next_states, terminals, masks, indexes):
        if not os.path.exists(plot_basepath):
            os.makedirs(plot_basepath)
        n_steps = states.shape[0]
        print('plotting episode of length %s' % n_steps)
        if self.latent_representation_function is None:
            n_cols = 2
        else:
            pred_next_states, zs, latents = self.latent_representation_function(
                states, actions, rewards, self.ch)
            n_cols = 4
        latent_image_path = os.path.join(plot_basepath, 'latent_step_%05d.png')
        ep_reward = sum(rewards)
        movie_path = plot_basepath + '_movie_R%04d.mp4' % ep_reward

        print('starting to make movie', movie_path)
        # write frame by frame then use ffmpeg to generate movie
        #image_path = os.path.join(plot_basepath, 'step_%05d.png')
        #w_path = plot_basepath+'_write_movie_R%04d.sh'%ep_reward
        #a = open(w_path, 'w')
        #cmd = "ffmpeg -n -r 30 -i %s -c:v libx264 -pix_fmt yuv420p %s"%(os.path.abspath(image_path),os.path.abspath(movie_path))
        #a.write(cmd)
        #a.close()
        #w,h = states[0,3].shape
        #treward = 0
        #for step in range(min(n_steps, 100)):
        #    f, ax = plt.subplots(1, n_cols)
        #    if not step%20:
        #        print('plotting step', step)
        #    ax[0].imshow(states[step, 3], cmap=plt.cm.gray)
        #    #ax[0].set_title('OS-A%s' %(actions[step]))
        #    ax[1].imshow(next_states[step, 3], cmap=plt.cm.gray)
        #    treward+=rewards[step]
        #    if self.latent_representation_function != None:
        #        ax[2].imshow(pred_next_states[step], cmap=plt.cm.gray)
        #        z = np.hstack((zs[step,0], zs[step,1], zs[step,2]))
        #        ax[3].imshow(z)
        #    for aa in range(n_cols):
        #        ax[aa].set_xticks([])
        #        ax[aa].set_yticks([])
        #    f.suptitle('%sA%sR%sT%sD%s'%(step, actions[step], rewards[step], treward, int(terminals[step])))
        #    plt.savefig(image_path%step)
        #    plt.close()

        # generate movie directly
        max_frames = 5000
        n = min(n_steps, max_frames)
        for step in range(n):
            if self.latent_representation_function is not None:
                z = np.hstack((zs[step, 0], zs[step, 1], zs[step, 2]))
                zo = resize(z, (84, 84), cval=0, order=0)
                # TODO - is imwrite clipping zo since it is not a uint8?
                img = np.hstack((states[step, 3], next_states[step, 3],
                                 pred_next_states[step], zo))
            else:
                img = np.hstack((states[step, 3], next_states[step, 3]))

            if not step:
                movie = np.zeros((n, img.shape[0], img.shape[1]))
                if self.latent_representation_function is not None:
                    latent_movie = np.zeros((n, z.shape[0], z.shape[1]))
            movie[step] = img
            if self.latent_representation_function is not None:
                # latent frames are only available when a latent model is given
                latent_movie[step] = z
        vwrite(movie_path, movie)

    def plot_histogram(self, plot_path, data, bins, title=''):
        n, bins, _ = plt.hist(data, bins=bins)
        plt.xticks(bins, bins)
        plt.yticks(n, n)
        plt.xlim(min(bins), max(bins) + 1)
        plt.title(title)
        plt.savefig(plot_path)
        plt.close()

    def plot_progress(self, plot_basepath=''):
        if plot_basepath == '':
            plot_basepath = self.get_plot_basepath()
        det_plot_dict = {
            'episodic step count': self.episodic_step_count,
            'episodic time': self.episodic_times,
            'mean episodic loss': self.episodic_loss,
            'eps': self.episodic_eps,
        }

        suptitle = 'Details E%s S%s' % (self.episode_number,
                                        self.end_step_number)
        edet_plot_path = plot_basepath + '_details_episodes.png'
        sdet_plot_path = plot_basepath + '_details_steps.png'
        if self.end_step_number > 1:
            #exdata = np.arange(self.episode_number)
            #self.plot_data(edet_plot_path, det_plot_dict, suptitle, xname='episode', xdata=exdata)
            #self.plot_data(sdet_plot_path, det_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
            self.plot_data(edet_plot_path,
                           det_plot_dict,
                           suptitle,
                           xname='episode')  #, xdata=exdata)
            self.plot_data(sdet_plot_path,
                           det_plot_dict,
                           suptitle,
                           xname='steps',
                           xdata=self.episodic_step_ends)

            rew_plot_dict = {
                'episodic reward': self.episodic_reward,
                'smooth episodic reward': self.episodic_reward_avg,
            }

            suptitle = 'Reward E%s S%s R%s' % (self.episode_number,
                                               self.end_step_number,
                                               self.episodic_reward[-1])
            erew_plot_path = plot_basepath + '_reward_episodes.png'
            srew_plot_path = plot_basepath + '_reward_steps.png'
            #self.plot_data(erew_plot_path, rew_plot_dict, suptitle, xname='episode', xdata=np.arange(self.episode_number))
            #self.plot_data(srew_plot_path, rew_plot_dict, suptitle, xname='steps', xdata=self.episodic_step_ends)
            self.plot_data(
                erew_plot_path, rew_plot_dict, suptitle,
                xname='episode')  #, xdata=np.arange(self.episode_number))
            self.plot_data(srew_plot_path,
                           rew_plot_dict,
                           suptitle,
                           xname='steps',
                           xdata=self.episodic_step_ends)

    def plot_data(self, savepath, plot_dict, suptitle, xname, xdata=None):
        st = time.time()
        print('starting plot data')
        n = len(plot_dict.keys())
        f, ax = plt.subplots(n, 1, figsize=(6, 3 * n))
        #f,ax = plt.subplots(n,1)
        try:
            for xx, name in enumerate(sorted(plot_dict.keys())):
                if xdata is not None:
                    ax[xx].plot(xdata, plot_dict[name])
                else:
                    ax[xx].plot(plot_dict[name])
                ax[xx].set_title('%s' % (name))
                ax[xx].set_ylabel(name)
                print(name, xname, time.time() - st)
            ax[xx].set_xlabel(xname)
            f.suptitle('%s %s' % (self.phase, suptitle))
            print('end sup', time.time() - st)
            f.savefig(savepath)
            print("saved: %s" % savepath)
            plt.close()
            print('finished')
        except Exception:
            print("plot")
            embed()

    def get_plot_basepath(self):
        return self.ch.get_checkpoint_basepath(
            self.step_number) + '_%s' % self.phase

    def handle_plotting(self, plot_basepath='', force_plot=False):
        # will plot at beginning of episode
        #if not self.episode_number % self.ch.cfg['PLOT']['plot_episode_every_%s_episodes'%self.phase]:
        # dont plot first episode
        plot_basepath = self.get_plot_basepath()
        if self.episode_number:
            if force_plot:
                self.plot_current_episode(plot_basepath)
                self.plot_progress(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % self.ch.cfg[
                    'PLOT']['plot_episode_every_%s_episodes' % self.phase]:
                self.plot_current_episode(plot_basepath)
            if self.episode_number == 1 or not self.episode_number % self.ch.cfg[
                    'PLOT']['plot_every_%s_episodes' % self.phase]:
                self.plot_progress(plot_basepath)

    def step(self, action):
        next_state, reward, self.life_lost, self.terminal = self.env.step(
            action)
        self.prev_action = action
        self.prev_reward = np.sign(reward)
        # the replay buffer will convert the observed state as needed
        self.memory_buffer.add_experience(
            action=action,
            frame=next_state[-1],
            reward=self.prev_reward,
            terminal=self.life_lost,
            end=self.terminal,
        )
        self.episode_actions.append(self.prev_action)
        self.episode_rewards.append(self.prev_reward)
        self.step_number += 1
        batch = self.memory_buffer.get_history_minibatch(indices='last')
        # get state
        self.state = batch[0][0]
        #self.state = self.memory_buffer.get_last_state()
        if self.state.shape[1] == 0:
            print('handler state chan 0')
            embed()

    def set_eps(self):
        # TODO function to find eps - for now use a simple annealed schedule
        if self.step_number <= self.ch.cfg['DQN']['num_pure_random_steps_%s' %
                                                  self.phase]:
            self.eps = 1.0
        elif self.phase == 'train':
            self.eps = self.eps_final
            if self.step_number < self.last_annealing_step:
                self.eps = self.slope * self.step_number + self.intercept
        else:
            self.eps = self.ch.cfg['EVAL']['eps_eval']

    def random_action(self):
        self.num_random_steps += 1
        # pass action_idx to env.action_space
        return self.random_state.choice(range(self.env.num_actions))

    def is_random_action(self):
        self.set_eps()
        return self.random_state.rand() < self.eps
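

# A rough sketch of how StateManager can be driven for one episode, assuming a
# config handler built elsewhere in the repo (its construction is not shown in
# this excerpt) and using random actions in place of a learned policy.
def _demo_run_random_episode(config_handler):
    sm = StateManager()
    sm.create_new_state_instance(config_handler, phase='train')
    sm.start_episode()
    while not sm.terminal:
        # a real agent would query its Q-ensemble (active_head) whenever
        # sm.is_random_action() returns False
        action = sm.random_action()
        sm.step(action)
        sm.episode_losses.append(0.0)  # placeholder - no learner in this sketch
    sm.end_episode()
    sm.handle_plotting()
    return sm
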
def make_random_subset_buffers(dataset_path,
                               buffer_path,
                               train_max_examples=100000,
                               kernel_size=(2, 2),
                               trim_before=0,
                               trim_after=0):
    sys.path.append('../agents')
    from replay import ReplayMemory
    # keep max_examples < 100000 to enable knn search
    # states [top of image:bottom of image,:]
    # in breakout - can safely reduce size to be 40x40 of the given image
    # try to get an even number of each type of reward

    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    buffer_name = os.path.split(buffer_path)[1]
    buffers = {}
    paths = {}
    for phase in ['valid', 'train']:
        if phase == 'valid':
            max_examples = int(0.15 * train_max_examples)
        else:
            max_examples = train_max_examples
        small_name = buffer_name.replace(
            '.npz', '_random_subset_%06d_%sx%stb%sta%s_%s.npz' %
            (max_examples, kernel_size[0], kernel_size[1], trim_before,
             trim_after, phase))
        small_path = os.path.join(dataset_path, small_name)
        paths[phase] = small_path
        if os.path.exists(small_path):
            print('loading small buffer path')
            print(small_path)
            sbuffer = ReplayMemory(load_file=small_path)
            sbuffer.init_unique()
            buffers[phase] = sbuffer

    # if we don't have both train and valid - make completely new train/valid sets
    if len(buffers) != 2:
        print('creating new train/valid buffers')
        load_buffer = ReplayMemory(load_file=buffer_path)
        orig_states = []
        small_states = []
        for index in range(10, 400):
            if load_buffer.is_valid_index(index):
                s, _ = load_buffer._get_state(index)
                orig_states.append(s[-1])
                small_states.append(
                    load_buffer.online_shrink_frame_size(
                        s[-1], trim_before, kernel_size, trim_after))
        bdir = small_path.replace('.npz', '')
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        image_path = os.path.join(bdir, 'step_%03d.png')
        movie_path = os.path.join(bdir, 'movie.mp4')
        for index in range(len(orig_states)):
            f, ax = plt.subplots(1, 2)
            ax[0].matshow(orig_states[index])
            ax[1].matshow(small_states[index])
            plt.savefig(image_path % index)
            plt.close()
        cmd = "ffmpeg -n -r 10 -i %s -c:v libx264 -pix_fmt yuv420p %s" % (
            os.path.abspath(image_path), os.path.abspath(movie_path))
        print(cmd)
        os.system(cmd)

        if max(list(kernel_size) + [trim_before, trim_after]) > 1:
            load_buffer.shrink_frame_size(kernel_size=kernel_size,
                                          reduction_function=np.max,
                                          trim_before=trim_before,
                                          trim_after=trim_after)

        #for r in range(states.shape[0]):
        #    imwrite('mp%s.png'%r, states[r,-1])

        load_buffer.reset_unique()
        # history_length + 1 for every random example
        frame_multiplier = (load_buffer.agent_history_length + 1)
        #frame_multiplier = 2
        total_frames_needed = int((max_examples * 1.15) * frame_multiplier) + 1
        # not sure why we weren't allowing overlapping frames
        #total_frames_needed = int((max_examples*1.15))
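        # e.g. (assuming agent_history_length = 4 and the default
        # train_max_examples = 100000): frame_multiplier = 5 and
        # total_frames_needed = int(115000 * 5) + 1 = 575001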
        if load_buffer.count < total_frames_needed % load_buffer.size:
            raise ValueError(
                'load buffer is not large enough (%s) to collect number of examples (%s)'
                % (load_buffer.count, total_frames_needed))
        print('loading prescribed buffer path.... this may take a while')
        print(buffer_path)
        for phase in ['valid', 'train']:
            if phase == 'valid':
                max_examples = int(0.15 * train_max_examples)
            else:
                max_examples = train_max_examples
            print('creating small %s buffer with %s examples' %
                  (phase, max_examples))
            # actions for breakout:
            # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
            frames_needed = max_examples * frame_multiplier
            sbuffer = ReplayMemory(
                frames_needed,
                frame_height=load_buffer.frame_height,
                frame_width=load_buffer.frame_width,
                agent_history_length=load_buffer.agent_history_length)

            num_examples = 0
            while num_examples < max_examples:
                batch = load_buffer.get_unique_minibatch(1)
                states, actions, rewards, next_states, real_terminal_flags, _, unique_indices, index_indices = batch
                bs, num_hist, h, w = states.shape
                # action is the action that was used to get from state to next state
                #  actions:           t-3, t-2, t-1,  t
                #  states:       s-4, s-3, s-2, s-1
                #  next_states:       s-3, s-2, s-1, s

                past_indices = np.arange(unique_indices - (num_hist),
                                         unique_indices + 1)
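                # e.g. with agent_history_length = 4 and sampled unique index t
                # (batch size 1 here), past_indices is [t-4, t-3, t-2, t-1, t]
                # and all_states below stacks to shape (bs, 5, h, w) with the
                # oldest frame first and the observed frame s last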
                for batch_idx in range(bs):
                    # get t-4 thru t=0
                    # size is bs,5,h,w
                    all_states = np.hstack((states[:, 0:1], next_states))
                    for ss in range(num_hist + 1):
                        # only use batch size 1 in minibatch
                        # frame is "next state" in replay buffer
                        frame = all_states[batch_idx, ss]
                        action = load_buffer.actions[past_indices[ss]]
                        reward = load_buffer.rewards[past_indices[ss]]
                        if ss == num_hist:
                            # this is the observed state and the only one we will
                            # use a true action/reward for
                            #action = actions[batch_idx]
                            #reward = rewards[batch_idx]
                            terminal_flag = True
                            end_flag = True
                            num_examples += 1
                            if not num_examples % 5000:
                                print('added %s examples to %s buffer' %
                                      (num_examples, phase))
                        else:
                            # use this to debug and assert that all actions/rewards
                            # in sampled minibatch of sbuffer are < 99
                            terminal_flag = False
                            end_flag = False

                        sbuffer.add_experience(action, frame, reward,
                                               terminal_flag, end_flag)
            sbuffer.rewards = sbuffer.rewards.astype(np.int32)
            sbuffer.init_unique()
            sbuffer.save_buffer(paths[phase])
            buffers[phase] = sbuffer
    return buffers, paths
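

# A minimal usage sketch (hypothetical paths; the other arguments mirror the
# defaults above). Builds matched train/valid subset buffers and writes them
# to dataset_path.
if __name__ == '__main__':
    subset_buffers, subset_paths = make_random_subset_buffers(
        dataset_path='../dataset',
        buffer_path='breakout_training_buffer.npz',
        train_max_examples=100000,
        kernel_size=(2, 2),
        trim_before=0,
        trim_after=0)
    for phase_name in ['train', 'valid']:
        print(phase_name, subset_paths[phase_name],
              subset_buffers[phase_name].count)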