Example #1
class Worker(threading.Thread):

    global_episode = 0

    global_moving_average_reward = 0
    best_score = 0
    save_lock = threading.Lock()

    def __init__(self,
                 state_size,
                 action_size,
                 global_model,
                 opt,
                 result_queue,
                 idx,
                 game_name='Tetris',
                 save_dir='/tmp'):
        super(Worker, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.result_queue = result_queue
        self.global_model = global_model
        self.opt = opt
        self.local_model = ActorCriticModel(self.state_size, self.action_size)
        self.worker_idx = idx
        self.env = gym_tetris.make('TetrisA-v0')
        self.env = JoypadSpace(self.env, MOVEMENT)
        self.save_dir = save_dir
        self.ep_loss = 0.0
        self.game_name = game_name

    def run(self):
        total_step = 1
        mem = Memory()

        while Worker.global_episode < episodios:
            self.env.reset()
            estado = [0., 0., 0., 0.]
            mem.clear()
            ep_reward = 0.
            ep_steps = 0
            self.ep_loss = 0
            informacion = self.env.get_info()
            antiguo_statistics = informacion['statistics']
            time_count = 0

            done = False
            pieza_colocada = True

            while not done:

                # If the piece has been placed, compute the target column and rotation for the next piece
                if pieza_colocada:
                    pieza_colocada = False
                    pos = 5
                    giro = 1
                    u = -1
                    ant_nom_piez = ''
                    estado = [estado]

                    logits, _ = self.local_model(
                        tf.convert_to_tensor(estado, dtype=tf.float32))

                    probs = tf.nn.softmax(logits)

                    prob = probs[0][39]
                    probs = np.delete(probs[0], 39)
                    suma = np.sum(probs)
                    probs = np.insert(probs, 39, abs(1 - suma))

                    action = np.random.choice(self.action_size, p=probs)
                    pos_objetivo = action % 10
                    giro_objetivo = (action // 10) + 1

                # Move and rotate the piece toward the computed target position
                if (giro % giro_objetivo) != 0 and not done:
                    state, reward, done, info = self.env.step(1)
                    accion = 0
                    giro = giro + 1
                elif pos > pos_objetivo and not done:
                    state, reward, done, info = self.env.step(6)
                    pos = pos - 1
                    accion = 0
                elif pos < pos_objetivo and not done:
                    state, reward, done, info = self.env.step(3)
                    pos = pos + 1
                    accion = 0
                elif not done and not pieza_colocada:
                    state, reward, done, info = self.env.step(9)
                    accion = 9
                else:
                    accion = 0
                if not done:
                    new_state, reward, done, info = self.env.step(accion)

                informacion = self.env.get_info()

                # If the piece has landed, compute the reward for the move

                if antiguo_statistics != informacion['statistics']:
                    antiguo_statistics = informacion['statistics']
                    ep_reward_new = informacion['score']
                    reward = ep_reward_new - ep_reward
                    board = self.env.board()
                    nuevo_estado = board_prop(board)[:]
                    pieza_colocada = True
                    k = 1
                    if nuevo_estado[0] > 18:
                        done = True

                    ep_reward = ep_reward_new

                    mem.store(estado[0], action, reward)

                    # Compute the local gradient using the loss from the current game and
                    # our local model

                    if time_count == 10 or done:

                        with tf.GradientTape() as tape:
                            total_loss = self.compute_loss(
                                done, nuevo_estado, mem, 0.99)
                        self.ep_loss += total_loss

                        grads = tape.gradient(
                            total_loss, self.local_model.trainable_weights)

                        self.opt.apply_gradients(
                            zip(grads, self.global_model.trainable_weights))

                        self.local_model.set_weights(
                            self.global_model.get_weights())

                        mem.clear()
                        time_count = 0

                        if done:
                            Worker.global_moving_average_reward = \
                            record(Worker.global_episode, ep_reward, self.worker_idx,
                                   Worker.global_moving_average_reward, self.result_queue,
                                   self.ep_loss, ep_steps)

                            if ep_reward > Worker.best_score:
                                with Worker.save_lock:

                                    self.global_model.save_weights(
                                        os.path.join(
                                            self.save_dir,
                                            'model_{}.h5'.format(
                                                self.game_name)))
                                    Worker.best_score = ep_reward
                            Worker.global_episode += 1
                    ep_steps += 1

                    time_count += 1
                    estado = nuevo_estado

                    total_step += 1
        self.result_queue.put(None)

    # Compute the loss

    def compute_loss(self, done, nuevo_estado, memory, gamma=0.99):
        if done:
            reward_sum = 0.  # terminal
        else:
            nuevo_estado = [nuevo_estado]
            reward_sum = self.local_model(
                tf.convert_to_tensor(nuevo_estado,
                                     dtype=tf.float32))[-1].numpy()[0]

        discounted_rewards = []
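        # Walk the stored rewards backwards, accumulating discounted returns
        # bootstrapped from the critic's value of the newest state (0 if terminal).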
        for reward in memory.rewards[::-1]:
            reward_sum = reward + gamma * reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()

        logits, values = self.local_model(
            tf.convert_to_tensor(np.vstack(memory.states), dtype=tf.float32))

        advantage = tf.convert_to_tensor(np.array(discounted_rewards)[:, None],
                                         dtype=tf.float32) - values

        value_loss = advantage**2

        policy = tf.nn.softmax(logits)
        entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=policy,
                                                             logits=logits)

        policy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=memory.actions, logits=logits)
        policy_loss *= tf.stop_gradient(advantage)
        policy_loss -= 0.01 * entropy
        total_loss = tf.reduce_mean((0.5 * value_loss + policy_loss))
        return total_loss
Example #2
    class Env(object):
        def __init__(self, game, **kwargs):
            self.act_space = kwargs.get("act_space")
            self.state_size = kwargs.get("state_size")
            self.burn_in = kwargs.get("burn_in")
            self.seqlen = kwargs.get("seqlen")
            self.n_step = kwargs.get("n_step")
            self.frames = kwargs.get("frames")
            self.replay = kwargs.get("replay")
            self.use_epsilon_greedy = kwargs.get("use_epsilon_greedy")

            self.game = game

            self.count = 0

            env = gym_super_mario_bros.make(game)
            if self.act_space == 7:
                self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
            elif self.act_space == 12:
                self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

            self.max_pos = -10000
            self.done = True
            self.reset()

        def step(self, a, a_logits, v_cur, state_in):
            self.count += 1
            if self.use_epsilon_greedy:
                a = np.argmax(a_logits)
                a_logits = self.epsilon / self.act_space * np.ones(
                    self.act_space)
                a_logits[a] += (1 - self.epsilon)
                a_logits = np.log(a_logits)
                if random.random() < self.epsilon:
                    a = random.randint(0, self.act_space - 1)
            self.a_t = a
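            # Step the emulator twice with the same action and average the two
            # rewards (a simple two-frame skip); if the first step already ends
            # the episode, keep that transition as-is.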
            gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
            if not gdone:
                s_t1, r_t, done, info = self.env.step(self.a_t)
                r_t += gr_t
                r_t /= 2.
            else:
                s_t1 = gs_t1
                r_t = gr_t
                done = gdone
                info = ginfo
            r_t /= 15.0
            s_t1 = self.resize_image(s_t1)
            channels = s_t1.shape[-1]
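            # Push the newest frame into the stacked observation, dropping the
            # oldest frame's channels.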
            self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]],
                                      axis=-1)

            self.s.append(self.s_t)
            self.a.append(self.a_t)
            self.a_logits.append(a_logits)
            self.r.append(r_t)
            self.v_cur.append(v_cur)
            self.max_pos = max(self.max_pos, info["x_pos"])
            self.pos.append(info["x_pos"])
            if (len(self.pos) >
                    100) and (info["x_pos"] - self.pos[-100] <
                              5) and (self.pos[-100] - info["x_pos"] < 5):
                done = True
            self.done = done
            if self.done:
                self.mask.append(0)
            else:
                self.mask.append(1)

            self.state_in.append(state_in)
            """
            get segs
            """
            segs = self.get_history()

            self.reset()

            return segs

        def reset(self):
            if self.done:
                print(self.game, self.max_pos)

                self.count = 0
                self.epsilon = 0.4**random.uniform(1, 8)

                s_t = self.resize_image(self.env.reset())

                self.s_t = np.tile(s_t, [1, 1, self.frames])
                self.s = [
                    np.zeros_like(self.s_t) for i in range(self.burn_in)
                ] + [self.s_t]

                self.a_t = random.randint(0, self.act_space - 1)
                self.a = [
                    random.randint(0, self.act_space - 1)
                    for i in range(self.burn_in)
                ] + [self.a_t]
                self.a_logits = [
                    np.zeros(self.act_space) for i in range(self.burn_in)
                ]
                self.r = [0] * self.burn_in + [0]
                self.v_cur = [0] * self.burn_in
                self.mask = [1] * self.burn_in + [1]

                self.max_pos = -10000
                self.pos = []

                state_in = np.zeros(self.state_size, dtype=np.float32)
                self.state_in = [state_in] * self.burn_in + [state_in]

                self.done = False

        def get_state(self):
            return self.s_t

        def get_act(self):
            return self.a_t

        def get_reward(self):
            return self.r[-1]

        def get_max_pos(self):
            return self.max_pos

        def get_state_in(self):
            return self.state_in[-1]

        def get_history(self):
            segs = []
            t = self.burn_in // self.replay
            if self.done:
                for i in range(self.replay):
                    seg = Seg(self.s[i * t:], self.a[i * t:],
                              self.a_logits[i * t:], self.r[i * t:],
                              self.v_cur[i * t:], self.state_in[i * t:],
                              self.mask[i * t:])
                    segs += self.postprocess(seg)
            elif len(self.s) >= self.burn_in + self.seqlen + self.n_step:
                cut = self.burn_in + self.seqlen + self.n_step
                seg = Seg(self.s[:cut], self.a[:cut], self.a_logits[:cut],
                          self.r[:cut], self.v_cur[:cut], self.state_in[:cut],
                          self.mask[:cut])

                self.s = self.s[t:]
                self.a = self.a[t:]
                self.a_logits = self.a_logits[t:]
                self.r = self.r[t:]
                self.v_cur = self.v_cur[t:]
                self.state_in = self.state_in[t:]
                self.mask = self.mask[t:]

                return [self.postprocess_one_seg(seg)]
            return segs

        def postprocess_one_seg(self, seg):
            seqlen = self.seqlen + self.burn_in + self.n_step

            next_seg = dict()

            next_seg["s"] = padding(seg.s[:seqlen], seqlen, np.uint8)
            next_seg["a"] = padding(seg.a[:seqlen], seqlen, np.int32)
            next_seg["a_logits"] = padding(seg.a_logits[:seqlen], seqlen,
                                           np.float32)
            next_seg["r"] = padding(seg.r[:seqlen], seqlen, np.float32)
            next_seg["v_cur"] = padding(seg.v_cur[:seqlen], seqlen, np.float32)
            next_seg["state_in"] = np.array(seg.state_in[0], np.float32)
            next_seg["mask"] = padding(seg.mask[:seqlen], seqlen, np.int32)

            return next_seg

        def postprocess(self, seg):
            """
            postprocess the seg for training
            :author lhw
            """
            burn_in = self.burn_in
            seg_results = []
            if seg is not None:
                while len(seg[0]) > burn_in + self.n_step:
                    next_seg = self.postprocess_one_seg(seg)
                    seg_results.append(next_seg)
                    seg = Seg(*[t[burn_in:] for t in seg])
            return seg_results

        @staticmethod
        def resize_image(image, size=84):
            image = Image.fromarray(image)
            image = image.convert("L")
            image = image.resize((size, size))
            image = np.array(image, np.uint8)
            return image[:, :, None]
class MarioEnvironment(dm_env.Environment):
    def __init__(
        self,
        skip_frames: int = 3,
        img_rescale_pc: float = 0.4,
        stack_func: Optional[Callable[[List[np.ndarray]],
                                      np.ndarray]] = np.hstack,
        stack_mode: str = "all",
        grayscale: bool = True,
        black_background: bool = True,
        in_game_score_weight: float = 0.01,
        movement_type: str = "simple",
        world_and_level: Optional[Tuple[int, int]] = None,
        idle_frames_threshold: Optional[int] = 1250,
        colorful_rendering: bool = True,
    ) -> None:
        assert stack_mode in ("first_and_last", "all")
        self._stack_mode = stack_mode

        env_name = (f"SuperMarioBros" if world_and_level is None else
                    "SuperMarioBros-%d-%d" % world_and_level)
        env_name += f"-v{int(black_background)}"
        self._smb_env = gym_super_mario_bros.make(env_name)
        self._smb_env = JoypadSpace(self._smb_env,
                                    MOVEMENTS_TYPES[movement_type])

        self._actions_queue = []
        self._colorful_env = None
        if (grayscale or black_background) and colorful_rendering:
            self._colorful_env = gym_super_mario_bros.make(
                "SuperMarioBros-%d-%d-v0" % world_and_level)
            self._colorful_env = JoypadSpace(self._colorful_env,
                                             MOVEMENTS_TYPES[movement_type])

        self._stack_func = stack_func
        self._grayscale = grayscale

        self._score_weight = in_game_score_weight
        self._idle_frames_threshold = idle_frames_threshold

        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0

        self._rescale_pc = img_rescale_pc
        self._skip_frames = skip_frames

        self._obs_shape = self.reset().observation.shape
        self._num_actions = self._smb_env.action_space.n

    def reset(self):
        """ Returns the first `TimeStep` of a new episode. """
        self._smb_env.reset()
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0

        self._actions_queue = []
        if self._colorful_env is not None:
            self._colorful_env.reset()

        return dm_env.restart(self.step(0).observation)

    def _is_idle(self, info):
        if self._idle_frames_threshold is None:
            return False

        x = info["x_pos"]
        delta_x = x - self._last_x
        self._last_x = x

        if abs(delta_x) < 1:
            self._idle_counter += 1
            return self._idle_counter > self._idle_frames_threshold

        self._idle_counter = 0
        return False

    def step(self, action) -> TimeStep:
        """ Updates the environment's state. """
        # NOTE:
        # The gym_super_mario_bros environment reuses the numpy array it
        # returns as observation. When stacking observations, this might be
        # a source of bugs (all observations in the stack might be representing
        # the same, final frame!), so always copy the arrays when doing that.
        # The observation arrays are already being copied inside
        # `self._preprocess_img`, so no explicit copying is needed here.

        action = int(action)
        initial_img, total_reward, done, info = self._smb_env.step(action)
        self._actions_queue.append(action)
        done = done or self._is_idle(info)

        # Skipping frames:
        if self._skip_frames > 0:
            imgs = [self._process_img(initial_img)]
            skip_count = 0
            while skip_count < self._skip_frames:
                skip_count += 1
                if not done:
                    last_img, reward, done, info = self._smb_env.step(action)
                    self._actions_queue.append(action)
                    done = done or self._is_idle(info)
                    total_reward += reward
                else:
                    last_img = np.zeros_like(initial_img)

                if self._stack_mode == "all" or skip_count == self._skip_frames:
                    imgs.append(self._process_img(last_img))

            obs = self._stack_func(imgs)
        # Single frame:
        else:
            obs = self._process_img(initial_img)

        score_diff = info["score"] - self._last_score
        self._last_score = info["score"]
        total_reward = np.float64(total_reward +
                                  self._score_weight * score_diff)

        if done:
            return dm_env.termination(reward=total_reward, observation=obs)
        return dm_env.transition(reward=total_reward, observation=obs)

    def observation_spec(self):
        return dm_env.specs.BoundedArray(shape=self._obs_shape,
                                         dtype=np.float32,
                                         name="image",
                                         minimum=0,
                                         maximum=1)

    def action_spec(self):
        return dm_env.specs.DiscreteArray(dtype=np.int32,
                                          name="action",
                                          num_values=self._num_actions)

    def _process_img(self, img):
        img = np.divide(img, 255)
        img = img[50:, :, :]

        if abs(self._rescale_pc - 1) > 1e-2:
            img = rescale(img, scale=self._rescale_pc, multichannel=True)

        if self._grayscale:
            img = img @ RGB2GRAY_COEFFICIENTS

        return img.astype(np.float32, copy=True)

    def render(self, mode="human", return_all_imgs=False):
        if return_all_imgs:
            assert self._colorful_env is not None and mode == "rgb_array", (
                "The option 'return_all_imgs' is valid only when using "
                "colorful rendering and rgb array mode!")

        # Regular rendering:
        if self._colorful_env is None:
            return self._smb_env.render(mode)

        # Colorful rendering:
        img_list = []
        for action in self._actions_queue:
            self._colorful_env.step(action)
            if return_all_imgs:
                # NOTE: make sure a copy of the returned rgb array is made!
                img_list.append(self._colorful_env.render(mode).copy())

        self._actions_queue = []
        return img_list if return_all_imgs else self._colorful_env.render(mode)

    def plot_obs(self, obs):
        plt.imshow(obs, cmap="gray" if self._grayscale else None)
        plt.show()

    def close(self):
        self._smb_env.close()
Example #4
class Env(object):
    def __init__(self, act_space, act_repeats, frames, epsilon, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.epsilon = epsilon
        self.frames = frames

        self.max_pos = -10000

        self.count = 0

        env = gym_super_mario_bros.make(game)
        if act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        s_t = self.resize_image(self.env.reset())

        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]

        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.r = []
        self.pos = []

        c_in = np.zeros(256, dtype=np.float32)
        h_in = np.zeros(256, dtype=np.float32)
        state_in = np.concatenate([c_in, h_in], axis=-1)
        self.state_in = [state_in]

        self.done = False

    def step(self, a, state_in):
        self.count += 1
        if random.random() < self.epsilon:
            a = random.randint(0, self.act_space - 1)
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        if (len(self.pos) > 500) and (info["x_pos"] - self.pos[-500] < 5) and (
                self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done

        self.state_in.append(state_in)

    def reset(self, force=False):
        if self.done or force:
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.r = []
            self.pos = []

            c_in = np.zeros(256, dtype=np.float32)
            h_in = np.zeros(256, dtype=np.float32)
            state_in = np.concatenate([c_in, h_in], axis=-1)
            self.state_in = [state_in]

            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            seg = Seg(self.s, self.a, self.r, self.state_in)
            return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
In Mario, the environment consists of tubes, mushrooms and other
components.

When Mario makes an action, the environment responds with the changed
(next) state, reward and other info.
"""

# Initialize Super Mario environment
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [["right"], ["right", "A"]])

env.reset()
next_state, reward, done, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")
"""Preprocess Environment
------------------------

Environment data is returned to the agent in ``next_state``. As you saw
above, each state is represented by a ``[240, 256, 3]`` size array.
Often that is more information than our agent needs; for instance,
Mario’s actions do not depend on the color of the pipes or the sky!

We use **Wrappers** to preprocess environment data before sending it to
the agent.

``GrayScaleObservation`` is a common wrapper to transform an RGB image to
grayscale; doing so reduces the size of the state representation without
losing useful information.
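
As a minimal sketch (assuming a gym version whose ``gym.wrappers`` module ships
``GrayScaleObservation``, ``ResizeObservation`` and ``FrameStack``), the
preprocessing chain could look like this:
"""

# Hedged sketch of an observation-preprocessing chain; exact output shapes can
# differ slightly between gym versions.
import numpy as np
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym.wrappers import FrameStack, GrayScaleObservation, ResizeObservation

env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
env = JoypadSpace(env, [["right"], ["right", "A"]])

env = GrayScaleObservation(env, keep_dim=False)  # (240, 256, 3) -> (240, 256)
env = ResizeObservation(env, shape=84)           # (240, 256)    -> (84, 84)
env = FrameStack(env, num_stack=4)               # stack the last 4 frames

obs = env.reset()
print(np.asarray(obs).shape)  # e.g. (4, 84, 84), depending on the gym version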
Example #6
        'ac_kwargs':
        dict(hidden_sizes=[64] * 2),
        'device':
        'cpu'
    }

    policy = PPO(**kwargs)
    policy.ac.load_state_dict(torch.load(model_name, map_location='cpu'))
    obs_normal = joblib.load(save_name)['obs_normal']
    if obs_normal is not None: obs_normal.cpu = 1
    # This command slows down (setpts > 1.0) or speeds up (setpts < 1.0) the video
    # ffmpeg -r 60 -i input.mp4 -filter:v "setpts=2.0*PTS" output.mp4
    # This command adds an audio track to the video
    # ffmpeg -i video.mp4 -i audio.mp3 -map 0:v -map 1:a -codec copy -shortest out.mp4
    # Source: https://stackoverflow.com/questions/20254846/how-to-add-an-external-audio-track-to-a-video-file-using-vlc-or-ffmpeg-command-l

    # xvfb-run -s "-screen 0 640x480x24" python test_model.py
    # This command is for envs that need a display to render (like CartPole-v1)
    for ep in range(10):
        obs = env.reset()
        if obs_normal is not None:
            obs = obs_normal.normalize_all(obs, update=False)
        while True:
            act = policy.act(obs)
            nx_obs, rew, done, info = env.step(act)
            obs = nx_obs
            if obs_normal is not None:
                obs = obs_normal.normalize_all(obs, update=False)
            if done:
                break
Example #7
class MoMarioEnv(Process):
    def __init__(self, args, env_idx, child_conn, history_size=4, h=84, w=84):
        super(MoMarioEnv, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(gym_super_mario_bros.make(args.env_id),
                               SIMPLE_MOVEMENT)

        self.is_render = args.render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.n_mo = 5
        self.morall = np.zeros(self.n_mo)
        self.recent_rlist = deque(maxlen=100)
        self.recent_morlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = args.life_done
        self.single_stage = args.single_stage
        self.stage_bonus = 0

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MoMarioEnv, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if self.single_stage and info["flag_get"]:
                self.stage_bonus = 10000
                done = True
            ''' Construct Multi-Objective Reward'''  #####################################
            # [x_pos, time, death, coin, enemy]
            moreward = []
            # 1. x position
            xpos_r = info["x_pos"] - self.x_pos
            self.x_pos = info["x_pos"]
            # resolve an issue where after death the x position resets
            if xpos_r < -5:
                xpos_r = 0
            moreward.append(xpos_r)

            # 2. time penalty
            time_r = info["time"] - self.time
            self.time = info["time"]
            # time is always decreasing
            if time_r > 0:
                time_r = 0
            moreward.append(time_r)

            # 3. death
            if self.lives > info['life']:
                death_r = -25
            else:
                death_r = 0
            moreward.append(death_r)

            # 4. coin
            coin_r = (info['coins'] - self.coin) * 100
            self.coin = info['coins']
            moreward.append(coin_r)

            # 5. enemy
            enemy_r = info['score'] - self.score
            if coin_r > 0 or done:
                enemy_r = 0
            self.score = info['score']
            moreward.append(enemy_r)

            ############################################################################

            if self.life_done:
                # When Mario loses a life, treat it as reaching a terminal
                # state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            r = reward / 15
            self.rall += reward

            self.morall += np.array(moreward)
            mor = np.array(moreward) * self.n_mo / 15

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            score = info['score'] + self.stage_bonus

            if done:
                self.recent_rlist.append(self.rall)
                self.recent_morlist.append(self.morall)
                print(
                    "[Episode {}({})]\tStep: {}\tScore: {}\tMoReward: {}\tRecent MoReward: {}\tcoin: {}\tcurrent x:{}"
                    .format(self.episode, self.env_idx, self.steps,
                            score, self.morall,
                            np.mean(self.recent_morlist,
                                    axis=0), info['coins'], info['x_pos']))

                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, mor, score])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.stage_bonus = 0
        self.morall = np.zeros(self.n_mo)
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)
        # env = gym.make(env_name)
        # env.render()

        step = 0

        gc.collect()

        while episode < EPISODES:
            done = False
            dead = False

            score, start_life = 0, 5
            observe = env.reset()
            next_observe = observe

            # Stay still for the first 0~30 (random) steps
            for _ in range(random.randint(1, 30)):
                observe = next_observe
                next_observe, _, _, _ = env.step(1)

            state = pre_processing(next_observe, observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 240, 256, 4))

            coinStatus = 0
            marioStatus = "small"
            flagStatus = False
            softReward = 0
            lifeStatus = 2

            while not done:
                step += 1
                self.t += 1
                observe = next_observe
                action, policy = self.get_action(history)

                # # 1: stay still, 2: left, 3: right
                # if action == 0:
                #     real_action = 1
                # elif action == 1:
                #     real_action = 2
                # else:
                #     real_action = 3
                #
                # # Take the fire action to restart after dying
                # if dead:
                #     action = 0
                #     real_action = 1
                #     dead = False

                # Execute one step with the selected action
                next_observe, reward, done, info = env.step(action)

                # Preprocess the state at every timestep
                next_state = pre_processing(next_observe, observe)
                next_state = np.reshape([next_state], (1, 240, 256, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                # Maximum of the policy output
                self.avg_p_max += np.amax(self.actor.predict(np.float32(history / 255.)))

                real_reward = reward
                if start_life > info['life']:
                    dead = True
                    start_life = info['life']

                # ###
                # if coinStatus != info["coins"]:
                #     coinStatus = info["coins"]
                #     reward = reward + 10
                # if marioStatus != info["status"]:
                #     marioStatus = info["status"]
                #     reward = reward + 200
                # if flagStatus != info["flag_get"]:
                #     flagStatus = info["flag_get"]
                #     reward = reward + 200
                # if lifeStatus != info["life"]:
                #     lifeStatus = info["life"]
                #     reward = reward - 200
                #
                # if info["x_pos"] < 10:
                #     info["x_pos"] = 10
                # if info["time"] < 10:
                #     info["time"] = 10
                #
                # reward = reward + ((info["x_pos"] / info["time"]) + info["x_pos"]) / 100

                score += real_reward
                # reward = np.clip(reward, -1., 1.)

                # Store the sample
                self.append_sample(history, action, reward)

                gc.collect()

                if dead:
                    history = np.stack((next_state, next_state, next_state, next_state), axis=2)
                    history = np.reshape([history], (1, 240, 256, 4))
                else:
                    history = next_history

                # Train when the episode ends or the maximum number of timesteps is reached
                if self.t >= self.t_max or done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # Log training info for each episode
                    episode += 1
                    ep_res = "episode: {},  score: {}, step: {}".format(episode, score, step)
                    print(ep_res)

                    if episode % 20 == 0:
                        slack_msg(ep_res)

                    # stats = [score, self.avg_p_max / float(step), step]
                    # for i in range(len(stats)):
                    #     self.sess.run(self.update_ops[i], feed_dict={ self.summary_placeholders[i]: float(stats[i]) })
                    # summary_str = self.sess.run(self.summary_op)
                    # self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
Example #9
class ICMTrainer:
    """
    Compose encoder, forward/inverse, and q_model into single trainer entity
    """
    def __init__(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)

        self.replay = SMBExperienceReplay(buffer_size=BUFFER_SIZE,
                                          batch_size=BATCH_SIZE)
        self.q_model = Qnetwork()
        self.encoder = Phi()
        self.forward_model = Fnet()
        self.inverse_model = Gnet()
        all_model_params = list(self.q_model.parameters()) + list(
            self.encoder.parameters())
        all_model_params += list(self.forward_model.parameters()) + \
                            list(self.inverse_model.parameters())
        self.opt = optim.Adam(lr=0.001, params=all_model_params)

    @staticmethod
    def combined_loss(q_loss, inverse_loss, forward_loss):
        """
        overall loss fn
        lambda*Qloss + (1-beta)*forward_loss + beta*inverse_loss
        """
        loss_ = (1 - BETA) * inverse_loss
        loss_ += BETA * forward_loss
        loss_ = loss_.sum() / loss_.flatten().shape[0]
        loss = loss_ + LAMBDA * q_loss
        return loss

    def icm_loss(self,
                 state1,
                 action,
                 state2,
                 forward_scale=1.,
                 inverse_scale=1e4):
        """ calculate forward and inverse model losses for ICM """
        fwd_loss_fn = nn.MSELoss(reduction='none')
        inverse_loss_fn = nn.CrossEntropyLoss(reduction='none')

        # encode input states
        state1_hat = self.encoder(state1)
        state2_hat = self.encoder(state2)
        # detach because we don't want to back-prop through the encoder here
        state2_hat_pred = self.forward_model(state1_hat.detach(),
                                             action.detach())
        forward_pred_err = fwd_loss_fn(
            state2_hat_pred, state2_hat.detach()).sum(dim=1).unsqueeze(dim=1)
        forward_pred_err *= forward_scale
        pred_action = self.inverse_model(state1_hat, state2_hat)
        inverse_pred_err = inverse_loss_fn(
            pred_action,
            action.detach().flatten()).unsqueeze(dim=1)
        inverse_pred_err *= inverse_scale
        return forward_pred_err, inverse_pred_err

    def batch_forward_pass(self, use_extrinsic=True):
        """ single forward pass that generates forward err, inverse err and q_loss"""
        # pylint: disable=E1101
        state1_batch, action_batch, reward_batch, state2_batch = self.replay.get_batch(
        )
        # reshape action/reward batches to be compatible with models
        action_batch = action_batch.view(action_batch.shape[0], 1)
        reward_batch = reward_batch.view(reward_batch.shape[0], 1)

        # run ICM
        forward_pred_err, inverse_pred_err = self.icm_loss(
            state1_batch, action_batch, state2_batch)
        # scale forward pred err using the Eta parameter
        i_reward = (1. / ETA) * forward_pred_err
        reward = i_reward.detach()
        if use_extrinsic:  # whether to include explicit rewards in training
            reward += reward_batch
        # discount expected values for next state
        qvals = self.q_model(state2_batch)
        reward += GAMMA * torch.max(qvals)
        reward_pred = self.q_model(state1_batch)
        reward_target = reward_pred.clone()
        # convert action batch (integers) to OHE
        indices = torch.stack(
            (torch.arange(action_batch.shape[0]), action_batch.squeeze()),
            dim=0)
        indices = indices.tolist()
        reward_target[indices] = reward.squeeze()

        q_loss = 1e5 * nn.MSELoss()(F.normalize(reward_pred),
                                    F.normalize(reward_target.detach()))
        return forward_pred_err, inverse_pred_err, q_loss

    def repeat_action(self, action):
        """ given action,
        repeat it specified times,
        and return combined state and rewards """
        state_deque = deque(maxlen=FRAMES_PER_STATE)
        sum_rewards = 0
        for _ in range(ACTION_REPEATS):
            state2, e_reward_, done, info = self.env.step(action)
            if done:
                break
            sum_rewards += e_reward_
            downscaled_state2 = downscale_img(state2, to_gray=True)
            # pylint: disable=E1101
            prepped_state2 = torch.from_numpy(downscaled_state2). \
                float().unsqueeze(dim=0)
            state_deque.append(prepped_state2)
        return state_deque, done, sum_rewards, info

    def train(self):
        """ full training loop """
        self.env.reset()
        state1 = prepare_initial_state(self.env.render('rgb_array'))
        losses = []
        ep_lengths = []
        episode_length = 0
        last_x_pos = self.env.env.env._x_position
        for i in range(TRAINING_STEPS):
            self.opt.zero_grad()
            episode_length += 1
            q_val_pred = self.q_model(state1)
            if i > SWITCH_TO_EPS_GREEDY:
                action = int(sample_action(q_val_pred, EPS))
            else:
                action = int(sample_action(q_val_pred))

            state_deque, done, extrinsic_reward, info = self.repeat_action(
                action)
            # pylint: disable=E1101
            state2 = torch.stack(list(state_deque), dim=1)
            self.replay.add_memory(
                state1,
                action,
                extrinsic_reward,  # summed across repeated actions
                state2)
            if i % MAX_EPISODE_LEN == 0 and i != 0:
                if (info['x_pos'] - last_x_pos) < MIN_PROGRESS:
                    done = True
                else:
                    last_x_pos = info['x_pos']
            if done:
                print("Episode over.")
                ep_lengths.append(info['x_pos'])
                self.env.reset()
                state1 = prepare_initial_state(self.env.render('rgb_array'))
                last_x_pos = self.env.env.env._x_position
                episode_length = 0
            else:
                state1 = state2
            # Enter mini-batch training
            if len(self.replay.memory) < BATCH_SIZE:
                continue

            forward_pred_err, inverse_pred_err, q_loss \
                = self.batch_forward_pass(use_extrinsic=False)
            loss = self.combined_loss(q_loss, inverse_pred_err,
                                      forward_pred_err)
            loss_list = (q_loss.mean(), forward_pred_err.flatten().mean(),
                         inverse_pred_err.flatten().mean(), episode_length)
            if i % 250 == 0:
                print("Epoch {}, Loss: {}".format(i, loss))
                print(
                    "Forward loss: {} \n Inverse loss: {} \n Qloss: {}".format(
                        forward_pred_err.mean(), inverse_pred_err.mean(),
                        q_loss.mean()))
                print(info)
            losses.append(loss_list)
            loss.backward()
            self.opt.step()
Example #10
def train():
    # Hyper parameters
    cfg = DictConfig({
        "epochs": 1,
        "lr": 1e-4,
        "use_extrinsic": True,
        "max_episode_len": 1000,
        "min_progress": 15,
        "frames_per_state": 3,
        "action_repeats": 6,
        "gamma_q": 0.85,
        "epsilon_random": 0.1,  # Sample random action with epsilon probability
        "epsilon_greedy_switch": 1000,
        "q_loss_weight": 0.01,
        "inverse_loss_weight": 0.5,
        "forward_loss_weight": 0.5,
        "intrinsic_weight": 1.0,
        "extrinsic_weight": 1.0,
    })

    # ---- setting up variables -----

    q_model = MarioModel(cfg.frames_per_state)
    icm_model = MarioICM(cfg.frames_per_state)

    optim = torch.optim.Adam(list(q_model.parameters()) +
                             list(icm_model.parameters()),
                             lr=cfg.lr)

    replay = ExperienceReplay(buffer_size=500, batch_size=100)
    env = gym_super_mario_bros.make("SuperMarioBros-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)

    # Counters and stats
    last_x_pos = 0
    current_episode = 0
    global_step = 0
    current_step = 0
    cumulative_reward = 0

    ep_rewards = []

    # ----- training loop ------

    for epoch in range(cfg.epochs):
        state = env.reset()
        done = False

        # Monte Carlo loop
        while not done:

            # ------------ Q Learning --------------

            if current_step == 0:
                state = prepare_initial_state(env.render("rgb_array"))
            else:
                state = prepare_multi_state(state, env.render("rgb_array"))

            q_values = q_model(state)
            action = sample_action(
                q_values,
                cfg.epsilon_random,
                apply_epsilon=global_step > cfg.epsilon_greedy_switch,
            )

            action_count = 0
            state2 = None
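            # Repeat the chosen action for up to cfg.action_repeats frames,
            # keeping the first returned frame as state2.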
            while True:
                state2_, reward, done, info = env.step(action)
                if state2 is None:
                    state2 = state2_
                env.render()
                if action_count >= cfg.action_repeats or done:
                    break
                action_count += 1
            state2 = prepare_multi_state(state, state2)

            # Add intrinsic reward
            intrinsic_reward = get_intrinsic_reward(state, action, state2,
                                                    icm_model)
            print("in reward", intrinsic_reward.item())
            print("ex reward", reward)

            reward = (cfg.intrinsic_weight *
                      intrinsic_reward) + (cfg.extrinsic_weight * reward)

            q_loss = get_q_loss(q_values[0][action], reward, q_model, state2,
                                cfg.gamma_q)

            replay.add(state, action, reward, state2)
            state = state2

            # ------------- ICM -------------------

            state1_batch, action_batch, reward_batch, state2_batch = replay.get_batch(
            )

            action_pred, state2_encoded, state2_pred = icm_model(
                state1_batch, action_batch, state2_batch)

            inverse_loss = F.cross_entropy(action_pred, action_batch)
            forward_loss = F.mse_loss(state2_pred, state2_encoded)

            # ------------ Learning ------------

            final_loss = ((cfg.q_loss_weight * q_loss) +
                          (cfg.inverse_loss_weight * inverse_loss) +
                          (cfg.forward_loss_weight * forward_loss))

            optim.zero_grad()
            final_loss.backward()
            optim.step()

            # ------------ updates --------------

            # TODO: add loss scalars
            print("--------loss: ", final_loss.item())

            max_episode_len_reached = current_step >= cfg.max_episode_len
            no_progress = False  # TODO: implement the progress check

            done = done or max_episode_len_reached or no_progress

            if done:
                if max_episode_len_reached:
                    # TODO: Add scalar: 'max episode len reached' current_episode, auto
                    pass
                elif no_progress:
                    # TODO: Add scalar: 'no progress' current_episode, auto
                    pass

                # TODO: add scalar: 'episode len' current_step, current_episode
                # TODO: Plot cumulative reward for each episode
                # TODO: Plot the x_pos after the episode
                # TODO: Plot total sum of rewards for each episode
                # TODO: Every n episodes save the video -> imageio.mimwrite('gameplay.mp4', renders: ndArray of frames, fps=30)

                current_step = -1
                current_episode += 1

            global_step += 1
            current_step += 1
Example #11
def dqn():
    env = gym_tetris.make('TetrisA-v2')
    env = JoypadSpace(env, MOVEMENT)
    episodes = 2000
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 50
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size)

    log_dir = f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            best_action = None
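            # Recover which action leads to the chosen best state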
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)

            agent.add_to_memory(current_state, next_states[best_action],
                                reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())

        # Train
        if episode % train_every == 0:
            agent.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])

            log.log(episode,
                    avg_score=avg_score,
                    min_score=min_score,
                    max_score=max_score)
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=True,
            h=84,
            w=84,
            movement=COMPLEX_MOVEMENT,
            sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(
            gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = life_done

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            obs, reward, done, info = self.env.step(action)

            # When Mario loses a life, treat it as reaching a terminal
            # state.
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = log_reward

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Stage: {} current x:{}   max x:{}".format(
                        self.episode,
                        self.env_idx,
                        self.steps,
                        self.rall,
                        np.mean(
                            self.recent_rlist),
                        info['stage'],
                        info['x_pos'],
                        self.max_pos))

                self.history = self.reset()

            self.child_conn.send([self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))

        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
Example #13
class Env(object):
    def __init__(self, game, **kwargs):
        self.act_space = kwargs.get("act_space")
        self.state_size = kwargs.get("state_size")
        self.burn_in = kwargs.get("burn_in")
        self.seqlen = kwargs.get("seqlen")
        self.n_step = kwargs.get("n_step")
        self.use_soft = kwargs.get("use_soft")
        self.frames = kwargs.get("frames")
        self.sample_epsilon_per_step = kwargs.get("sample_epsilon_per_step")

        self.epsilon = np.power(0.4, random.uniform(4, 8))
        self.game = game

        self.count = 0
        self.count_maxpos = []

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        self.max_pos = -10000
        self.done = True
        self.reset()

    def step(self, a, state_in):
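        # reset() returns the accumulated list of per-episode max positions
        # when a new episode has just started, otherwise None.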
        maxpos = self.reset()

        self.count += 1
        if not self.use_soft:
            if self.sample_epsilon_per_step:
                self.epsilon = np.power(0.4, random.uniform(4, 8))
            if random.random() < self.epsilon:
                a = random.randint(0, self.act_space - 1)
        self.a_t = a
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        if (len(self.pos) > 100) and (info["x_pos"] - self.pos[-100] < 5) and (
                self.pos[-100] - info["x_pos"] < 5):
            done = True
        self.done = done
        if self.done:
            self.mask.append(0)
        else:
            self.mask.append(1)

        self.state_in.append(state_in)
        """
        get segs
        """
        # segs = self.get_history()
        #
        # return segs
        return maxpos

    def reset(self):
        if self.done:
            self.count_maxpos.append(self.max_pos)
            print(self.game, self.max_pos, len(self.count_maxpos[1:]),
                  np.mean(self.count_maxpos[1:]))
            self.epsilon = np.power(0.4, random.uniform(4, 8))

            self.count = 0

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.r = [0]
            self.mask = [1]

            self.max_pos = -10000
            self.pos = []

            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]

            self.done = False
            return self.count_maxpos
        return None

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_reward(self):
        return self.r[-1]

    def get_max_pos(self):
        return self.max_pos

    def get_state_in(self):
        return self.state_in[-1]

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image, np.uint8)
        return image[:, :, None]
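
A minimal sketch of how the Env above might be driven by an actor loop is shown below; the constructor keyword values and the random stand-in policy are illustrative assumptions, not part of the example.

import random

# Hypothetical driver for the Env class above (keyword values are assumptions).
env = Env("SuperMarioBros-1-1-v0",
          act_space=7, state_size=512, burn_in=40, seqlen=80,
          n_step=5, use_soft=False, frames=4,
          sample_epsilon_per_step=False)

state_in = env.get_state_in()
for _ in range(1000):
    a = random.randint(0, env.act_space - 1)   # stand-in for a learned policy
    env.step(a, state_in)                      # auto-resets when an episode ends
    state_in = env.get_state_in()
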
Exemple #14
0
def train_agent(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    args.n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)

    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))

    if args.policyNet:
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))

    if not args.targetNet:
        target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    args.steps_done = 0

    num_episodes = 1

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            optimize_model(optimizer, memory, policy_net, target_net, args,
                           device)
            if done:
                episode_durations.append(t + 1)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)

        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')

    print('Complete')
    env.close()

    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
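
The argument namespace that train_agent above expects can be sketched as follows; attribute names mirror those read inside the function, while every value is an illustrative assumption (select_action and optimize_model may read further fields not shown in this example).

from argparse import Namespace

# Illustrative only -- values are assumptions, not tuned settings.
args = Namespace(
    ngpu=1,                              # use CUDA when available
    policyNet=None,                      # optional path to pre-trained policy weights
    targetNet=None,                      # optional path to pre-trained target weights
    output_policyNet='policy_net.pth',
    output_targetNet='target_net.pth',
    target_update=10,                    # episodes between target-network syncs
)
train_agent(args)
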
Exemple #15
0
class Env(object):
    def __init__(self, act_space, act_repeats, frames, n_step, gamma, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames
        self.n_step = n_step
        self.gamma = gamma

        self.max_pos = -10000

        self.count = 0

        env = gym_super_mario_bros.make(game)
        self.env = JoypadSpace(env, SIMPLE_MOVEMENT)

        s_t = self.resize_image(self.env.reset())

        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]

        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = []
        self.v_cur = []
        self.pos = []

        c_in = np.zeros(256, dtype=np.float32)
        h_in = np.zeros(256, dtype=np.float32)
        state_in = np.concatenate([c_in, h_in], axis=-1)
        self.state_in = [state_in]

        self.done = False

    def step(self, a, a_logits, state_in, v_cur):
        self.count += 1
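        # Sticky actions: adopt the newly proposed action only every act_repeat steps, then resample the repeat length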
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
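        # Scale the reward to roughly [-1, 1] (gym-super-mario-bros rewards lie in [-15, 15])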
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.v_cur.append(v_cur)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
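        # Terminate early when x_pos has moved less than 5 pixels over the last 500 steps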
        if (len(self.pos) > 500) and (info["x_pos"] - self.pos[-500] < 5) and (
                self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done

        self.state_in.append(state_in)

    def reset(self, force=False):
        if self.done or force:
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = []
            self.v_cur = []
            self.pos = []

            c_in = np.zeros(256, dtype=np.float32)
            h_in = np.zeros(256, dtype=np.float32)
            state_in = np.concatenate([c_in, h_in], axis=-1)
            self.state_in = [state_in]

            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            v_cur = np.array(self.v_cur + [0])
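            # Undo the value rescaling before computing advantages (h_inv is assumed to invert an R2D2-style value transform)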
            v_cur = h_inv(v_cur)
            gaes = get_gaes(None, self.r, v_cur[:-1], v_cur[1:], self.gamma,
                            0.95)[0]
            v_tar = get_rescaled_target(gaes, 1.0, self.v_cur)
            n_step_r = get_n_step_rewards(self.r, self.n_step, self.gamma)
            seg = Seg(self.s, self.a, self.a_logits, self.r, n_step_r,
                      self.v_cur, v_tar, gaes, self.state_in)
            return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
Exemple #16
0
class Env(object):
    def __init__(self, act_space, act_repeats, frames, state_size, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames
        self.state_size = state_size
        self.game = game

        self.max_pos = -10000

        self.count = 0

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        s_t = self.resize_image(self.env.reset())

        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]

        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = [0]
        self.pos = []

        self.v_cur = []

        state_in = np.zeros(self.state_size, dtype=np.float32)
        self.state_in = [state_in]

        self.done = False

    def step(self, a, a_logits, state_in):
        self.count += 1
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        if (len(self.pos) > 500) and (info["x_pos"] - self.pos[-500] < 5) and (
                self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done

        self.state_in.append(state_in)

    def update_v(self, v_cur):
        self.v_cur.append(v_cur)

    def reset(self, force=False):
        if self.done or force:
            max_pos = self.max_pos
            self.max_pos = -10000
            logging.info("  Max Position  %s : %d" % (self.game, max_pos))
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)

            s_t = self.resize_image(self.env.reset())

            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = [0]
            self.pos = []

            self.v_cur = []

            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]

            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            if self.done:
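                # Episode finished: bootstrap the value after the final step with 0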
                gaes = get_gaes(None, self.r, self.v_cur, self.v_cur[1:] + [0],
                                0.99, 0.95)[0]
                seg = Seg(self.s, self.a, self.a_logits, self.r, gaes,
                          self.v_cur, self.state_in)
                return seg
            if force and len(self.r) > 1:
                gaes = get_gaes(None, self.r[:-1], self.v_cur[:-1],
                                self.v_cur[1:], 0.99, 0.95)[0]
                seg = Seg(self.s[:-1], self.a[:-1], self.a_logits[:-1],
                          self.r[:-1], gaes, self.v_cur[:-1],
                          self.state_in[:-1])
                return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
Exemple #17
0
        os.makedirs(params['path_logs_dir'])
    shutil.copy('./params.json', params['path_logs_dir'] + '/params.json')
    writer = SummaryWriter(params['path_logs_dir'])
    dummy_input_to_policy_net = torch.randn(
        1, json_params['size_resized_image'],
        json_params['size_resized_image']).float().to(
            params['device']).unsqueeze(0)
    dummy_input_to_target_net = torch.randn(
        1, json_params['size_resized_image'],
        json_params['size_resized_image']).float().to(
            params['device']).unsqueeze(0)
    writer.add_graph(agent.brain.policy_net, dummy_input_to_policy_net)
    writer.add_graph(agent.brain.target_net, dummy_input_to_target_net)

    for episode in range(1, json_params['num_episodes'] + 1):
        observation = env.reset()
        state = preprocess(observation, json_params['size_resized_image'])
        t = done = total_rewards = total_loss = total_max_q_val = 0

        while True:
            if json_params['render']:
                env.render()

            t += 1
            action = agent.get_action(state)
            observation, reward, done, _ = env.step(action)
            if done:
                next_state = None
            else:
                next_state = preprocess(observation,
                                        json_params['size_resized_image'])
Exemple #18
0
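    # One-step Q-learning target: y = r + gamma * (1 - done) * max_a' Q_target(s', a');
    # gather(dim=1, index=action_batch) picks Q(s, a) for the actions actually taken.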
    qtargets = reward_batch.squeeze() + params['gamma'] * (
        (1 - done_batch.squeeze()) * torch.max(qtargets_, dim=1)[0])
    X = qvals.gather(dim=1, index=action_batch).squeeze()

    return loss_fn(X, qtargets.detach())


eps = 1
losses = []
ep_lengths = []
e_reward = 0.0
episode_length = 0
epochs = 7127431
env.reset()
state1 = prepare_initial_state(env.render('rgb_array'))
state_deque = deque(maxlen=params['frames_per_state'])
last_x_pos = env.env.env._x_position
start_time = time.time()

for i in range(epochs):
    optimizer.zero_grad()
    episode_length += 1
    qval_pred = Qmodel(state1)
    action = int(policy(qval_pred, eps))

    for j in range(params['action_repeats']):
        state2, e_reward_, done, info = env.step(action)
        last_x_pos = info['x_pos']
        if done:
Exemple #19
0
if __name__ == "__main__":
    tf.reset_default_graph()
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    action_size = env.action_space.n

    # envs[0].set_render(True)

    train_model = A2CAgent("train_model", False, sess, input_shape, action_size,
                           lr, GAMMA, LAMBDA, max_grad_norm, ent_coef, vf_coef, clip_range, True)
    while True:
        state_generator = StateGenerator(frame_size, stack_size)
        state = state_generator.get_stacked_frames(env.reset(), True)

        episodes_reward = 0
        while True:
            

            policy, value = train_model.get_actions_and_values(np.array([state]))
            action = np.random.choice(np.arange(action_size), p=np.squeeze(policy))
            
            for i in range(0, skip_frames):
                env.render()
                raw_state, frame_reward, done, info = env.step(action)
                if frame_reward == -15 or done:
                    raw_state = env.reset()
                    break
Exemple #20
0
# create the network
policy_net = DQNetwork(stacked_frame_dim=FRAME_DIM,
                       num_actions=env.action_space.n)
target_net = DQNetwork(stacked_frame_dim=FRAME_DIM,
                       num_actions=env.action_space.n)

# create the replay memory
replay_memory = ReplayMemory(REPLAY_MEMORY_CAPACITY)

# play the episodes
current_exploration = EXPLORATION_MAX
total_steps = 0
reward_history = []
mean_reward_history = []
for episode in range(NUM_EPISODES):
    state = env.reset()

    # play one game
    current_reward = 0
    for steps in range(MAX_STEPS_PER_GAME):  # cap the number of steps per game
        # render the environment
        if RENDER_ENVIRONMENT:
            env.render()

        # get the next action
        action = get_next_action(state, env.action_space.n,
                                 current_exploration)

        # perform the action
        next_state, reward, done, info = env.step(action)
Exemple #21
0
class EnvWrapper():
    def __init__(self, frame_size, skip_frames, stack_size):
        self.env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.agent = None
        self.frame_size = frame_size
        self.stack_size = stack_size
        self.action_size = self.env.action_space.n
        self.skip_frames = skip_frames
        self.render = False
        self.state_generator = StateGenerator(self.frame_size, self.stack_size)

        self.env.reset()
        raw_state, _, _, self.info = self.env.step(0)
        self.state = self.state_generator.get_stacked_frames(raw_state, True)

        self.states = []
        self.policies = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []

        self.episode = 0
        self.episodeReward = 0
        self.maxEpisodeReward = 0
        self.current_episode_reward = 0

        self.done = False

    def step(self, n):
        for _ in range(n):
            policy, value = self.agent.get_actions_and_values(
                np.array([self.state]))
            action = np.random.choice(self.action_size, p=np.squeeze(policy))
            reward = 0

            # Repeat the chosen action for skip_frames frames, accumulating the reward
            # plus a small bonus whenever the in-game score increases
            for i in range(0, self.skip_frames):
                raw_state, frame_reward, done, info = self.env.step(action)
                if frame_reward == -15 or done:
                    self.episode += 1
                    done = True
                    if frame_reward == -15:
                        reward = -15 * self.skip_frames
                    else:
                        reward = 15 * self.skip_frames

                    raw_state = self.env.reset()

                    break
                else:
                    reward += frame_reward
                    reward += (5 if
                               (info["score"] - self.info["score"]) > 0 else 0)

            # Roughly normalize the accumulated reward into [-1, 1]
            reward /= (15 * self.skip_frames)

            self.current_episode_reward += reward

            next_state = self.state_generator.get_stacked_frames(
                raw_state, done, frame_reward == 15
                or (done and self.episode % 100 == 0),
                self.current_episode_reward)

            self.states.append(self.state)
            self.policies.append(np.squeeze(policy))
            self.actions.append(action)
            self.rewards.append(reward)
            self.values.append(np.squeeze(value))
            self.dones.append(done)

            self.state = next_state
            self.done = done
            self.info = info

            if self.done:
                self.episodeReward = self.current_episode_reward

                if self.maxEpisodeReward < self.episodeReward:
                    self.maxEpisodeReward = self.episodeReward

                self.current_episode_reward = 0

    def get_experiences(self):
        if self.done:
            next_state_value = 0
        else:
            next_state_value = np.squeeze(
                self.agent.get_value(np.array([self.state])))

        states = self.states
        actions = self.actions
        policies = self.policies
        rewards = self.rewards
        values = self.values
        dones = [1 if done else 0 for done in self.dones]
        next_values = values[1:] + [next_state_value]

        self.states = []
        self.policies = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []

        return states, policies, actions, rewards, values, next_values, dones

    def get_action_size(self):
        return self.action_size

    def set_agent(self, agent):
        self.agent = agent

    def set_render(self, render):
        self.render = render

    def get_max_and_current_episode_reward(self):
        return self.maxEpisodeReward, self.episodeReward
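
A minimal sketch of driving the wrapper above together with an actor-critic agent; make_agent is a hypothetical factory, and the agent only needs the get_actions_and_values and get_value methods the wrapper calls.

# Illustrative driver; make_agent() is a hypothetical stand-in for building an
# agent that exposes get_actions_and_values(states) and get_value(states).
wrapper = EnvWrapper(frame_size=84, skip_frames=4, stack_size=4)
wrapper.set_agent(make_agent(wrapper.get_action_size()))

for update in range(10):
    wrapper.step(128)                     # collect 128 transitions
    (states, policies, actions, rewards,
     values, next_values, dones) = wrapper.get_experiences()
    # hand the batch to the learner's update step here
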
Exemple #22
0
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    agent = DQNAgent(action_size=7)

    scores, episodes, global_step = [], [], 0

    global_start = datetime.now()
    local_start = datetime.now()

    print()
    print("=" * 100)
    print("RL environment initialized")
    print("=" * 100)
    print()
    gc.collect()

    for e in range(1000):
        e = e + 1
        done = False
        dead = False

        step, score, start_life = 0, 0, 5
        observe = env.reset()

        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)

        state = agent.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 240, 256, 4))

        count_epsilon = 0
        count_greedy = 0

        coinStatus = 0
        marioStatus = "small"
        flagStatus = False
        softReward = 0
        lifeStatus = 2

        while not done:
            # if agent.render:
            #     env.render()
            global_step += 1
            step += 1
            # Choose an action based on the previous 4 stacked states
            action, res = agent.get_action(history)
            if res:
                count_epsilon += 1
            else:
                count_greedy += 1

            # Advance one timestep in the environment with the selected action
            observe, reward, done, info = env.step(action)
            # Preprocess the state at every timestep
            next_state = agent.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 240, 256, 1))
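            # Slide the 4-frame history: append the new frame and keep the 3 most recent previous frames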
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)
            agent.avg_q_max += np.amax(agent.model.predict(np.float32(history / 255.))[0])
            if start_life > info['life']:
                dead = True
                start_life = info['life']
            # reward = np.clip(reward, -1., 1.)
            real_reward = reward

            ###
            ###
            ###
            # reward = reward
            # if coinStatus != info["coins"]:
            #     coinStatus = info["coins"]
            #     reward = reward + 10
            # if marioStatus != info["status"]:
            #     marioStatus = info["status"]
            #     reward = reward + 200
            # if flagStatus != info["flag_get"]:
            #     flagStatus = info["flag_get"]
            #     reward = reward + 200
            # if lifeStatus != info["life"]:
            #     lifeStatus = info["life"]
            #     reward = reward - 20
            #
            # if info["x_pos"] < 10:
            #     info["x_pos"] = 10
            # if info["time"] < 10:
            #     info["time"] = 10
            #
            # reward = reward + math.log((info["x_pos"] / info["time"]) + info["x_pos"])

            # Store the sample <s, a, r, s'> in the replay memory, then train
            agent.append_sample(history, action, reward, next_history, dead)
            if len(agent.memory) >= agent.train_start:
                agent.train_model()
            # Periodically update the target model with the model's weights
            if global_step % agent.update_target_rate == 0:
                agent.update_target_model()

            # score += reward
            score += real_reward

            if dead:
                dead = False
            else:
                history = next_history

            if global_step == 0:
                pass
            elif global_step % 1000 == 0:
                print("local step : {}, time : {} sec, epsilon : {}".format(global_step, (datetime.now() - local_start).seconds, agent.epsilon))
                local_start = datetime.now()

            if done:
                ep_result = "episode : {}, score : {}, memory : {}, step : {}".format(e, score, len(agent.memory), global_step)
                print(ep_result)
                print("epsilon : {}, greedy : {}".format(count_epsilon, count_greedy))
                print()
                print("time elapsed : {} sec".format((datetime.now() - global_start).seconds))
                global_start = datetime.now()
                agent.epsilon = agent.epsilon - agent.epsilon_decay_step
                print("epsilon decay to {}!".format(agent.epsilon))
                print()

                slack_msg(ep_result)

                # if score > 2000 and score <= 3000:
                #     agent.epsilon = 0.075
                # elif score > 3000 and score <= 5000:
                #     agent.epsilon = 0.05
                # elif score > 5000 and score <= 10000:
                #     agent.epsilon = 0.005

                agent.avg_q_max, agent.avg_loss, global_step = 0, 0, 0

        # Save the model weights periodically (every 2 episodes here)
        if e == 0:
            pass
        elif e % 2 == 0:
            agent.model.save_weights("./dqn.h5")
            # dump(agent.memory, "memory.joblib")
            print("model saved!")
            print()

        gc.collect()
Exemple #23
0
class DQLMarioAgent(DQLAgent.DQLAgent):
    def __init__(self, action_type, batch_size, model_type, success_margin,
                 success_score, memory_size, record_video, target_model,
                 project, wrapper_type):
        super().__init__(action_type, batch_size, model_type, success_margin,
                         success_score, memory_size, record_video,
                         target_model, project)

        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        if wrapper_type == 'COMPLEX':
            self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)  # -> 12
        elif wrapper_type == 'SIMPLE':
            self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)  # -> 7
        else:
            self.env = JoypadSpace(self.env, RIGHT_ONLY)  # -> 5

        self.action_size = self.env.action_space.n
        self.num_states = 1
        self.state_single_size = 80
        self.state_size = (self.state_single_size, self.state_single_size)
        self.action = self.env.action_space.sample()
        self.first_last_x_pos = self.env.env.env._x_position
        self.max_distance = self.first_last_x_pos

        self.DLModel = NNModel.DLModel(env=self.env,
                                       action_size=self.action_size,
                                       state_size=self.state_single_size,
                                       states=self.num_states,
                                       model_type=model_type,
                                       output_dir=self.others_dir)

    def append_new_frame(self):
        """ Save generated env's frame """
        self.renders.append(
            img_as_ubyte(
                resize(self.env.render(mode='rgb_array'), (480, 480, 3))))

    def get_first_state(self):
        """
        :return: initial state
        """
        first_state = self.env.reset()
        return Utils.prepare_initial_state(first_state,
                                           self.state_size,
                                           channels=1)

    def get_first_x_pos(self):
        """
        :return: initial x position
        """
        return self.first_last_x_pos

    def prepare_state(self, next_state, channels=1):
        """
        Remove upper image info, reduce channels, and reduce image size
        :param channels: number of layers
        :param next_state: state to process
        :return: preprocessed image generated by env
        """
        return Utils.prepare_initial_state(next_state,
                                           self.state_size,
                                           channels=channels)

    def reset_max_distance(self):
        """ Reset episode max distance """
        self.max_distance = self.first_last_x_pos

    def update_max_distance(self, dist):
        """
        Update episode max distance
        :param dist: new distance
        :return: new max distance
        """
        if dist > self.max_distance:
            self.max_distance = dist
        return self.max_distance
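
A minimal instantiation sketch for the agent above; every constructor value here is an illustrative assumption, and the training loop itself lives in the DQLAgent base class, which this example does not show.

# Illustrative only -- argument values are assumptions, not project defaults.
agent = DQLMarioAgent(action_type='discrete', batch_size=32, model_type='cnn',
                      success_margin=50, success_score=3000, memory_size=50000,
                      record_video=False, target_model=True,
                      project='mario-dql', wrapper_type='SIMPLE')
state = agent.get_first_state()
last_x = agent.get_first_x_pos()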