Example #1
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
        print(self.env.action_space.low, self.env.action_space.high)
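
The print above only reports the continuous action bounds of the environment. A standalone sketch of the same check, assuming an older gym release and an illustrative environment id:

import gym

# 'Pendulum-v0' is an illustrative choice; its 1-D action space is the box [-2, 2]
env = gym.make('Pendulum-v0')
print(env.action_space.low, env.action_space.high)  # [-2.] [2.]
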
Example #2
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        # Linear epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        eps_steps = self.config.eps_fraction * float(self.config.frames)
        self.epsilon_by_frame = lambda frame_idx: epsilon_start + (min(1.0, float(frame_idx) / eps_steps)) \
                                                  * (epsilon_final - epsilon_start)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
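
A standalone check of the linear schedule above (not part of the example); the values stand in for the Config fields it reads:

epsilon_start, epsilon_final = 1.0, 0.01   # stand-ins for config.epsilon / config.epsilon_min
eps_steps = 0.1 * 100_000                  # stand-in for config.eps_fraction * config.frames

def epsilon_by_frame(frame_idx):
    # interpolate from epsilon_start down to epsilon_final over the first eps_steps frames
    return epsilon_start + min(1.0, float(frame_idx) / eps_steps) * (epsilon_final - epsilon_start)

print(round(epsilon_by_frame(0), 3), round(epsilon_by_frame(5_000), 3), round(epsilon_by_frame(20_000), 3))
# 1.0 0.505 0.01
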
Example #3
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        # Non-linear (exponential) epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        epsilon_decay = self.config.eps_decay
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                      epsilon_decay)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
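
The exponential variant approaches epsilon_min asymptotically instead of reaching it at a fixed frame. A standalone sketch with made-up values for the Config fields:

import math

epsilon_start, epsilon_final = 1.0, 0.01   # stand-ins for config.epsilon / config.epsilon_min
epsilon_decay = 30_000                     # stand-in for config.eps_decay (time constant, in frames)

def epsilon_by_frame(frame_idx):
    # decay toward epsilon_final by a factor of e every epsilon_decay frames
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)

print(round(epsilon_by_frame(0), 3), round(epsilon_by_frame(30_000), 3), round(epsilon_by_frame(90_000), 3))
# 1.0 0.374 0.059
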
Example #4
    def __init__(self, agent, config: Config, record=False):

        self.agent = agent
        self.config = config
        self.outputdir = get_output_folder()

        # if record:
        #     os.makedirs('video', exist_ok=True)
        #     filepath = self.outputdir + '/video/' + config.env + '-' + time_seq()
        #     env = wrappers.Monitor(env, filepath,
        #                            video_callable=lambda episode_id: episode_id % self.config.record_ep_interval == 0)

        # self.env = env
        # self.env.seed(config.seed)

        self.agent.is_training = True

        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
Example #5
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        # Non-linear (exponential) epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        epsilon_decay = self.config.eps_decay
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(
            -1. * frame_idx / epsilon_decay)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        # print("outputdir:", self.outputdir)
        # input()
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    def train(self, pre_fr=0):
        losses = []
        all_rewards = []
        episode_reward = 0
        ep_num = 0
        is_win = False

        state = self.env.reset()
        for fr in range(pre_fr + 1, self.config.frames + 1):
            epsilon = self.epsilon_by_frame(fr)
            action = self.agent.act(state, epsilon)

            next_state, reward, done, _ = self.env.step(action)
            self.agent.buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            loss = 0
            if self.agent.buffer.size() > self.config.batch_size:
                loss = self.agent.learning(fr)
                losses.append(loss)
                self.board_logger.scalar_summary('Loss per frame', fr, loss)

            if fr % self.config.print_interval == 0:
                print("frames: %5d, reward: %5f, loss: %4f episode: %4d" % (fr, np.mean(all_rewards[-10:]), loss, ep_num))

            if fr % self.config.log_interval == 0:
                self.board_logger.scalar_summary('Reward per episode', ep_num, all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done:
                state = self.env.reset()
                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary('Best 100-episodes average reward', ep_num, avg_reward)

                if len(all_rewards) >= 100 and avg_reward >= self.config.win_reward and all_rewards[-1] > self.config.win_reward:
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print('Ran %d episodes; best 100-episode average reward is %.3f. Solved after %d trials ✔' % (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
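
The snippets above read many attributes from Config but never show the class itself. A hypothetical minimal version, with field names inferred from the attributes used in these examples and purely illustrative defaults:

from dataclasses import dataclass

@dataclass
class Config:
    env: str = 'CartPole-v0'          # gym environment id
    output: str = 'out'               # base directory passed to get_output_folder
    frames: int = 100_000             # total training frames
    epsilon: float = 1.0              # initial exploration rate
    epsilon_min: float = 0.01         # final exploration rate
    eps_decay: float = 30_000         # time constant of the exponential decay
    batch_size: int = 32              # minimum buffer size before learning starts
    print_interval: int = 1_000
    log_interval: int = 1_000
    checkpoint: bool = True
    checkpoint_interval: int = 10_000
    win_reward: float = 195.0         # 100-episode average that counts as solved
    win_break: bool = True            # stop training once solved
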
Example #6
class Trainer:
    def __init__(self, agent, config: Config, record=False):

        self.agent = agent
        self.config = config
        self.outputdir = get_output_folder()

        # if record:
        #     os.makedirs('video', exist_ok=True)
        #     filepath = self.outputdir + '/video/' + config.env + '-' + time_seq()
        #     env = wrappers.Monitor(env, filepath,
        #                            video_callable=lambda episode_id: episode_id % self.config.record_ep_interval == 0)

        # self.env = env
        # self.env.seed(config.seed)

        self.agent.is_training = True

        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    async def train(self, pre_episodes=0, pre_total_step=0):
        total_step = pre_total_step
        all_rewards = []
        result_dir = os.path.join('./logs/', util.now_str())
        os.makedirs(result_dir, exist_ok=True)
        header = ["num_episode", "total_reward", "episode_length"]
        recorder = util.RecordHistory(os.path.join(result_dir, "history.csv"),
                                      header)
        recorder.generate_csv()
        for ep in range(pre_episodes + 1, self.config.episodes + 1):
            await util.sendCommand(util.COMMAND_MAP[util.Commands.RESET.value])
            s0 = await util.getState()
            # s0 = self.env.reset()
            # self.agent.reset()

            done = False
            step = 0
            actor_loss, critics_loss, reward = 0, 0, 0
            done_count = 0

            # decay noise
            self.agent.decay_epsilon()

            while done_count < 100:
                action = self.agent.get_action(s0)
                # translate action to motor speed here
                lms = int(action[0] * 127)
                rms = int(action[1] * 127)

                s1, r1, done, _ = await util.getNextState(lms, rms)
                # s1, r1, done = self.env.step(action)
                if done:
                    done_count += 1
                self.agent.buffer.add(s0, action, r1, done, s1)
                s0 = s1

                if self.agent.buffer.size() > self.config.batch_size:
                    loss_a, loss_c = self.agent.learning()
                    actor_loss += loss_a
                    critics_loss += loss_c

                reward += r1
                step += 1
                total_step += 1

                if step + 1 > self.config.max_steps:
                    break

            all_rewards.append(reward)
            avg_reward = float(np.mean(all_rewards[-100:]))
            self.board_logger.scalar_summary('Reward per episode', ep,
                                             all_rewards[-1])
            self.board_logger.scalar_summary(
                'Best 100-episodes average reward', ep, avg_reward)

            print(
                'total step: %5d, episodes %3d, episode_step: %5d, episode_reward: %5f'
                % (total_step, ep, step, reward))

            history = {
                "num_episode": ep,
                "total_reward": reward,
                "episode_length": step,
            }

            recorder.add_histry(history)

            # checkpoint
            if self.config.checkpoint and ep % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(ep, total_step, self.outputdir)

        # save the final model
        self.agent.save_model(self.outputdir)

        asyncio.get_event_loop().stop()
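
util.RecordHistory is not shown in these snippets. A hypothetical stand-in with the same surface as the call sites above (generate_csv and add_histry, the latter spelled to match the caller):

import csv

class RecordHistory:
    """Writes the header once, then appends one row per episode."""

    def __init__(self, csv_path, header):
        self.csv_path = csv_path
        self.header = header

    def generate_csv(self):
        with open(self.csv_path, 'w', newline='') as f:
            csv.writer(f).writerow(self.header)

    def add_histry(self, history):
        with open(self.csv_path, 'a', newline='') as f:
            csv.writer(f).writerow([history[key] for key in self.header])
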
Example #7
class Trainer:
    def __init__(self, agent, env, config: Config, record=False):

        self.agent = agent
        self.config = config
        self.outputdir = get_output_folder(self.config.output, self.config.env)

        if record:
            os.makedirs('video', exist_ok=True)
            filepath = self.outputdir + '/video/' + config.env + '-' + time_seq()
            env = wrappers.Monitor(
                env,
                filepath,
                video_callable=lambda episode_id: episode_id % self.config.record_ep_interval == 0)

        self.env = env
        self.env.seed(config.seed)

        self.agent.is_training = True

        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    def train(self, pre_episodes=0, pre_total_step=0):
        total_step = pre_total_step

        all_rewards = []
        for ep in range(pre_episodes + 1, self.config.episodes + 1):
            s0 = self.env.reset()
            self.agent.reset()

            done = False
            step = 0
            actor_loss, critics_loss, reward = 0, 0, 0

            # decay noise
            self.agent.decay_epsilon()

            while not done:
                action = self.agent.get_action(s0)

                s1, r1, done, info = self.env.step(action)
                self.agent.buffer.add(s0, action, r1, done, s1)
                s0 = s1

                if self.agent.buffer.size() > self.config.batch_size:
                    loss_a, loss_c = self.agent.learning()
                    actor_loss += loss_a
                    critics_loss += loss_c

                reward += r1
                step += 1
                total_step += 1

                if step + 1 > self.config.max_steps:
                    break

            all_rewards.append(reward)
            avg_reward = float(np.mean(all_rewards[-100:]))
            self.board_logger.scalar_summary('Reward per episode', ep,
                                             all_rewards[-1])
            self.board_logger.scalar_summary(
                'Best 100-episodes average reward', ep, avg_reward)

            print(
                'total step: %5d, episodes %3d, episode_step: %5d, episode_reward: %5f'
                % (total_step, ep, step, reward))

            # checkpoint
            if self.config.checkpoint and ep % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(ep, total_step, self.outputdir)

        # save the final model
        self.agent.save_model(self.outputdir)
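
A hypothetical wiring sketch for this trainer. It assumes an older gym release (env.seed and a 4-tuple step), that Config holds plain attributes, and MyDDPGAgent is a placeholder for any agent exposing the get_action/learning/buffer/decay_epsilon/save_* methods used above:

import gym

config = Config()                      # the repo's Config, assumed to be a plain attribute container
config.env = 'Pendulum-v0'             # illustrative continuous-control task
config.seed = 0
config.output = 'out'
config.episodes = 200
config.max_steps = 200
config.checkpoint = False

env = gym.make(config.env)
agent = MyDDPGAgent(config)            # hypothetical agent implementation
trainer = Trainer(agent, env, config, record=False)
trainer.train()
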
Example #8
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        # Non-linear (exponential) epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        epsilon_decay = self.config.eps_decay
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                      epsilon_decay)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
        print(self.outputdir)

    def train(self, pre_fr=0):
        losses = []
        all_rewards = []
        episode_reward = 0
        ep_num = 0
        is_win = False
        start = time.time()
        state = self.env.reset()
        for fr in range(pre_fr + 1, self.config.frames + 1):
            # Collect the first 200 frames of each gif_interval window and save them as a GIF
            if 1 <= fr % self.config.gif_interval <= 200:
                if fr % self.config.gif_interval == 1:
                    frames = []
                img = state[0, 0:3].transpose(1, 2, 0).astype('uint8')
                frames.append(Image.fromarray(img).convert('RGB'))
                if fr % self.config.gif_interval == 200:
                    imageio.mimsave('record.gif', frames, 'GIF', duration=0.1)

            epsilon = self.epsilon_by_frame(fr)
            action = self.agent.act(state, epsilon)

            next_state, reward, done, _ = self.env.step(action)
            self.agent.buffer.add(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            loss = 0
            if fr > self.config.init_buff and fr % self.config.learning_interval == 0:
                loss = self.agent.learning(fr)
                losses.append(loss)
                self.board_logger.scalar_summary('Loss per frame', fr, loss)

            if fr % self.config.print_interval == 0:
                print(
                    "TIME {}  num timesteps {}, FPS {} \n Loss {:.3f}, avrage reward {:.1f}"
                    .format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start)), fr,
                        int(fr / (time.time() - start)), loss,
                        np.mean(all_rewards[-10:])))

            if fr % self.config.log_interval == 0:
                self.board_logger.scalar_summary('Reward per episode', ep_num,
                                                 all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done:
                state = self.env.reset()
                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary(
                    'Best 100-episodes average reward', ep_num, avg_reward)

                if (len(all_rewards) >= 100 and avg_reward >= self.config.win_reward
                        and all_rewards[-1] > self.config.win_reward):
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print(
                        'Ran %d episodes; best 100-episode average reward is %.3f. Solved after %d trials ✔'
                        % (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
Example #9
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)
        print(self.env.action_space.low, self.env.action_space.high)

    def train(self, pre_fr=0):
        t = 0
        all_rewards = []
        tmp_reward = 0
        episode_reward = 0
        ep_num = 0
        is_win = False

        state = self.env.reset()

        for fr in range(pre_fr + 1, self.config.frames + 1):
            t += 1

            # self.env.render()
            action = self.agent.act(state)
            action = action + np.random.normal(
                0,
                self.config.exploration_noise,
                size=self.env.action_space.shape[0])
            action = action.clip(self.env.action_space.low,
                                 self.env.action_space.high)

            next_state, reward, done, _ = self.env.step(action)

            self.agent.buffer.add(state, action, reward, next_state,
                                  float(done))

            state = next_state
            episode_reward += reward

            if fr % self.config.print_interval == 0:
                print("frames: %5d, reward: %5f, episode: %4d" %
                      (fr, np.mean(all_rewards[-10:]), ep_num))

            if fr % self.config.log_interval == 0:
                self.board_logger.scalar_summary('Reward per episode', ep_num,
                                                 all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done or t == (self.config.max_timesteps):
                self.agent.learning(fr, t)
                t = 0

                state = self.env.reset()

                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary(
                    'Best 100-episodes average reward', ep_num, avg_reward)

                if (len(all_rewards) >= 100 and avg_reward >= self.config.win_reward
                        and all_rewards[-1] > self.config.win_reward):
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print(
                        'Ran %d episodes; best 100-episode average reward is %.3f. Solved after %d trials ✔'
                        % (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break
                elif len(all_rewards) >= 100 and avg_reward > tmp_reward:
                    tmp_reward = avg_reward
                    self.agent.save_model(self.outputdir, 'tmp')
                    print(
                        'Ran %d episodes; best-so-far 100-episode average reward is %.3f after %d trials (saved as tmp)'
                        % (ep_num, avg_reward, ep_num - 100))

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
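
The exploration step above (Gaussian noise added to the deterministic action, then clipping to the action bounds) in isolation, with made-up bounds and noise scale:

import numpy as np

action_low, action_high = np.array([-2.0]), np.array([2.0])  # stand-ins for env.action_space.low / .high
exploration_noise = 0.1                                       # stand-in for config.exploration_noise

def explore(action):
    # zero-mean Gaussian noise, clipped back into the valid action range
    noisy = action + np.random.normal(0, exploration_noise, size=action.shape)
    return noisy.clip(action_low, action_high)

print(explore(np.array([1.95])))  # always within [-2, 2]
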
Example #10
class Trainer:
    def __init__(self, agent, env, config: Config):
        self.agent = agent
        self.env = env
        self.config = config
        self.SaveImage = True

        if not os.path.exists('./history'):
            os.mkdir('./history')

        # Non-linear (exponential) epsilon decay
        epsilon_final = self.config.epsilon_min
        epsilon_start = self.config.epsilon
        epsilon_decay = self.config.eps_decay

        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                      epsilon_decay)

        self.outputdir = get_output_folder(self.config.output, self.config.env)
        self.agent.save_config(self.outputdir)
        self.board_logger = TensorBoardLogger(self.outputdir)

    def train(self, pre_fr=0):
        losses = []
        all_rewards = []
        episode_reward = 0
        ep_num = 0
        is_win = False

        state = self.env.reset()  # (360, 480)

        current_, v, a = self.env.get_info()
        '''
        # RGB
        state = np.reshape([cv2.resize(state, (self.config.image_size, self.config.image_size)).transpose(2,0,1)], (1, 3, self.config.image_size, self.config.image_size)) #(1, 3, 96, 96)
        history = np.stack((state, state, state, state, state, state), axis=1) # (1, 3*6, 96, 96)
        history = np.reshape([np.concatenate(history)], (1, 18, self.config.image_size, self.config.image_size))    #(1, 18, 96, 96)
        '''

        # Gray
        state = np.reshape([
            cv2.resize(state, (self.config.image_size, self.config.image_size))
        ], (1, self.config.image_size, self.config.image_size))  # (1, 96, 96)
        history = np.stack((state, state, state, state, state, state),
                           axis=1)  # (1, 6, 96, 96)

        npv = np.ones(
            [1, 1, self.config.image_size, self.config.image_size]) * int(
                3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))  # (1, 1, 96, 96)
        npa = np.ones(
            [1, 1, self.config.image_size, self.config.image_size]) * int(
                math.sqrt(a.x**2 + a.y**2 + a.z**2))  # (1, 1, 96, 96)
        history_value = np.append(history, npv, axis=1)  # (1, 7, 96, 96)
        history_value = np.append(history_value, npa, axis=1)  # (1, 8, 96, 96)

        if self.SaveImage:
            img = history_value.transpose(0, 2, 3, 1)
            scipy.misc.imsave('history/history0.jpg', img[0][:, :, 0])
            scipy.misc.imsave('history/history1.jpg', img[0][:, :, 1])
            scipy.misc.imsave('history/history2.jpg', img[0][:, :, 2])
            scipy.misc.imsave('history/history3.jpg', img[0][:, :, 3])
            scipy.misc.imsave('history/history4.jpg', img[0][:, :, 4])
            scipy.misc.imsave('history/history5.jpg', img[0][:, :, 5])
            scipy.misc.imsave('history/history6.jpg', img[0][:, :, 6])  # npv
            scipy.misc.imsave('history/history7.jpg', img[0][:, :, 7])  # npa

        for fr in range(pre_fr + 1, self.config.frames + 1):
            # self.env.render()
            epsilon = self.epsilon_by_frame(fr)

            # action = self.agent.act(state, epsilon)
            action = self.agent.act(history_value, epsilon)

            next_state, reward, done, _ = self.env.step(action)
            next_, v, a = self.env.get_info()

            # Recalculate reward
            ll = int(
                math.sqrt((next_.x - current_.x)**2 +
                          (next_.y - current_.y)**2 +
                          (next_.z - current_.z)**2))
            vv = int(3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))
            aa = int(math.sqrt(a.x**2 + a.y**2 + a.z**2))
            if not done:
                # the conditional covers the whole added term: +(2 * (ll - 2) + 3) when vv > 40, else -10
                reward += (2 * (ll - 2) + 3) if vv > 40 else -10
            '''
            # Draw path trajectory
            # self.env.draw_waypoint_union(self.env.world.debug, current_, next_)
            # self.env.draw_string(self.env.world.debug, current_, str('%15.0f km/h' % vv) )
            '''

            next_history = np.reshape([
                cv2.resize(next_state,
                           (self.config.image_size, self.config.image_size))
            ], (1, 1, self.config.image_size,
                self.config.image_size))  # (1, 1, 96, 96)
            next_history = np.append(next_history,
                                     history[:, :5, :, :],
                                     axis=1)  # (1, 6, 96, 96)

            npv = np.ones([
                1, 1, self.config.image_size, self.config.image_size
            ]) * vv  # (1, 1, 96, 96)
            npa = np.ones([
                1, 1, self.config.image_size, self.config.image_size
            ]) * aa  # (1, 1, 96, 96)
            next_history_value = np.append(next_history, npv,
                                           axis=1)  # (1, 7, 96, 96)
            next_history_value = np.append(next_history_value, npa,
                                           axis=1)  # (1, 8, 96, 96)

            if self.SaveImage:
                img = next_history_value.transpose(0, 2, 3, 1)
                scipy.misc.imsave('history/history' + str(fr) + '0.jpg',
                                  img[0][:, :, 0])
                scipy.misc.imsave('history/history' + str(fr) + '1.jpg',
                                  img[0][:, :, 1])
                scipy.misc.imsave('history/history' + str(fr) + '2.jpg',
                                  img[0][:, :, 2])
                scipy.misc.imsave('history/history' + str(fr) + '3.jpg',
                                  img[0][:, :, 3])
                scipy.misc.imsave('history/history' + str(fr) + '4.jpg',
                                  img[0][:, :, 4])
                scipy.misc.imsave('history/history' + str(fr) + '5.jpg',
                                  img[0][:, :, 5])
                scipy.misc.imsave('history/history' + str(fr) + '6.jpg',
                                  img[0][:, :, 6])  # npv
                scipy.misc.imsave('history/history' + str(fr) + '7.jpg',
                                  img[0][:, :, 7])  # npa

            # self.agent.buffer.add(state, action, reward, next_state, done)
            self.agent.buffer.add(history_value, action, reward,
                                  next_history_value, done)

            current_ = next_
            state = next_state
            history = next_history
            history_value = next_history_value
            episode_reward += reward

            loss = 0
            # if self.agent.buffer.size() > self.config.batch_size:
            if self.agent.buffer.size() > self.config.min_buff:
                loss = self.agent.learning(fr)
                losses.append(loss)
                self.board_logger.scalar_summary('Loss per frame', fr, loss)

            if fr % self.config.print_interval == 0:
                print(
                    "frames: %5d, reward: %5f, loss: %4f episode: %4d, epsilon: %4f"
                    % (fr, np.mean(all_rewards[-10:]), loss, ep_num,
                       self.epsilon_by_frame(fr)))

            if fr % self.config.log_interval == 0:
                self.board_logger.scalar_summary('Reward per episode', ep_num,
                                                 all_rewards[-1])

            if self.config.checkpoint and fr % self.config.checkpoint_interval == 0:
                self.agent.save_checkpoint(fr, self.outputdir)

            if done:

                for actor in self.env.actor_list:
                    actor.destroy()
                    # carla.command.DestroyActor(actor)
                self.env.vehicle.destroy()
                # print("All cleaned up!")

                state = self.env.reset()
                current_, v, a = self.env.get_info()

                # Gray
                state = np.reshape([
                    cv2.resize(
                        state,
                        (self.config.image_size, self.config.image_size))
                ], (1, self.config.image_size,
                    self.config.image_size))  # (1, 96, 96)
                history = np.stack((state, state, state, state, state, state),
                                   axis=1)  # (1, 6, 96, 96)

                npv = np.ones([
                    1, 1, self.config.image_size, self.config.image_size
                ]) * int(3.6 *
                         math.sqrt(v.x**2 + v.y**2 + v.z**2))  # (1, 1, 96, 96)
                npa = np.ones([
                    1, 1, self.config.image_size, self.config.image_size
                ]) * int(math.sqrt(a.x**2 + a.y**2 + a.z**2))  # (1, 1, 96, 96)
                history_value = np.append(history, npv,
                                          axis=1)  # (1, 7, 96, 96)
                history_value = np.append(history_value, npa,
                                          axis=1)  # (1, 8, 96, 96)

                all_rewards.append(episode_reward)
                episode_reward = 0
                ep_num += 1
                avg_reward = float(np.mean(all_rewards[-100:]))
                self.board_logger.scalar_summary(
                    'Best 100-episodes average reward', ep_num, avg_reward)

                if (len(all_rewards) >= 100 and avg_reward >= self.config.win_reward
                        and all_rewards[-1] > self.config.win_reward):
                    is_win = True
                    self.agent.save_model(self.outputdir, 'best')
                    print(
                        'Ran %d episodes; best 100-episode average reward is %.3f. Solved after %d trials ✔'
                        % (ep_num, avg_reward, ep_num - 100))
                    if self.config.win_break:
                        break

        if not is_win:
            print('Did not solve after %d episodes' % ep_num)
            self.agent.save_model(self.outputdir, 'last')
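
The observation construction in this last example, reduced to a standalone sketch: six stacked grayscale frames plus constant speed and acceleration planes give a (1, 8, H, W) tensor. Sizes and values are illustrative only:

import numpy as np

H = W = 96
frame = np.zeros((1, H, W), dtype=np.float32)   # one resized grayscale frame
history = np.stack([frame] * 6, axis=1)         # (1, 6, H, W): the last six frames
speed_plane = np.ones((1, 1, H, W)) * 40        # speed in km/h, broadcast to a full plane
accel_plane = np.ones((1, 1, H, W)) * 2         # acceleration magnitude, broadcast to a plane
obs = np.concatenate([history, speed_plane, accel_plane], axis=1)
print(obs.shape)  # (1, 8, 96, 96)
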