Example #1
    def train(self, tasks, browser):
        env = WebBotEnv(tasks=tasks, browser=browser)
        stats = []

        found_tasklets = {}
        try:
            for episode in range(1, self.n_episodes + 1):
                env.reset()
                task = env.current_task.snapshot()
                self.logger.info("Episode %d/%d, task: %s" %
                                 (episode, self.n_episodes, task.task_str))
                max_reward = 0
                while True:
                    # End the episode when the task completes or the reward collapses.
                    if task.done or task.reward < -10:
                        break
                    env.render()

                    actions = task.get_preferred_actions()
                    if len(actions) == 0:
                        # Fall back to all possible actions when none are preferred.
                        actions = task.state.possible_actions
                    candidate_actions = []
                    action_scores = {}
                    for action in actions:
                        action_score = self._get_action_score(task, action)
                        action_scores[action] = action_score
                        if action_score > 0.1:
                            candidate_actions.append(action)
                    # If nothing scores above the threshold, keep the top 5 by score.
                    if len(candidate_actions) == 0:
                        candidate_actions = Utils.top_n(action_scores, 5,
                                                        reverse=True)
                    action = random.choice(candidate_actions)

                    env.step(action)
                    task_ = env.current_task.snapshot()
                    # Score the action with the reward observed after taking
                    # it; the pre-step snapshot's reward is stale.
                    self._set_action_score(task, action, score=task_.reward)
                    task = task_
                    self.logger.info("\taction:%s, %s" %
                                     (action, task.get_reward_str()))

                    tasklet = task.get_tasklet()
                    if tasklet not in found_tasklets:
                        found_tasklets[tasklet] = (task.total_reward, episode,
                                                   task.state.screenshot)
                    if task.total_reward > max_reward:
                        max_reward = task.total_reward

                stats.append([episode, max_reward])
                self.logger.info("Episode %d/%d, max_reward %.2f" %
                                 (episode, self.n_episodes, max_reward))
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info("failed with error: %s" % e)
        finally:
            # Always tear down the browser environment, even after a failure.
            env.destroy()
        return found_tasklets
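
The selection heuristic above (keep actions scoring over a threshold, otherwise fall back to the top-N by score, then pick uniformly at random) can be reproduced in isolation. A minimal self-contained sketch; `top_n` and `pick_action` here are illustrative stand-ins for the `Utils.top_n` call and the surrounding loop, not the original implementation:

import random

def top_n(scores, n, reverse=True):
    # Return the n keys with the highest (reverse=True) or lowest scores.
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=reverse)
    return [key for key, _ in ranked[:n]]

def pick_action(action_scores, threshold=0.1, fallback_n=5):
    # Prefer actions whose learned score exceeds the threshold;
    # if none qualify, fall back to the top-N scored actions.
    candidates = [a for a, s in action_scores.items() if s > threshold]
    if not candidates:
        candidates = top_n(action_scores, fallback_n, reverse=True)
    return random.choice(candidates)

# Low scorers ("scroll") are filtered out unless everything scores low.
print(pick_action({"click_submit": 0.9, "type_query": 0.4, "scroll": 0.02}))
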
Example #2
    def execute(self, tasks, browser, visualize=False):
        env = WebBotEnv(tasks=tasks, browser=browser, visualize=visualize)
        for task in tasks:
            # initial observation
            env.reset(new_task=task)
            task = env.current_task.snapshot()
            self.logger.info("Executing task: %s" % task.task_str)
            while True:
                if task.done:
                    break
                env.render()
                # Always exploit: take the model's best-scoring action.
                action, q = self.choose_action_with_model(task)
                env.step(action)
                # self.fe.plot_feature(task, action)
                task_ = env.current_task.snapshot()
                task = task_
                self.logger.info("\tExploit, action:%s, reward:%.2f, done:%s" %
                                 (action, task.reward, task.done))
            self.logger.info("Got total_reward %.2f in task: %s" %
                             (task.total_reward, task.task_str))
        self.logger.info("Done executing tasks.")
Example #3
    def execute(self, tasks, browser):
        env = WebBotEnv(tasks=tasks, browser=browser)
        for task in tasks:
            env.reset(new_task=task)
            task = env.current_task.snapshot()

            while True:
                if task.done:
                    break
                env.render()

                actions = task.get_preferred_actions()
                # Sample an action in proportion to the model's predicted score.
                action2p = self.model.predict(task, actions)
                action = Utils.weighted_choice(action2p)
                env.step(action)
                task_ = env.current_task.snapshot()
                task = task_
                self.logger.info("\tExploit, action:%s, reward:%.2f, done:%s" %
                                 (action, task.reward, task.done))
            self.logger.info("Got total_reward %.2f in task." %
                             task.total_reward)
        env.destroy()
        self.logger.info("Done testing tasks.")
Example #4
    def train(self, tasks, browser):
        env = WebBotEnv(tasks=tasks, browser=browser)
        stats = []

        def save_progress(save_stats=True,
                          save_fig=True,
                          save_model=False,
                          save_memory=False):
            try:
                if save_stats:
                    stats_path = os.path.join(self.model_dir,
                                              "training_stats.json")
                    with open(stats_path, "w") as f:
                        json.dump(stats, f, indent=2)
                if save_fig:
                    stats_png_path = os.path.join(self.log_dir,
                                                  "training_stats.png")
                    self._plot_training_stats(stats,
                                              self.et.n_explore_episodes,
                                              stats_png_path)
                if save_model:
                    self.save_model()
                if save_memory:
                    self.replay_memory.save(self.model_dir)
            except Exception as e:
                self.logger.warning(e)

        def resume_progress():
            # resume model
            self.load_model()
            # resume memory
            self.replay_memory.load(self.model_dir)
            # resume stats
            stats_path = os.path.join(self.model_dir, "training_stats.json")
            if os.path.exists(stats_path):
                # The file holds a list of stat rows; extend rather than
                # append, which would nest the whole list as one element.
                with open(stats_path) as f:
                    stats.extend(json.load(f))

        if self.resume:
            resume_progress()

        if self.demo_dir:
            # Pre-train the model on demonstration transitions before
            # any environment interaction.
            self.demo_memory.load(self.demo_dir)
            for task in tasks:
                self.demo_memory.update_rewards(task)
            for i in range(self.demo_pretrain_steps):
                self._learn(memory_source="demo")
            self.logger.info("Done pre-training on demos.")

        found_tasklets = {}
        for episode in range(1, self.n_episodes + 1):
            # initial observation
            env.reset()
            task = env.current_task.snapshot()
            self.logger.info("Episode %d/%d, task: %s" %
                             (episode, self.n_episodes, task.task_str))

            max_reward = 0
            while True:
                # break while loop when end of this episode
                if task.done or task.reward < -10:
                    break
                env.render()
                epsilon = self.et.get_epsilon(episode, task)

                # RL choose action based on current task snapshot
                if np.random.uniform() < epsilon:
                    action_type = "Explore"
                    action = self.et.choose_action_to_explore(task)
                else:
                    action_type = "Exploit"
                    action, q = self.choose_action_with_model(
                        task, q_func=self.q_eval)
                env.step(action)

                # self.fe.plot_feature(task, action)
                task_ = env.current_task.snapshot()
                self.replay_memory.store_transition(
                    Transition(task=task, action=action, task_=task_))
                # swap observation
                task = task_
                self.logger.info(
                    "\t%s, epsilon:%.3f, action:%s, %s" %
                    (action_type, epsilon, action, task.get_reward_str()))

                tasklet = task.get_tasklet()
                if tasklet not in found_tasklets:
                    found_tasklets[tasklet] = (task.total_reward, episode,
                                               task.state.screenshot)
                if task.total_reward > max_reward:
                    max_reward = task.total_reward

            # Learn only after the pure-exploration warm-up episodes.
            if episode > self.et.n_explore_episodes:
                max_q, q_error = self._learn()
            else:
                max_q, q_error = None, None
            epsilon = self.et.get_epsilon(episode=episode)
            stats.append([episode, epsilon, max_reward, max_q, q_error])
            # max_q and q_error are None during warm-up; log NaN then.
            self.logger.info(
                "Episode %d/%d, epsilon %.3f, max_reward %.2f, max_q %.3f, q_error %.3f"
                % (episode, self.n_episodes, epsilon, max_reward,
                   max_q if max_q is not None else np.nan,
                   q_error if q_error is not None else np.nan))
            if episode % self.n_backup_episodes == 0:
                save_progress(save_fig=True,
                              save_model=False,
                              save_memory=False)
        save_progress(save_fig=True, save_model=True, save_memory=False)
        env.destroy()
        return found_tasklets
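
This variant stores `Transition(task=..., action=..., task_=...)` records in `self.replay_memory` and samples them inside `_learn`. A minimal ring-buffer replay memory consistent with those calls; the capacity, `sample` method, and buffer layout are assumptions, not the original class:

import random
from collections import namedtuple

Transition = namedtuple("Transition", ["task", "action", "task_"])

class ReplayMemory:
    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.buffer = []
        self.pos = 0

    def store_transition(self, transition):
        # Overwrite the oldest entry once the buffer is full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        # Uniform random minibatch for a learning step.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))
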
Example #5
    def train(self, tasks, browser):
        env = WebBotEnv(tasks=tasks, browser=browser)
        stats = []

        def save_progress(save_stats=True, save_fig=True, save_model=True):
            try:
                if save_stats:
                    stats_path = os.path.join(self.model_dir,
                                              "training_stats.json")
                    with open(stats_path, "w") as f:
                        json.dump(stats, f, indent=2)
                if save_fig:
                    stats_png_path = os.path.join(self.log_dir,
                                                  "training_stats.png")
                    self._plot_training_stats(stats, stats_png_path)
                if save_model:
                    self.save_model()
            except Exception as e:
                self.logger.warning(e)

        def resume_progress():
            # resume model
            self.load_model()
            stats_path = os.path.join(self.model_dir, "training_stats.json")
            if os.path.exists(stats_path):
                # Extend with the saved rows rather than appending them
                # as one nested list.
                with open(stats_path) as f:
                    stats.extend(json.load(f))

        if self.resume:
            resume_progress()

        found_tasklets = {}
        try:
            for episode in range(1, self.n_episodes + 1):
                env.reset()
                task = env.current_task.snapshot()
                self.logger.info("Episode %d/%d, task: %s" %
                                 (episode, self.n_episodes, task.task_str))

                max_reward = 0
                max_reward_task_snapshot = None
                tried_form_actions = []
                while True:
                    if task.done or task.reward < -10:
                        break
                    env.render()
                    epsilon = self.et.get_epsilon(episode, task)
                    # Skip forms already interacted with in this episode.
                    candidate_actions = []
                    interacted_form_ids = [
                        form.unique_id for form, _ in tried_form_actions
                    ]
                    for candidate_action in self.get_candidate_actions(task):
                        if isinstance(candidate_action, FormAction) and \
                                candidate_action.form.unique_id in interacted_form_ids:
                            continue
                        candidate_actions.append(candidate_action)

                    if len(candidate_actions) == 0:
                        break
                    rand = np.random.uniform()
                    action_category, action, q = "Unknown", None, 0
                    if rand > epsilon:
                        action_category = "Exploit"
                        action, q = self.choose_action_with_model(
                            task, candidate_actions)
                    # Explore when the epsilon coin says so, or when the model
                    # has no signal (q == 0) for any candidate action.
                    if rand <= epsilon or q == 0:
                        action_category = "Explore"
                        action = self.choose_action_to_explore(
                            task, candidate_actions)
                    # self.fe.plot_feature(task, action)
                    if action is None:
                        break

                    init_task = task.snapshot()
                    if isinstance(action, FormAction):
                        # Fill the form field by field; skip fields that no
                        # longer exist in the current page state.
                        form = action.form
                        form_actions, action_categories = form.try_solve(
                            epsilon)
                        init_reward = task.total_reward
                        for i, form_action in enumerate(form_actions):
                            form_action_element = task.state.get_element_by_locator(
                                form_action.element.locator)
                            if form_action_element is None:
                                form_action.value = None
                            if form_action.value is None:
                                continue
                            env.step(form_action)
                            task = env.current_task.snapshot()
                            self.logger.info(
                                "\t%s, epsilon:%.3f, action:%s, %s" %
                                (action_categories[i], epsilon, form_action,
                                 task.get_reward_str()))
                        tried_form_actions.append((form, form_actions))
                        self.logger.info(
                            f" {action} achieved {task.total_reward - init_reward:.2f}"
                        )
                    else:
                        env.step(action)
                        task = env.current_task.snapshot()
                        self.logger.info("\t%s, epsilon:%.3f, action:%s, %s" %
                                         (action_category, epsilon, action,
                                          task.get_reward_str()))
                    self._learn(init_task, action, task)

                    if task.total_reward > max_reward:
                        max_reward = task.total_reward
                        max_reward_task_snapshot = task.snapshot()

                if max_reward_task_snapshot is not None:
                    max_reward_tasklet = max_reward_task_snapshot.get_tasklet()
                    if max_reward_tasklet not in found_tasklets:
                        found_tasklets[max_reward_tasklet] = \
                            (max_reward_task_snapshot.total_reward, episode, max_reward_task_snapshot.state.screenshot)

                # Credit every form tried this episode with the episode's
                # best reward, then update the shared form model.
                for (form, form_actions) in tried_form_actions:
                    form.store_actions_actual_reward(form_actions, max_reward)
                self.form_manager.learn()

                epsilon = self.et.get_epsilon(episode=episode)
                stats.append([episode, epsilon, max_reward])
                if episode % self.n_backup_episodes == 0:
                    save_progress(save_fig=True, save_model=False)
                self.logger.info(
                    "Episode %d/%d, epsilon %.3f, max_reward %.2f" %
                    (episode, self.n_episodes, epsilon, max_reward))
            save_progress(save_fig=True, save_model=True)
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info("failed with error: %s" % e)
        finally:
            # Always tear down the browser environment, even after a failure.
            env.destroy()
        return found_tasklets
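
Because `training_stats.json` holds a flat list of rows, `resume_progress` must extend the in-memory list rather than append the loaded list as a single element. A standalone sketch of the save/resume round trip; the file name and row layout follow the code above:

import json
import os

def save_stats(stats, path):
    with open(path, "w") as f:
        json.dump(stats, f, indent=2)

def resume_stats(path):
    # Return previously saved rows, or an empty list on a first run.
    if os.path.exists(path):
        with open(path) as f:
            return json.load(f)
    return []

stats = resume_stats("training_stats.json")
stats.append([len(stats) + 1, 0.5, 12.3])  # [episode, epsilon, max_reward]
save_stats(stats, "training_stats.json")
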