def test_flatten_and_classify(self):
    """
    Check that ``flatten_and_classify`` tags each flattened example correctly.

    Builds a four-turn episode from eight alternating utterances, flattens
    it, and verifies that one example is produced per turn and that each
    example's text ends with the expected token.
    """
    parsed_opt = ParlaiParser().parse_args([])
    word_lists = ControllableTaskTeacher.build_wordlists(parsed_opt)

    utterances = [
        "hello there",
        "hi there dad, what's up",
        "not much, do you know where your sister is?",
        "I have not seen her, I thought she was with grandpa",
        "well, if you see her, let me know",
        "will do!",
        "ok, have a good day",
        "bye bye! tell mom I say hello",
    ]
    # expected suffix of each flattened example's text, in turn order
    tokens = ['f0m1', 'f1m1', 'f0m0', 'f1m0']

    # even-indexed utterances are the turn text, the following odd-indexed
    # utterance is that turn's label
    episode = []
    for text, label in zip(utterances[0::2], utterances[1::2]):
        turn = Message({'text': text, 'labels': [label], 'episode_done': False})
        episode.append(turn)
    # only the final turn closes the episode
    episode[-1].force_set('episode_done', True)

    # NOTE(review): -1 is presumably "no limit" on context length -- confirm
    # against flatten_and_classify.
    new_episode = flatten_and_classify(episode, -1, word_lists)

    assert len(new_episode) == 4
    suffixes_match = all(
        ex['text'].endswith(tok) for ex, tok in zip(new_episode, tokens)
    )
    assert suffixes_match, f"new episode: {new_episode}"
def _setup_data(self, opt: Opt) -> List[List[Message]]:
    """
    Flatten and classify the normal task data.

    Save/load where applicable.

    Sets ``self.original_task_name``, ``self.save_dir`` and
    ``self.save_path`` as a side effect, and creates ``self.save_dir`` on
    disk if it does not exist yet.

    :param opt:
        options dict. Reads ``task``, ``datapath``, ``datatype``,
        ``flatten_max_context_length``, ``flatten_include_labels`` and
        ``flatten_delimiter``.

    :return:
        whatever ``self.load_data`` returns when that is not None; otherwise
        the freshly built list of flattened episodes, which is also passed
        to ``self.save_data`` before returning.
    """
    # create save directory, if it does not already exist
    # (the wrapped task's name is everything after the first two
    # colon-separated fields of the task string)
    self.original_task_name = ':'.join(opt['task'].split(':')[2:])
    self.save_dir = self._get_save_path(
        opt['datapath'], str(datetime.datetime.today())
    )
    os.makedirs(self.save_dir, exist_ok=True)

    # one file per base datatype, e.g. 'train:ordered' -> 'train.json'
    fname = f"{opt['datatype'].split(':')[0]}.json"
    self.save_path = os.path.join(self.save_dir, fname)

    data = self.load_data(opt, fname)
    if data is not None:
        # successfully load data
        return data

    # build the original teacher
    original_task_module = get_original_task_module(opt)
    teacher_opt = deepcopy(opt)
    teacher_opt['task'] = self.original_task_name
    teacher = original_task_module(teacher_opt)

    total_exs = teacher.num_examples()
    # a non-positive max_examples leaves the example count uncapped
    # NOTE(review): this reads self.opt rather than the opt argument;
    # presumably they are the same object -- confirm.
    if self.opt['max_examples'] > 0:
        total_exs = min(self.opt['max_examples'], total_exs)

    all_episodes = []
    num_exs = 0
    # the context manager closes the progress bar even if building fails
    with tqdm(
        total=total_exs, unit='ex', unit_scale=True, desc='Building flattened data'
    ) as progress_bar:
        # the cap is only checked between episodes, so the last episode is
        # always consumed whole and num_exs may overshoot total_exs
        while num_exs < total_exs:
            current_episode = []
            episode_done = False

            while not episode_done:
                action = Message(teacher.act())
                current_episode.append(action)
                episode_done = action.get('episode_done', False)
                num_exs += 1

            # flatten the episode into 1-example episodes with context
            flattened_ep = flatten_and_classify(
                current_episode,
                opt['flatten_max_context_length'],
                include_labels=opt['flatten_include_labels'],
                delimiter=opt['flatten_delimiter'],
                word_lists=self.word_lists,
            )
            all_episodes += flattened_ep

            progress_bar.update(len(flattened_ep))

    # save data for future use
    self.save_data(all_episodes)

    return all_episodes