Beispiel #1
0
 def __init__(self, opt: Opt, agents=None, shared=None, default_world=None):
     """
     Build one sub-world per comma-separated task in ``opt['task']``.

     :param opt: global options; ``opt['task']`` lists the tasks to combine
     :param agents: pre-created agents reused by each sub-world
     :param shared: shared state from another copy of this world, if any
     :param default_world: fallback world class for tasks that do not
         specify one
     """
     super().__init__(opt)
     self.worlds: List[World] = []
     for index, k in enumerate(opt['task'].split(',')):
         k = k.strip()
         if k:
             if shared:
                 # Create worlds based on shared data.
                 s = shared['worlds'][index]
                 self.worlds.append(s['world_class'](s['opt'], None, s))
             else:
                 # Agents are already specified.
                 opt_singletask = copy.deepcopy(opt)
                 opt_singletask['task'] = k
                 self.worlds.append(
                     create_task_world(opt_singletask,
                                       agents,
                                       default_world=default_world))
     self.world_idx = -1
     self.new_world = True
     self.parleys = -1
     # Check to see if we are training
     self.is_training = DatatypeHelper.is_training(opt.get('datatype'))
     # Check to see if we should shuffle
     self.should_shuffle = DatatypeHelper.should_shuffle(
         opt.get('datatype'))
     # Make multi-task task probabilities.
     self.cum_task_weights = [1] * len(self.worlds)
     self.task_choices = range(len(self.worlds))
     weights = self.opt.get('multitask_weights', [1])
     # Warn about multi-task weights being ignored if we are in a datatype that doesn't involve shuffling
     if weights != [1] and not self.should_shuffle:
         warn_once(
             f"WARNING: multitask weights are ignored for datatype {opt.get('datatype')} as we iterate through tasks in a round robin"
         )
     if weights == 'stochastic':
         weights = [w.num_episodes() for w in self.worlds]
     # Accumulate cumulative weights for sampling. (Renamed from `sum`,
     # which shadowed the builtin of the same name.)
     running_total = 0
     for i in self.task_choices:
         # Tasks beyond the provided weight list default to weight 1.
         weight = weights[i] if len(weights) > i else 1
         running_total += weight
         self.cum_task_weights[i] = running_total
     task_ids: Dict[str, Teacher] = {}
     # Having overlap in teacher ids will cause issues for metrics aggregation.
     for each_world in self.worlds:
         world_id = each_world.getID()
         if world_id in task_ids:
             world_class = each_world.get_agents()[0].__class__
             error_once(
                 f"{task_ids[world_id]} and {world_class} teachers have overlap "
                 f"in id '{world_id}'. This will cause their metrics to be "
                 "intermingled. Change the id attribute of one to remove this "
                 "message.")
         else:
             task_ids[world_id] = each_world.get_task_agent()
Beispiel #2
0
 def _path(self, opt):
     """Return (data folder, fold name) for the requested datatype."""
     fold = DatatypeHelper.fold(opt["datatype"])
     # MetalWOz only ships train/test archives; valid is read from train.
     subdir = "train" if fold in ("train", "valid") else "test"
     folder = os.path.join(opt["datapath"], "metalwoz", subdir)
     return folder, fold
Beispiel #3
0
 def __init__(self, opt: Opt, shared=None):
     """Resolve the fold, build the MultiWOz 2.2 data, and initialize the base teacher."""
     self.fold = DatatypeHelper.fold(opt["datatype"])
     opt["datafile"] = self.fold
     self.dpath = os.path.join(opt["datapath"], "multiwoz_v22")
     build_.build(opt)
     # Remembers the most recent API call state per conversation.
     self.last_call = {}
     super().__init__(opt, shared)
Beispiel #4
0
 def _load_data(self, fold, domains):
     """
     Read per-domain TSV files, group rows into episodes, and split by fold.

     :param fold: which data fold to return (train/valid/test)
     :param domains: domain section names; one ``<section>_all.tsv`` each
     :return: fold subset produced by ``split_subset_data_by_fold``
     """
     chunks = []
     for section in domains:
         with PathManager.open(
                 os.path.join(self.dpath, section + "_all.tsv")) as f:
             reader = csv.reader(f, delimiter="\t")
             next(reader)  # skip the header row
             lines = list(reader)
         domain = []
         episode = []
         # id of the episode currently being accumulated; None before the
         # first row. (Fix: the original seeded this with int 0, which never
         # equals the string id, so an empty first episode was appended to
         # every domain.)
         prev_idx = None
         for line in lines:
             data = {
                 "id": line[0],
                 "speaker": line[3],
                 "text": line[4],
                 "dialogue_acts": line[5:],
                 "domain": section,
             }
             if prev_idx != data["id"]:
                 # New episode starts: flush the previous one if non-empty.
                 if episode:
                     domain.append(episode)
                 episode = []
                 prev_idx = data["id"]
             episode.append(data)
         if episode:
             domain.append(episode)
         chunks.append(domain)
     # deterministic shuffle data for splits
     return DatatypeHelper.split_subset_data_by_fold(
         fold, chunks, 0.8, 0.1, 0.1)
Beispiel #5
0
 def _path(self, opt):
     """Map the current datatype onto its MetalWOz data folder and fold."""
     fold = DatatypeHelper.fold(opt['datatype'])
     # valid is carved out of the train archive; only test has its own folder.
     subfolder = 'train' if fold in ('train', 'valid') else 'test'
     return os.path.join(opt['datapath'], 'metalwoz', subfolder), fold
Beispiel #6
0
    def load_data(self, datapath):
        """
        Load MetalWOz dialogues joined with their task descriptions.

        :param datapath: path whose directory part contains ``tasks.txt``
            and a ``dialogues`` folder
        :return: a flat record list for test folds, otherwise the subset
            chosen by ``split_subset_data_by_fold``
        """
        # Only the containing folder matters here; the path's final
        # component (the fold suffix) is unused.
        folder = os.path.dirname(datapath)
        with PathManager.open(os.path.join(folder, "tasks.txt")) as taskf:
            tasks_table = pd.read_json(taskf, lines=True)

        dfolder = os.path.join(folder, "dialogues")

        data = []

        for filename in PathManager.ls(dfolder):
            domain = filename.replace(".txt", "")
            # Honor an explicit domain filter when one was requested.
            if (self.opt["metalwoz_domains"]
                    and domain not in self.opt["metalwoz_domains"]):
                continue
            fullfn = os.path.join(dfolder, filename)
            with PathManager.open(fullfn) as dataf:
                lines = pd.read_json(dataf, lines=True)
                lines = lines.merge(tasks_table, on="task_id")
                data.append(lines.to_dict("records"))

        # Quick check to make sure we didn't fat-finger the spelling of some domain
        if self.opt["metalwoz_domains"]:
            assert len(data) == len(self.opt["metalwoz_domains"])

        # Test data ships in its own archive; use all of it without re-splitting.
        if "test" in self.fold:
            flat = []
            for domain in data:
                flat.extend(domain)
            return flat

        return DatatypeHelper.split_subset_data_by_fold(
            self.fold, data, 0.8, 0.1, 0.1)
Beispiel #7
0
 def get(self, episode_idx=0, entry_idx=0):
     """Return a fixed dummy example; the label key depends on train vs eval."""
     if DatatypeHelper.is_training(self.opt['datatype']):
         field = 'labels'
     else:
         field = 'eval_labels'
     return Message({'text': '1 2 3 4', field: ['1 2 3 4'], 'episode_done': True})
Beispiel #8
0
    def __init__(self, opt, shared=None):
        """Map this teacher's fold onto a HuggingFace split and initialize."""
        fold = DatatypeHelper.fold(opt['datatype'])
        self.fold = fold
        self.hf_split = self.hf_splits_mapping[fold]
        path = self._path(opt)
        self.data_path = path
        opt['datafile'] = path

        self.id = "huggingface"
        super().__init__(opt, shared)
Beispiel #9
0
 def __init__(self, opt: Opt, shared=None):
     """Set up the MSR-E2E teacher; only the primary copy warns and builds."""
     fold = DatatypeHelper.fold(opt["datatype"])
     self.fold = fold
     opt["datafile"] = fold
     self.dpath = os.path.join(opt["datapath"], "msr_e2e")
     # Shared copies reuse the primary instance's build work.
     if shared is None:
         warn_once(
             "MsrE2E is a beta dataset, and format may significantly change."
         )
         build_.build(opt)
     super().__init__(opt, shared)
Beispiel #10
0
 def __init__(self, opt: Opt, agents=None, shared=None, default_world=None):
     """
     Build one sub-world per comma-separated task in ``opt['task']``.

     :param opt: global options; ``opt['task']`` lists the tasks to combine
     :param agents: pre-created agents reused by each sub-world
     :param shared: shared state from another copy of this world, if any
     :param default_world: fallback world class for tasks that do not
         specify one
     :raises AssertionError: if two sub-worlds share a teacher id, which
         would intermingle their metrics
     """
     super().__init__(opt)
     self.worlds: List[World] = []
     for index, k in enumerate(opt['task'].split(',')):
         k = k.strip()
         if k:
             if shared:
                 # Create worlds based on shared data.
                 s = shared['worlds'][index]
                 self.worlds.append(s['world_class'](s['opt'], None, s))
             else:
                 # Agents are already specified.
                 opt_singletask = copy.deepcopy(opt)
                 opt_singletask['task'] = k
                 self.worlds.append(
                     create_task_world(
                         opt_singletask, agents, default_world=default_world
                     )
                 )
     self.world_idx = -1
     self.new_world = True
     self.parleys = -1
     # Check to see if we are training
     self.is_training = DatatypeHelper.is_training(opt.get('datatype'))
     # Make multi-task task probabilities.
     self.cum_task_weights = [1] * len(self.worlds)
     self.task_choices = range(len(self.worlds))
     weights = self.opt.get('multitask_weights', [1])
     if weights == 'stochastic':
         weights = [w.num_episodes() for w in self.worlds]
     # Accumulate cumulative weights for sampling. (Renamed from `sum`,
     # which shadowed the builtin of the same name.)
     running_total = 0
     for i in self.task_choices:
         # Tasks beyond the provided weight list default to weight 1.
         weight = weights[i] if len(weights) > i else 1
         running_total += weight
         self.cum_task_weights[i] = running_total
     task_ids: Dict[str, Teacher] = {}
     # Having overlap in teacher ids will cause issues for metrics aggregation.
     for each_world in self.worlds:
         world_id = each_world.getID()
         if world_id in task_ids:
             raise AssertionError(
                 '{} and {} teachers have overlap in id {}.'.format(
                     task_ids[world_id],
                     each_world.get_agents()[0].__class__,
                     world_id,
                 )
             )
         else:
             task_ids[world_id] = each_world.get_task_agent()
Beispiel #11
0
 def __init__(self, opt, shared=None):
     """Prepare the Natural Questions teacher; test datatype falls back to valid."""
     build(opt)
     # Feature flags controlling how examples are rendered.
     for flag in ('use_html', 'use_long_answer', 'use_context'):
         setattr(self, flag, opt.get(flag, False))
     self.id = 'natural_questions'
     self.opt = copy.deepcopy(opt)
     self.dtype = DatatypeHelper.fold(self.opt['datatype'])
     if self.dtype == 'test':
         logging.error(
             "No test split for this teacher; overriding to valid")
         self.dtype = 'valid'
     self.dpath = os.path.join(self.opt['datapath'], DATASET_NAME_LOCAL,
                               self.dtype)
     self.n_samples = None
     super().__init__(self.opt, shared)
Beispiel #12
0
 def __init__(self, opt: Opt, shared=None):
     """Initialize episode bookkeeping, honoring batch index and distributed rank."""
     if not hasattr(self, "fold"):
         self.fold = DatatypeHelper.fold(opt["datatype"])
     super().__init__(opt, shared)
     self.epochDone = False
     self.batchsize = opt.get("batchsize", 1)
     # Optionally cap how many episodes this teacher serves.
     self.max_episodes = len(self.episodes)
     requested = opt.get("num_episodes", 0)
     if requested > 0:
         self.max_episodes = min(self.max_episodes, requested)
     self.episode_idx = opt.get("batchindex", 0)
     self._setup_next_episode()
     self.round_idx = 0  # for some downstream utt + sysUttAndApiCallAgents.
     # Under distributed training, each worker takes a contiguous chunk.
     if is_distributed():  # cause gotta manually handle
         rank = get_rank()
         chunk_size = ceil(self.max_episodes / num_workers())
         self.episode_idx += rank * chunk_size
         self.max_episodes = min(self.max_episodes, (rank + 1) * chunk_size)
Beispiel #13
0
def create_task(opt: Opt, user_agents, default_world=None):
    """
    Create a world + task_agents (aka a task).

    Assuming ``opt['task']="task_dir:teacher_class:options"`` e.g. ``"babi:Task1k:1"``
    or ``"#babi-1k"`` or ``"#QA"``, see ``parlai/tasks/tasks.py`` and see
    ``parlai/tasks/task_list.py`` for list of tasks.

    :param opt: options; ``opt['task']`` selects the task(s)
    :param user_agents: a single agent or list of agents to place in the world
    :param default_world: world class to use when the task does not specify one
    :raises RuntimeError: if no task is specified in ``opt``
    """
    task = opt.get('task')
    if not task:
        raise RuntimeError('No task specified. Please select a task with ' +
                           '--task {task_name}.')
    # isinstance (not a type() comparison) so list subclasses are accepted too.
    if not isinstance(user_agents, list):
        user_agents = [user_agents]

    # Convert any hashtag task labels to task directory path names.
    # (e.g. "#QA" to the list of tasks that are QA tasks).
    opt = copy.deepcopy(opt)
    opt['task'] = ids_to_tasks(opt['task'])
    logging.info(f"creating task(s): {opt['task']}")

    if ',' not in opt['task']:
        # Single task
        world = create_task_world(opt,
                                  user_agents,
                                  default_world=default_world)
    else:
        # Multitask teacher/agent
        # TODO: remove and replace with multiteachers only?
        world = MultiWorld(opt, user_agents, default_world=default_world)

    if DatatypeHelper.is_training(
            opt['datatype']) and opt.get('num_workers', 0) > 0:
        # note that we never use Background preprocessing in the valid/test
        # worlds, as we are unable to call Teacher.observe(model_act) in BG
        # preprocessing, so we are unable to compute Metrics or accurately
        # differentiate MultiWorld stats.
        world = BackgroundDriverWorld(opt, world)
    elif opt.get('batchsize', 1) > 1 and opt.get('dynamic_batching'):
        world = DynamicBatchWorld(opt, world)
    elif opt.get('batchsize', 1) > 1:
        # otherwise check if should use batchworld
        world = BatchWorld(opt, world)

    return world
Beispiel #14
0
 def __init__(self, opt, shared=None):
     """Point the teacher at the bundled ConvAI2 YAML fixture for this fold."""
     opt = opt.copy()
     fold = DatatypeHelper.fold(opt['datatype'])
     here = os.path.dirname(__file__)
     opt['datafile'] = os.path.join(here, f'test/convai2_{fold}.yml')
     super().__init__(opt, shared)
Beispiel #15
0
 def __init__(self, opt, shared=None):
     """Build the dataset, then point at the fold-specific data file."""
     opt = copy.deepcopy(opt)
     fold = DatatypeHelper.fold(opt['datatype'])
     self.fold = fold
     build(opt)
     opt['datafile'] = _path(opt, fold + '.txt')
     super().__init__(opt, shared)
Beispiel #16
0
 def __init__(self, opt, shared=None):
     """Configure the shared data/candidates file and record the fold."""
     opt = copy.deepcopy(opt)
     datafile = _path(opt, '')
     # Candidates are read from the same file as the data itself.
     opt['datafile'] = datafile
     opt['cands_datafile'] = datafile
     self.fold = DatatypeHelper.fold(opt['datatype'])
     super().__init__(opt, shared)
Beispiel #17
0
 def __init__(self, opt: Opt, shared=None):
     """Run the MultiDoGO build step and initialize the base teacher."""
     fold = DatatypeHelper.fold(opt["datatype"])
     self.fold = fold
     self.dpath = os.path.join(opt["datapath"], "multidogo")
     opt["datafile"] = fold
     build_.build(opt)
     super().__init__(opt, shared)