def __init__(self, opt: Opt, agents=None, shared=None, default_world=None):
    super().__init__(opt)
    self.worlds: List[World] = []
    for index, k in enumerate(opt['task'].split(',')):
        k = k.strip()
        if k:
            if shared:
                # Create worlds based on shared data.
                s = shared['worlds'][index]
                self.worlds.append(s['world_class'](s['opt'], None, s))
            else:
                # Agents are already specified.
                opt_singletask = copy.deepcopy(opt)
                opt_singletask['task'] = k
                self.worlds.append(
                    create_task_world(
                        opt_singletask, agents, default_world=default_world
                    )
                )
    self.world_idx = -1
    self.new_world = True
    self.parleys = -1
    # Check to see if we are training
    self.is_training = DatatypeHelper.is_training(opt.get('datatype'))
    # Check to see if we should shuffle
    self.should_shuffle = DatatypeHelper.should_shuffle(opt.get('datatype'))
    # Make multi-task task probabilities.
    self.cum_task_weights = [1] * len(self.worlds)
    self.task_choices = range(len(self.worlds))
    weights = self.opt.get('multitask_weights', [1])
    # Warn about multi-task weights being ignored if we are in a datatype
    # that doesn't involve shuffling.
    if weights != [1] and not self.should_shuffle:
        warn_once(
            f"WARNING: multitask weights are ignored for datatype "
            f"{opt.get('datatype')} as we iterate through tasks in a round robin"
        )
    if weights == 'stochastic':
        weights = [w.num_episodes() for w in self.worlds]
    weight_sum = 0
    for i in self.task_choices:
        if len(weights) > i:
            weight = weights[i]
        else:
            weight = 1
        self.cum_task_weights[i] = weight + weight_sum
        weight_sum += weight
    task_ids: Dict[str, Teacher] = {}
    # Having overlap in teacher ids will cause issues for metrics aggregation.
    for each_world in self.worlds:
        world_id = each_world.getID()
        if world_id in task_ids:
            world_class = each_world.get_agents()[0].__class__
            error_once(
                f"{task_ids[world_id]} and {world_class} teachers have overlap "
                f"in id '{world_id}'. This will cause their metrics to be "
                "intermingled. Change the id attribute of one to remove this "
                "message."
            )
        else:
            task_ids[world_id] = each_world.get_task_agent()
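# A minimal standalone sketch of how cumulative weights like the
# `cum_task_weights` built above are consumed: they are exactly the
# `cum_weights` format that the standard library's random.choices expects.
# The three per-task weights here (1, 2, 7) are made up for illustration.
import random

cum_task_weights = [1, 3, 10]  # per-task weights 1, 2, 7, accumulated
task_choices = range(len(cum_task_weights))
picked = random.choices(task_choices, cum_weights=cum_task_weights, k=1)[0]
print(picked)  # index 0 on ~10% of draws, index 1 on ~20%, index 2 on ~70%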
def _path(self, opt):
    fold = DatatypeHelper.fold(opt["datatype"])
    if fold == "train" or fold == "valid":
        folder = os.path.join(opt["datapath"], "metalwoz", "train")
    else:
        folder = os.path.join(opt["datapath"], "metalwoz", "test")
    return folder, fold
def __init__(self, opt: Opt, shared=None):
    self.fold = DatatypeHelper.fold(opt["datatype"])
    opt["datafile"] = self.fold
    self.dpath = os.path.join(opt["datapath"], "multiwoz_v22")
    build_.build(opt)
    self.last_call = {}
    super().__init__(opt, shared)
def _load_data(self, fold, domains):
    chunks = []
    for section in domains:
        domain = []
        with PathManager.open(
            os.path.join(self.dpath, section + "_all.tsv")
        ) as f:
            reader = csv.reader(f, delimiter="\t")
            next(reader)  # skip the header row
            lines = list(reader)
        episode = []
        prev_idx = 0
        for line in lines:
            data = {}
            data["id"] = line[0]
            data["speaker"] = line[3]
            data["text"] = line[4]
            data["dialogue_acts"] = line[5:]
            data["domain"] = section
            if prev_idx != data["id"]:
                # Only flush non-empty episodes; without this guard the
                # very first row would append an empty list, since prev_idx
                # starts at 0 and never matches a conversation id.
                if episode:
                    domain.append(episode)
                episode = []
                prev_idx = data["id"]
            episode.append(data)
        domain.append(episode)
        chunks.append(domain)
    # deterministic shuffle data for splits
    return DatatypeHelper.split_subset_data_by_fold(fold, chunks, 0.8, 0.1, 0.1)
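# For context on the final call above: DatatypeHelper.split_subset_data_by_fold
# deterministically shuffles each subset (here, each domain) before slicing out
# the requested fold, so every fold sees data from every domain. A toy
# illustration, assuming a ParlAI install; exact counts depend on how the
# helper rounds the 0.8/0.1/0.1 fractions.
from parlai.utils.data import DatatypeHelper

subsets = [[f"a{i}" for i in range(10)], [f"b{i}" for i in range(10)]]
train = DatatypeHelper.split_subset_data_by_fold("train", subsets, 0.8, 0.1, 0.1)
valid = DatatypeHelper.split_subset_data_by_fold("valid", subsets, 0.8, 0.1, 0.1)
print(len(train), len(valid))  # roughly 16 and 2 items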
def load_data(self, datapath):
    folder, fold = os.path.split(datapath)
    with PathManager.open(os.path.join(folder, "tasks.txt")) as taskf:
        tasks_table = pd.read_json(taskf, lines=True)
    dfolder = os.path.join(folder, "dialogues")
    data = []
    for filename in PathManager.ls(dfolder):
        domain = filename.replace(".txt", "")
        if (
            self.opt["metalwoz_domains"]
            and domain not in self.opt["metalwoz_domains"]
        ):
            continue
        fullfn = os.path.join(dfolder, filename)
        with PathManager.open(fullfn) as dataf:
            lines = pd.read_json(dataf, lines=True)
            lines = lines.merge(tasks_table, on="task_id")
            data.append(lines.to_dict("records"))
    # Quick check to make sure we didn't fat-finger the spelling of some domain
    if self.opt["metalwoz_domains"]:
        assert len(data) == len(self.opt["metalwoz_domains"])
    if "test" in self.fold:
        flat = []
        for domain in data:
            flat.extend(domain)
        return flat
    return DatatypeHelper.split_subset_data_by_fold(self.fold, data, 0.8, 0.1, 0.1)
def get(self, episode_idx=0, entry_idx=0):
    field = (
        'labels'
        if DatatypeHelper.is_training(self.opt['datatype'])
        else 'eval_labels'
    )
    return Message({'text': '1 2 3 4', field: ['1 2 3 4'], 'episode_done': True})
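# For context: DatatypeHelper.is_training is what flips the teacher above
# between 'labels' (training folds) and 'eval_labels' (valid/test), matching
# ParlAI's convention of keeping evaluation labels in a separate field.
# A quick check, assuming a ParlAI install:
from parlai.utils.data import DatatypeHelper

assert DatatypeHelper.is_training('train')
assert not DatatypeHelper.is_training('valid')
assert not DatatypeHelper.is_training('test')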
def __init__(self, opt, shared=None):
    self.fold = DatatypeHelper.fold(opt['datatype'])
    self.hf_split = self.hf_splits_mapping[self.fold]
    self.data_path = self._path(opt)
    opt['datafile'] = self.data_path
    self.id = "huggingface"
    super().__init__(opt, shared)
def __init__(self, opt: Opt, shared=None):
    self.fold = DatatypeHelper.fold(opt["datatype"])
    opt["datafile"] = self.fold
    self.dpath = os.path.join(opt["datapath"], "msr_e2e")
    if shared is None:
        warn_once(
            "MsrE2E is a beta dataset, and format may significantly change."
        )
        build_.build(opt)
    super().__init__(opt, shared)
def __init__(self, opt: Opt, agents=None, shared=None, default_world=None):
    super().__init__(opt)
    self.worlds: List[World] = []
    for index, k in enumerate(opt['task'].split(',')):
        k = k.strip()
        if k:
            if shared:
                # Create worlds based on shared data.
                s = shared['worlds'][index]
                self.worlds.append(s['world_class'](s['opt'], None, s))
            else:
                # Agents are already specified.
                opt_singletask = copy.deepcopy(opt)
                opt_singletask['task'] = k
                self.worlds.append(
                    create_task_world(
                        opt_singletask, agents, default_world=default_world
                    )
                )
    self.world_idx = -1
    self.new_world = True
    self.parleys = -1
    # Check to see if we are training
    self.is_training = DatatypeHelper.is_training(opt.get('datatype'))
    # Make multi-task task probabilities.
    self.cum_task_weights = [1] * len(self.worlds)
    self.task_choices = range(len(self.worlds))
    weights = self.opt.get('multitask_weights', [1])
    if weights == 'stochastic':
        weights = [w.num_episodes() for w in self.worlds]
    weight_sum = 0
    for i in self.task_choices:
        if len(weights) > i:
            weight = weights[i]
        else:
            weight = 1
        self.cum_task_weights[i] = weight + weight_sum
        weight_sum += weight
    task_ids: Dict[str, Teacher] = {}
    # Having overlap in teacher ids will cause issues for metrics aggregation.
    for each_world in self.worlds:
        world_id = each_world.getID()
        if world_id in task_ids:
            raise AssertionError(
                '{} and {} teachers have overlap in id {}.'.format(
                    task_ids[world_id],
                    each_world.get_agents()[0].__class__,
                    world_id,
                )
            )
        else:
            task_ids[world_id] = each_world.get_task_agent()
def __init__(self, opt, shared=None):
    build(opt)
    self.use_html = opt.get('use_html', False)
    self.use_long_answer = opt.get('use_long_answer', False)
    self.use_context = opt.get('use_context', False)
    self.id = 'natural_questions'
    self.opt = copy.deepcopy(opt)
    self.dtype = DatatypeHelper.fold(self.opt['datatype'])
    if self.dtype == 'test':
        logging.error("No test split for this teacher; overriding to valid")
        self.dtype = 'valid'
    self.dpath = os.path.join(
        self.opt['datapath'], DATASET_NAME_LOCAL, self.dtype
    )
    self.n_samples = None
    super().__init__(self.opt, shared)
def __init__(self, opt: Opt, shared=None):
    if not hasattr(self, "fold"):
        self.fold = DatatypeHelper.fold(opt["datatype"])
    super().__init__(opt, shared)
    self.epochDone = False
    self.batchsize = opt.get("batchsize", 1)
    self.max_episodes = len(self.episodes)
    if opt.get("num_episodes", 0) > 0:
        self.max_episodes = min(self.max_episodes, opt.get("num_episodes"))
    self.episode_idx = opt.get("batchindex", 0)
    self._setup_next_episode()
    self.round_idx = 0  # for some downstream utt + sysUttAndApiCallAgents.
    if is_distributed():
        # Distributed training requires manually sharding the episodes
        # across workers: each rank gets a contiguous chunk.
        rank = get_rank()
        chunk_size = ceil(self.max_episodes / num_workers())
        self.episode_idx += rank * chunk_size
        self.max_episodes = min(self.max_episodes, (rank + 1) * chunk_size)
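# A worked example of the sharding arithmetic above, with made-up numbers:
# 10 episodes across 4 distributed workers gives chunk_size = ceil(10/4) = 3,
# so ranks 0-3 cover episodes [0,3), [3,6), [6,9), and [9,10); the min()
# keeps the last rank from reading past the end of the data.
from math import ceil

max_episodes, workers = 10, 4
chunk_size = ceil(max_episodes / workers)
for rank in range(workers):
    start = rank * chunk_size
    stop = min(max_episodes, (rank + 1) * chunk_size)
    print(rank, start, stop)  # (0, 0, 3) (1, 3, 6) (2, 6, 9) (3, 9, 10)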
def create_task(opt: Opt, user_agents, default_world=None):
    """
    Create a world + task_agents (aka a task).

    Assuming ``opt['task']="task_dir:teacher_class:options"`` e.g.
    ``"babi:Task1k:1"`` or ``"#babi-1k"`` or ``"#QA"``, see
    ``parlai/tasks/tasks.py`` and see ``parlai/tasks/task_list.py`` for a
    list of tasks.
    """
    task = opt.get('task')
    if not task:
        raise RuntimeError(
            'No task specified. Please select a task with --task {task_name}.'
        )
    if not isinstance(user_agents, list):
        user_agents = [user_agents]

    # Convert any hashtag task labels to task directory path names.
    # (e.g. "#QA" to the list of tasks that are QA tasks).
    opt = copy.deepcopy(opt)
    opt['task'] = ids_to_tasks(opt['task'])
    logging.info(f"creating task(s): {opt['task']}")

    if ',' not in opt['task']:
        # Single task
        world = create_task_world(opt, user_agents, default_world=default_world)
    else:
        # Multitask teacher/agent
        # TODO: remove and replace with multiteachers only?
        world = MultiWorld(opt, user_agents, default_world=default_world)

    if DatatypeHelper.is_training(opt['datatype']) and opt.get('num_workers', 0) > 0:
        # note that we never use Background preprocessing in the valid/test
        # worlds, as we are unable to call Teacher.observe(model_act) in BG
        # preprocessing, so we are unable to compute Metrics or accurately
        # differentiate MultiWorld stats.
        world = BackgroundDriverWorld(opt, world)
    elif opt.get('batchsize', 1) > 1 and opt.get('dynamic_batching'):
        world = DynamicBatchWorld(opt, world)
    elif opt.get('batchsize', 1) > 1:
        # otherwise check if should use batchworld
        world = BatchWorld(opt, world)

    return world
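# A hedged usage sketch for create_task: build opt with the command-line
# parser, create a model agent, and let create_task pick the right world
# wrapper. 'babi:Task1k:1' and the 'repeat_label' baseline are standard
# ParlAI names, used here purely as an example.
from parlai.core.params import ParlaiParser
from parlai.core.agents import create_agent

opt = ParlaiParser(True, True).parse_args(
    ['--task', 'babi:Task1k:1', '--model', 'repeat_label']
)
agent = create_agent(opt)
world = create_task(opt, agent)  # may wrap in Batch/DynamicBatch world
world.parley()
print(world.display())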
def __init__(self, opt, shared=None):
    opt = opt.copy()
    fold = DatatypeHelper.fold(opt['datatype'])
    opt['datafile'] = os.path.join(
        os.path.dirname(__file__), f'test/convai2_{fold}.yml'
    )
    super().__init__(opt, shared)
def __init__(self, opt, shared=None):
    opt = copy.deepcopy(opt)
    self.fold = DatatypeHelper.fold(opt['datatype'])
    build(opt)
    opt['datafile'] = _path(opt, self.fold + '.txt')
    super().__init__(opt, shared)
def __init__(self, opt, shared=None):
    opt = copy.deepcopy(opt)
    opt['datafile'] = _path(opt, '')
    opt['cands_datafile'] = opt['datafile']
    self.fold = DatatypeHelper.fold(opt['datatype'])
    super().__init__(opt, shared)
def __init__(self, opt: Opt, shared=None):
    self.fold = DatatypeHelper.fold(opt["datatype"])
    self.dpath = os.path.join(opt["datapath"], "multidogo")
    opt["datafile"] = self.fold
    build_.build(opt)
    super().__init__(opt, shared)