def test_shuffle(self):
    """Simple test to ensure that the dataloader is initialized with the
    correct data sampler.
    """
    dts = ['train', 'valid', 'test']
    exts = ['', ':stream', ':ordered', ':stream:ordered']
    shuffle_opts = [False, True]
    task = 'babi:task1k:1'
    for dt in dts:
        for ext in exts:
            datatype = dt + ext
            for shuffle in shuffle_opts:
                opt_defaults = {
                    'pytorch_teacher_task': task,
                    'datatype': datatype,
                    'shuffle': shuffle,
                }
                with testing_utils.capture_output():
                    parser = display_setup_args()
                    parser.set_defaults(**opt_defaults)
                    opt = parser.parse_args()
                    teacher = create_task_agent_from_taskname(opt)[0]
                if ('ordered' in datatype
                        or ('stream' in datatype and not opt.get('shuffle'))
                        or 'train' not in datatype):
                    self.assertIsInstance(
                        teacher.pytorch_dataloader.sampler,
                        Sequential,
                        'PytorchDataTeacher failed with args: {}'.format(opt))
                else:
                    self.assertIsInstance(
                        teacher.pytorch_dataloader.sampler,
                        RandomSampler,
                        'PytorchDataTeacher failed with args: {}'.format(opt))
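# For reference, the sampler the assertions above expect for each
# configuration. This table is derived from the test's own condition, not
# from separate documentation:
#   any ':ordered' datatype                       -> Sequential
#   any ':stream' datatype with shuffle=False     -> Sequential
#   any valid/test datatype                       -> Sequential
#   remaining train datatypes                     -> RandomSampler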
def interactive_rank(opt, print_parser=None):
    # Create model and assign it to the specified task
    human = create_agent(opt)
    task = create_task_agent_from_taskname(opt)[0]
    metrics = Metrics(opt)
    episodes = 0

    def print_metrics():
        report = metrics.report()
        report['episodes'] = episodes
        print(report)

    # Show some example dialogs:
    try:
        while not task.epoch_done():
            msg = task.act()
            print('[{id}]: {text}'.format(id=task.getID(), text=msg.get('text', '')))
            cands = list(msg.get('label_candidates', []))
            random.shuffle(cands)
            for i, c in enumerate(cands):
                print(' [{i}]: {c}'.format(i=i, c=c))
            print('[ Please choose a response from the list. ]')
            choice = None
            while choice is None:
                choice = human.act().get('text')
                try:
                    choice = int(choice)
                    if 0 <= choice < len(cands):
                        choice = cands[choice]
                    else:
                        print(
                            '[ Try again: you selected {i} but the '
                            'candidates are indexed from 0 to {j}. ]'
                            ''.format(i=choice, j=len(cands) - 1)
                        )
                        choice = None
                except (TypeError, ValueError):
                    print('[ Try again: you did not enter a valid index. ]')
                    choice = None
            print('[ You chose ]: {}'.format(choice))
            reply = {'text_candidates': [choice]}
            labels = msg.get('eval_labels', msg.get('labels'))
            metrics.update(reply, labels)
            if msg.get('episode_done'):
                episodes += 1
            print_metrics()
            print('------------------------------')
            print('[ True reply ]: {}'.format(labels[0]))
            if msg.get('episode_done'):
                print('******************************')
    except KeyboardInterrupt:
        pass

    print()
    print_metrics()
def test_pytd_teacher(self):
    """
    Test that the pytorch teacher works with given Pytorch Datasets as well.

    Uses the Flickr30k dataset to ensure that the observations are the same.
    """
    defaults = parser_defaults.copy()
    defaults['datatype'] = 'train:stream'
    defaults['image_mode'] = 'ascii'

    f = io.StringIO()
    with redirect_stdout(f):
        # Get processed act from agent
        parser = display_setup_args()
        defaults['pytorch_teacher_dataset'] = 'flickr30k'
        del defaults['pytorch_teacher_task']
        parser.set_defaults(**defaults)
        opt = parser.parse_args()
        teacher = create_task_agent_from_taskname(opt)[0]
        pytorch_teacher_act = teacher.act()

        parser = display_setup_args()
        defaults['task'] = 'flickr30k'
        del defaults['pytorch_teacher_dataset']
        parser.set_defaults(**defaults)
        opt = parser.parse_args()
        teacher = create_task_agent_from_taskname(opt)[0]
        regular_teacher_act = teacher.act()

    keys = set(pytorch_teacher_act.keys()).intersection(
        set(regular_teacher_act.keys()))
    self.assertTrue(len(keys) != 0)
    for key in keys:
        self.assertTrue(
            pytorch_teacher_act[key] == regular_teacher_act[key],
            'PytorchDataTeacher does not have the same value '
            'as regular teacher for act key: {}'.format(key))
    print('\n------Passed `test_pytd_teacher`------\n')
def get_teacher_act(defaults, teacher_processed=False, agent_to=None):
    parser = train_setup_args()
    parser.set_defaults(**defaults)
    opt = parser.parse_args()
    build_dict(opt)
    with testing_utils.capture_output():
        teacher = create_task_agent_from_taskname(opt)[0]
        agent = create_agent(opt)
        act = teacher.act()
    if teacher_processed:
        return act, agent
    return agent.observe(act), agent
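# A hedged usage sketch for get_teacher_act. In the test suite the defaults
# dict comes from shared parser-default dicts; the keys below are illustrative
# stand-ins ('integration_tests' and 'repeat_label' are assumed to exist as a
# task and model in this codebase).
defaults = {
    'task': 'integration_tests',
    'model': 'repeat_label',
    'datatype': 'train',
}
# Act as seen by the agent after observe() pre-processing:
processed_act, agent = get_teacher_act(defaults)
# Raw act exactly as the teacher emitted it:
raw_act, agent = get_teacher_act(defaults, teacher_processed=True)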
def test_pytd_teacher(self):
    """
    Test that the pytorch teacher works with given Pytorch Datasets as well.
    """
    defaults = integration_test_parser_defaults.copy()
    defaults['datatype'] = 'train:stream'
    defaults['image_mode'] = 'ascii'

    with testing_utils.capture_output():
        # Get processed act from agent
        parser = display_setup_args()
        defaults['pytorch_teacher_dataset'] = 'integration_tests'
        del defaults['pytorch_teacher_task']
        parser.set_defaults(**defaults)
        opt = parser.parse_args()
        teacher = create_task_agent_from_taskname(opt)[0]
        pytorch_teacher_act = teacher.act()

        parser = display_setup_args()
        defaults['task'] = 'integration_tests'
        del defaults['pytorch_teacher_dataset']
        parser.set_defaults(**defaults)
        opt = parser.parse_args()
        teacher = create_task_agent_from_taskname(opt)[0]
        regular_teacher_act = teacher.act()

    keys = set(pytorch_teacher_act.keys()).intersection(
        set(regular_teacher_act.keys()))
    self.assertTrue(len(keys) != 0)
    for key in keys:
        self.assertTrue(
            pytorch_teacher_act[key] == regular_teacher_act[key],
            'PytorchDataTeacher does not have the same value '
            'as regular teacher for act key: {}. '
            'Values: {}; {}'.format(
                key, pytorch_teacher_act[key], regular_teacher_act[key]
            ),
        )
def setup_title_to_passage(opt):
    print('[ Setting up Title to Passage Dict ]')
    saved_dp = os.path.join(os.getcwd() + '/data/', 'title_to_passage.pkl')
    if os.path.exists(saved_dp):
        print('[ Loading from saved location, {} ]'.format(saved_dp))
        with open(saved_dp, 'rb') as f:
            title_to_passage = pickle.load(f)
        return title_to_passage

    topics_path = '{}/personas_with_wiki_links.txt'.format(os.getcwd())
    topics = []
    with open(topics_path) as f:
        text = f.read()
    personas = text.split('\n\n')
    for persona in personas:
        persona = persona.split('\n')
        for i in range(1, len(persona)):
            p_i = persona[i]
            if 'https' in p_i:
                topic = unquote(p_i[p_i.rfind('/') + 1:]).replace('_', ' ')
                topics.append(topic)

    ordered_opt = opt.copy()
    ordered_opt['datatype'] = 'train:ordered:stream'
    ordered_opt['batchsize'] = 1
    ordered_opt['numthreads'] = 1
    ordered_opt['task'] = 'wikipedia:full:key-value'
    teacher = create_task_agent_from_taskname(ordered_opt)[0]

    title_to_passage = {}
    i = 0
    length = teacher.num_episodes()
    pbar = tqdm.tqdm(total=length)
    while not teacher.epoch_done():
        pbar.update(1)
        i += 1
        action = teacher.act()
        title = action['text']
        if title in topics:
            text = action['labels'][0]
            title_to_passage[title] = text
    pbar.close()
    print('[ Finished building Title to Passage dict; saving now ]')
    with open(saved_dp, 'wb') as f:
        pickle.dump(title_to_passage, f)
    return title_to_passage
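# For reference, a hedged sketch of the file shape the parsing above expects:
# persona blocks separated by blank lines, where the first line of each block
# is skipped and any later line containing an https wiki link contributes a
# topic. The contents below are illustrative, not from the real file.
from urllib.parse import unquote

example = (
    "persona 1\n"
    "i am a big fan of https://en.wikipedia.org/wiki/Star_Wars\n"
    "i like pizza\n"
    "\n"
    "persona 2\n"
    "my favorite band is https://en.wikipedia.org/wiki/The_Beatles"
)
topics = []
for persona in example.split('\n\n'):
    lines = persona.split('\n')
    for p_i in lines[1:]:
        if 'https' in p_i:
            topics.append(unquote(p_i[p_i.rfind('/') + 1:]).replace('_', ' '))
assert topics == ['Star Wars', 'The Beatles']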
def _create_task_agents(opt: Opt):
    """
    Create task agent(s) for the given task name.

    It does this by calling the create_agents function in agents.py of the
    given task. If the create_agents function does not exist, it just looks
    for the teacher (agent) class defined by the task name directly. (This
    saves the task creator bothering to define the create_agents function
    when it is not needed.)
    """
    my_module = load_task_module(opt['task'])
    try:
        # Tries to call the create_agents function in agents.py
        task_agents = my_module.create_agents(opt)  # type: ignore
    except AttributeError:
        # create_agents not found, so try to create the teacher directly.
        return create_task_agent_from_taskname(opt)
    if not isinstance(task_agents, list):
        task_agents = [task_agents]
    return task_agents
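# A minimal sketch of the contract _create_task_agents relies on, written as
# a hypothetical parlai/tasks/my_task/agents.py. The task name and teacher
# body are illustrative; only the optional create_agents hook and the
# DefaultTeacher fallback reflect the lookup logic above.
from parlai.core.teachers import Teacher


class DefaultTeacher(Teacher):
    """Used by the fallback path when agents.py defines no create_agents."""

    def act(self):
        return {'text': 'hello', 'labels': ['world'], 'episode_done': True}


def create_agents(opt):
    # Optional hook: may return a single agent; _create_task_agents wraps
    # non-list results in a list before returning them.
    return DefaultTeacher(opt)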
def test_shuffle(self):
    """Simple test to ensure that the dataloader is initialized with the
    correct data sampler.
    """
    dts = ['train', 'valid', 'test']
    exts = ['', ':stream', ':ordered', ':stream:ordered']
    shuffle_opts = [False, True]
    task = 'babi:task1k:1'
    for dt in dts:
        for ext in exts:
            datatype = dt + ext
            for shuffle in shuffle_opts:
                opt_defaults = {
                    'pytorch_teacher_task': task,
                    'datatype': datatype,
                    'shuffle': shuffle,
                }
                print('Testing test_shuffle with args {}'.format(opt_defaults))
                f = io.StringIO()
                with redirect_stdout(f):
                    parser = display_setup_args()
                    parser.set_defaults(**opt_defaults)
                    opt = parser.parse_args()
                    teacher = create_task_agent_from_taskname(opt)[0]
                if ('ordered' in datatype
                        or ('stream' in datatype and not opt.get('shuffle'))
                        or 'train' not in datatype):
                    self.assertTrue(
                        type(teacher.pytorch_dataloader.sampler) is Sequential,
                        'PytorchDataTeacher failed with args: {}'.format(opt))
                else:
                    self.assertTrue(
                        type(teacher.pytorch_dataloader.sampler) is RandomSampler,
                        'PytorchDataTeacher failed with args: {}'.format(opt))
    print('\n------Passed `test_shuffle`------\n')
def _create_task_agents(opt):
    """
    Create task agent(s) for the given task name.

    It does this by calling the create_agents function in agents.py of the
    given task. If the create_agents function does not exist, it just looks
    for the teacher (agent) class defined by the task name directly. (This
    saves the task creator bothering to define the create_agents function
    when it is not needed.)
    """
    sp = opt['task'].strip()
    repo = 'parlai'
    if sp.startswith('internal:'):
        # To switch to the local repo, useful for non-public projects
        # (make a directory called 'parlai_internal' with your private agents)
        repo = 'parlai_internal'
        sp = sp[9:]
    sp = sp.split(':')
    if '.' in sp[0]:
        # The case of opt['task'] = 'parlai.tasks.squad.agents:DefaultTeacher'
        # (i.e. specifying your own path directly)
        module_name = sp[0]
    elif sp[0] == 'pytorch_teacher':
        module_name = 'parlai.core.pytorch_data_teacher'
    else:
        task = sp[0].lower()
        module_name = '%s.tasks.%s.agents' % (repo, task)
    my_module = importlib.import_module(module_name)
    try:
        # Tries to call the create_agents function in agents.py
        task = sp[0].lower()
        task_agents = my_module.create_agents(opt, task)
    except AttributeError:
        # create_agents not found, so try to create the teacher directly.
        return create_task_agent_from_taskname(opt)
    if not isinstance(task_agents, list):
        task_agents = [task_agents]
    return task_agents
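# For reference, a self-contained restatement of the module-name resolution
# above. The helper name resolve_module_name is illustrative; the mapping
# itself is taken directly from the branches of _create_task_agents.
def resolve_module_name(task):
    sp = task.strip()
    repo = 'parlai'
    if sp.startswith('internal:'):
        repo = 'parlai_internal'
        sp = sp[len('internal:'):]
    first = sp.split(':')[0]
    if '.' in first:
        return first
    if first == 'pytorch_teacher':
        return 'parlai.core.pytorch_data_teacher'
    return '%s.tasks.%s.agents' % (repo, first.lower())


assert resolve_module_name('babi:task1k:1') == 'parlai.tasks.babi.agents'
assert resolve_module_name('internal:my_task') == 'parlai_internal.tasks.my_task.agents'
assert resolve_module_name('parlai.tasks.squad.agents:DefaultTeacher') == \
    'parlai.tasks.squad.agents'
assert resolve_module_name('pytorch_teacher') == 'parlai.core.pytorch_data_teacher'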
def store_contents(opt, task, save_path, context_length=-1, include_labels=True):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        task: ParlAI task of text (and possibly values) to store.
        save_path: Path to output sqlite db.
        context_length: How many past utterances to keep as context for each
            stored document (-1, the default, keeps the full history).
        include_labels: Whether chosen labels are appended to the context.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute('CREATE TABLE documents (id INTEGER PRIMARY KEY, text, value);')

    if not task:
        logger.info('No data to initialize table: just creating table.')
        logger.info('Add more data by passing observations to the agent.')
        logger.info('Committing...')
        conn.commit()
        conn.close()
        return

    ordered_opt = opt.copy()
    dt = opt.get('datatype', '').split(':')
    ordered_opt['datatype'] = ':'.join([dt[0], 'ordered'] + dt[1:])
    ordered_opt['batchsize'] = 1
    ordered_opt['numthreads'] = 1
    ordered_opt['task'] = task
    teacher = create_task_agent_from_taskname(ordered_opt)[0]

    episode_done = False
    current = []
    triples = []
    context_length = context_length if context_length >= 0 else None
    context = deque(maxlen=context_length)
    with tqdm(total=teacher.num_episodes()) as pbar:
        while not teacher.epoch_done():
            # collect examples in episode
            while not episode_done:
                action = teacher.act()
                current.append(action)
                episode_done = action['episode_done']
            for ex in current:
                if 'text' in ex:
                    text = ex['text']
                    context.append(text)
                    if len(context) > 1:
                        text = '\n'.join(context)
                # add labels to context
                labels = ex.get('labels', ex.get('eval_labels'))
                if labels is not None:
                    label = random.choice(labels)
                    if include_labels:
                        context.append(label)
                    # use None for ID to auto-assign doc ids--we don't need to
                    # ever reverse-lookup them
                    triples.append((None, text, label))
            c.executemany('INSERT OR IGNORE INTO documents VALUES (?,?,?)', triples)
            pbar.update()
            # reset flags and content
            episode_done = False
            triples.clear()
            current.clear()
            context.clear()

    logger.info('Read %d examples from %d episodes.' %
                (teacher.num_examples(), teacher.num_episodes()))
    logger.info('Committing...')
    conn.commit()
    conn.close()
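# A hedged usage sketch: build a db from a task, then inspect it with the
# sqlite3 stdlib module. In practice opt comes from a ParlAI argument parser;
# the bare dict below only marks the one field store_contents itself reads
# from it, and 'squad' / the output path are illustrative.
import sqlite3

opt = {'datatype': 'train'}
store_contents(opt, task='squad', save_path='/tmp/squad.db')

conn = sqlite3.connect('/tmp/squad.db')
c = conn.cursor()
c.execute('SELECT COUNT(*) FROM documents')
print('stored rows:', c.fetchone()[0])
c.execute('SELECT text, value FROM documents LIMIT 1')
print('sample row:', c.fetchone())
conn.close()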
def create_agents(opt, task):
    if not opt.get('interactive_task', False):
        return create_task_agent_from_taskname(opt)
    else:
        # interactive task has no task agents (they are attached as user agents)
        return []