def vectorize(self, *args, **kwargs):
    """
    Override vectorize for generative models.
    """
    kwargs['add_start'] = True  # need start token for BART
    kwargs['add_end'] = True
    return TorchAgent.vectorize(self, *args, **kwargs)
def vectorize(self, *args, **kwargs):
    """
    Override vectorize for T5.

    T5 dict already adds the end token.
    """
    kwargs['add_start'] = False  # model does this in module code
    kwargs['add_end'] = False  # T5 tokenizer takes care of this
    return TorchAgent.vectorize(self, *args, **kwargs)
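# A minimal sketch of how an override like the two above might be wired into
# an agent subclass.  "BartLikeAgent" is a hypothetical name used only for
# illustration; the only pieces taken from the snippets above are
# TorchAgent.vectorize and its add_start/add_end keyword arguments.
from parlai.core.torch_agent import TorchAgent


class BartLikeAgent(TorchAgent):
    def vectorize(self, *args, **kwargs):
        # Generative seq2seq models want both special tokens on the label.
        kwargs['add_start'] = True
        kwargs['add_end'] = True
        return super().vectorize(*args, **kwargs)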
def test_vectorize(self):
    opt = {}
    opt['no_cuda'] = True
    opt['history_tokens'] = 10000
    opt['history_dialog'] = 10
    opt['history_replies'] = 'label_else_model'
    dict = MockDict()
    shared = {'opt': opt, 'dict': dict}
    agent = TorchAgent(opt, shared)

    observation = {}
    observation["text"] = "What does the dog do?"
    observation["labels"] = ["The dog jumps over the cat."]
    obs_vec = agent.vectorize(observation)
    self.assertTrue(
        'text_vec' in obs_vec,
        "Field 'text_vec' missing from vectorized observation")
    self.assertTrue(obs_vec['text_vec'].numpy().tolist() == [1, 3, 5],
                    "Vectorized text is incorrect.")
    self.assertTrue(
        'labels_vec' in obs_vec,
        "Field 'labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['labels_vec'][0].numpy().tolist() == [
            1, 3, 5, dict.END_IDX
        ], "Vectorized label is incorrect.")

    observation = {}
    observation["text"] = "What does the dog do?"
    observation["eval_labels"] = ["The dog jumps over the cat."]
    obs_vec = agent.vectorize(observation)
    self.assertTrue(
        'eval_labels_vec' in obs_vec,
        "Field 'eval_labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['eval_labels_vec'][0].numpy().tolist() == [
            1, 3, 5, dict.END_IDX
        ], "Vectorized label is incorrect.")
def test_vectorize(self):
    """
    The goal of this test is to make sure that the vectorize function is
    actually adding a new field.
    """
    try:
        from parlai.core.torch_agent import TorchAgent
    except ImportError as e:
        if 'pytorch' in e.msg:
            print('Skipping TestTorchAgent.test_vectorize, no pytorch.')
            return

    opt = {}
    opt['no_cuda'] = True
    opt['truncate'] = 10000
    opt['history_dialog'] = 10
    opt['history_replies'] = 'label_else_model'
    mdict = MockDict()
    shared = {'opt': opt, 'dict': mdict}
    agent = TorchAgent(opt, shared)

    observation = {}
    observation["text"] = "What does the dog do?"
    observation["labels"] = ["The dog jumps over the cat."]

    # add start and end
    obs_vec = agent.vectorize(observation, add_start=True, add_end=True)
    self.assertTrue(
        'text_vec' in obs_vec,
        "Field 'text_vec' missing from vectorized observation")
    self.assertTrue(obs_vec['text_vec'].numpy().tolist() == [7, 8, 9],
                    "Vectorized text is incorrect.")
    self.assertTrue(
        'labels_vec' in obs_vec,
        "Field 'labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], "Vectorized label is incorrect.")

    # no start, add end
    obs_vec = agent.vectorize(observation, add_start=False, add_end=True)
    self.assertTrue(
        obs_vec['labels_vec'].numpy().tolist() == [7, 8, 9, mdict.END_IDX],
        "Vectorized label is incorrect.")

    # add start, no end
    obs_vec = agent.vectorize(observation, add_start=True, add_end=False)
    self.assertTrue(
        obs_vec['labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9
        ], "Vectorized label is incorrect.")

    # no start, no end
    obs_vec = agent.vectorize(observation, add_start=False, add_end=False)
    self.assertTrue(obs_vec['labels_vec'].numpy().tolist() == [7, 8, 9],
                    "Vectorized label is incorrect.")

    observation = {}
    observation["text"] = "What does the dog do?"
    observation["eval_labels"] = ["The dog jumps over the cat."]

    # eval_labels
    obs_vec = agent.vectorize(observation)
    self.assertTrue(
        'eval_labels_vec' in obs_vec,
        "Field 'eval_labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['eval_labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], "Vectorized label is incorrect.")

    # truncate below the label length
    obs_vec = agent.vectorize(observation, truncate=3)
    self.assertTrue(
        'eval_labels_vec' in obs_vec,
        "Field 'eval_labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['eval_labels_vec'].numpy().tolist() == [
            8, 9, mdict.END_IDX
        ], "Vectorized label is incorrect.")

    # truncate above the label length (no truncation applied)
    obs_vec = agent.vectorize(observation, truncate=10)
    self.assertTrue(
        'eval_labels_vec' in obs_vec,
        "Field 'eval_labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['eval_labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], "Vectorized label is incorrect.")
def test_map_unmap(self):
    try:
        from parlai.core.torch_agent import TorchAgent, Output
    except ImportError as e:
        if 'pytorch' in e.msg:
            print('Skipping TestTorchAgent.test_map_unmap, no pytorch.')
            return

    observations = []
    observations.append({
        "text": "What is a painting?",
        "labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})
    observations.append({
        "text": "What is a painting?",
        "labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})

    opt = {}
    opt['no_cuda'] = True
    opt['truncate'] = 10000
    opt['history_dialog'] = 10
    opt['history_replies'] = 'label_else_model'
    mdict = MockDict()
    shared = {'opt': opt, 'dict': mdict}
    agent = TorchAgent(opt, shared)

    vec_observations = [agent.vectorize(obs) for obs in observations]
    batch = agent.batchify(vec_observations)

    self.assertTrue(batch.text_vec is not None,
                    "Missing 'text_vec' field.")
    self.assertTrue(
        batch.text_vec.numpy().tolist() == [[7, 8, 9], [7, 8, 9]],
        "Incorrectly vectorized text field of obs_batch.")
    self.assertTrue(batch.label_vec is not None,
                    "Missing 'label_vec' field.")
    self.assertTrue(
        batch.label_vec.numpy().tolist() == [[
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], [mdict.START_IDX, 7, 8, 9, mdict.END_IDX]],
        "Incorrectly vectorized label field of obs_batch.")
    self.assertTrue(
        batch.labels == ["Paint on a canvas.", "Paint on a canvas."],
        "Doesn't return correct labels: " + str(batch.labels))
    true_i = [0, 3]
    self.assertTrue(
        all(batch.valid_indices[i] == true_i[i] for i in range(2)),
        "Returns incorrect indices of valid observations.")

    observations = []
    observations.append({
        "text": "What is a painting?",
        "eval_labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})
    observations.append({
        "text": "What is a painting?",
        "eval_labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})

    vec_observations = [agent.vectorize(obs) for obs in observations]
    batch = agent.batchify(vec_observations)

    self.assertTrue(batch.label_vec is not None,
                    "Missing 'eval_label_vec' field.")
    self.assertTrue(
        batch.label_vec.numpy().tolist() == [[
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], [mdict.START_IDX, 7, 8, 9, mdict.END_IDX]],
        "Incorrectly vectorized label field of obs_batch.")

    batch_reply = [{} for i in range(6)]
    predictions = ["Oil on a canvas.", "Oil on a canvas."]
    output = Output(predictions, None)
    expected_unmapped = batch_reply.copy()
    expected_unmapped[0]["text"] = "Oil on a canvas."
    expected_unmapped[3]["text"] = "Oil on a canvas."
    self.assertTrue(
        agent.match_batch(batch_reply, batch.valid_indices, output) ==
        expected_unmapped,
        "Unmapped predictions do not match expected results.")
def test_vectorize(self):
    """
    Make sure that the vectorize function is actually adding a new field.
    """
    try:
        from parlai.core.torch_agent import TorchAgent
    except ImportError as e:
        if 'pytorch' in e.msg:
            print('Skipping TestTorchAgent.test_vectorize, no pytorch.')
            return

    from parlai.core.params import ParlaiParser
    parser = ParlaiParser()
    TorchAgent.add_cmdline_args(parser)
    parser.set_params(no_cuda=True)
    opt = parser.parse_args(print_args=False)
    mdict = MockDict()
    shared = {'opt': opt, 'dict': mdict}
    agent = TorchAgent(opt, shared)

    observation = {}
    observation["text"] = "What does the dog do?"
    observation["labels"] = ["The dog jumps over the cat."]

    # add start and end
    obs_vec = agent.vectorize(observation, add_start=True, add_end=True)
    self.assertTrue(
        'text_vec' in obs_vec,
        "Field 'text_vec' missing from vectorized observation")
    self.assertTrue(obs_vec['text_vec'].numpy().tolist() == [7, 8, 9],
                    "Vectorized text is incorrect.")
    self.assertTrue(
        'labels_vec' in obs_vec,
        "Field 'labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], "Vectorized label is incorrect.")

    # no start, add end
    obs_vec = agent.vectorize(observation, add_start=False, add_end=True)
    self.assertTrue(
        obs_vec['labels_vec'].numpy().tolist() == [7, 8, 9, mdict.END_IDX],
        "Vectorized label is incorrect.")

    # add start, no end
    obs_vec = agent.vectorize(observation, add_start=True, add_end=False)
    self.assertTrue(
        obs_vec['labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9
        ], "Vectorized label is incorrect.")

    # no start, no end
    obs_vec = agent.vectorize(observation, add_start=False, add_end=False)
    self.assertTrue(obs_vec['labels_vec'].numpy().tolist() == [7, 8, 9],
                    "Vectorized label is incorrect.")

    observation = {}
    observation["text"] = "What does the dog do?"
    observation["eval_labels"] = ["The dog jumps over the cat."]

    # eval_labels
    obs_vec = agent.vectorize(observation)
    self.assertTrue(
        'eval_labels_vec' in obs_vec,
        "Field 'eval_labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['eval_labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], "Vectorized label is incorrect.")

    # truncate below the label length
    obs_vec = agent.vectorize(observation, truncate=2)
    self.assertTrue(
        'eval_labels_vec' in obs_vec,
        "Field 'eval_labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['eval_labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7
        ], "Vectorized label is incorrect: " +
        str(obs_vec['eval_labels_vec']))

    # truncate above the label length (no truncation applied)
    obs_vec = agent.vectorize(observation, truncate=10)
    self.assertTrue(
        'eval_labels_vec' in obs_vec,
        "Field 'eval_labels_vec' missing from vectorized observation")
    self.assertTrue(
        obs_vec['eval_labels_vec'].numpy().tolist() == [
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], "Vectorized label is incorrect.")
def test_map_unmap(self):
    observations = []
    observations.append({
        "text": "What is a painting?",
        "labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})
    observations.append({
        "text": "What is a painting?",
        "labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})

    opt = {}
    opt['no_cuda'] = True
    opt['history_tokens'] = 10000
    opt['history_dialog'] = 10
    opt['history_replies'] = 'label_else_model'
    dict = MockDict()
    shared = {'opt': opt, 'dict': dict}
    agent = TorchAgent(opt, shared)

    vec_observations = [agent.vectorize(obs) for obs in observations]
    mapped_valid = agent.map_valid(vec_observations)
    text_vecs, label_vecs, labels, valid_inds = mapped_valid

    self.assertTrue(text_vecs is not None, "Missing 'text_vecs' field.")
    self.assertTrue(text_vecs.numpy().tolist() == [[1, 3, 5], [1, 3, 5]],
                    "Incorrectly vectorized text field of obs_batch.")
    self.assertTrue(label_vecs is not None, "Missing 'label_vec' field.")
    self.assertTrue(
        label_vecs.numpy().tolist() == [[1, 3, 5, 2], [1, 3, 5, 2]],
        "Incorrectly vectorized label field of obs_batch.")
    self.assertTrue(labels == ["Paint on a canvas.", "Paint on a canvas."],
                    "Doesn't return correct labels.")
    self.assertTrue(valid_inds == [0, 3],
                    "Returns incorrect indices of valid observations.")

    observations = []
    observations.append({
        "text": "What is a painting?",
        "eval_labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})
    observations.append({
        "text": "What is a painting?",
        "eval_labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})

    vec_observations = [agent.vectorize(obs) for obs in observations]
    mapped_valid = agent.map_valid(vec_observations)
    text_vecs, label_vecs, labels, valid_inds = mapped_valid

    self.assertTrue(label_vecs is not None, "Missing 'label_vec' field.")
    self.assertTrue(
        label_vecs.numpy().tolist() == [[1, 3, 5, 2], [1, 3, 5, 2]],
        "Incorrectly vectorized label field of obs_batch.")

    predictions = ["Oil on a canvas.", "Oil on a canvas."]
    expected_unmapped = [
        "Oil on a canvas.", None, None, "Oil on a canvas.", None, None
    ]
    self.assertTrue(
        agent.unmap_valid(predictions, valid_inds, 6) == expected_unmapped,
        "Unmapped predictions do not match expected results.")
def test_map_unmap(self):
    try:
        from parlai.core.torch_agent import TorchAgent
    except ImportError as e:
        if 'pytorch' in e.msg:
            print('Skipping TestTorchAgent.test_map_unmap, no pytorch.')
            return

    observations = []
    observations.append({
        "text": "What is a painting?",
        "labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})
    observations.append({
        "text": "What is a painting?",
        "labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})

    opt = {}
    opt['no_cuda'] = True
    opt['truncate'] = 10000
    opt['history_dialog'] = 10
    opt['history_replies'] = 'label_else_model'
    mdict = MockDict()
    shared = {'opt': opt, 'dict': mdict}
    agent = TorchAgent(opt, shared)

    vec_observations = [agent.vectorize(obs) for obs in observations]
    mapped_valid = agent.map_valid(vec_observations)
    text_vecs, text_lengths, label_vecs, labels, valid_inds = mapped_valid

    self.assertTrue(text_vecs is not None, "Missing 'text_vecs' field.")
    self.assertTrue(text_vecs.numpy().tolist() == [[7, 8, 9], [7, 8, 9]],
                    "Incorrectly vectorized text field of obs_batch.")
    self.assertTrue(text_lengths.numpy().tolist() == [3, 3],
                    "Incorrect text vector lengths returned.")
    self.assertTrue(label_vecs is not None, "Missing 'label_vec' field.")
    self.assertTrue(
        label_vecs.numpy().tolist() == [[
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], [mdict.START_IDX, 7, 8, 9, mdict.END_IDX]],
        "Incorrectly vectorized label field of obs_batch.")
    self.assertTrue(labels == ["Paint on a canvas.", "Paint on a canvas."],
                    "Doesn't return correct labels.")
    self.assertTrue(valid_inds == [0, 3],
                    "Returns incorrect indices of valid observations.")

    observations = []
    observations.append({
        "text": "What is a painting?",
        "eval_labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})
    observations.append({
        "text": "What is a painting?",
        "eval_labels": ["Paint on a canvas."]
    })
    observations.append({})
    observations.append({})

    vec_observations = [agent.vectorize(obs) for obs in observations]
    mapped_valid = agent.map_valid(vec_observations)
    text_vecs, text_lengths, label_vecs, labels, valid_inds = mapped_valid

    self.assertTrue(label_vecs is not None,
                    "Missing 'eval_label_vec' field.")
    self.assertTrue(
        label_vecs.numpy().tolist() == [[
            mdict.START_IDX, 7, 8, 9, mdict.END_IDX
        ], [mdict.START_IDX, 7, 8, 9, mdict.END_IDX]],
        "Incorrectly vectorized label field of obs_batch.")

    predictions = ["Oil on a canvas.", "Oil on a canvas."]
    expected_unmapped = [
        "Oil on a canvas.", None, None, "Oil on a canvas.", None, None
    ]
    self.assertTrue(
        agent.unmap_valid(predictions, valid_inds, 6) == expected_unmapped,
        "Unmapped predictions do not match expected results.")
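# The tests above reference a MockDict helper that stands in for the real
# dictionary agent.  Below is a minimal sketch of such a mock; the attribute
# names and token indices are assumptions chosen to satisfy the
# [7, 8, 9] / START_IDX / END_IDX expectations of the later tests, not
# ParlAI's actual MockDict implementation.
class MockDict(object):
    """Minimal stand-in dictionary: fixed special tokens, fixed txt2vec."""

    null_token = '__null__'
    NULL_IDX = 0
    start_token = '__start__'
    START_IDX = 1001
    end_token = '__end__'
    END_IDX = 1002

    def __getitem__(self, key):
        # Map the special tokens to their indices; anything else gets a
        # single arbitrary "unknown" id.
        if key == self.null_token:
            return self.NULL_IDX
        if key == self.start_token:
            return self.START_IDX
        if key == self.end_token:
            return self.END_IDX
        return 3

    def txt2vec(self, txt):
        # Every input sentence vectorizes to the same fixed token ids,
        # which is all the assertions above rely on.
        return [7, 8, 9]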