def make_data(input_file: str, output_file: str, archived_model_file: str, max_num_decoded_sequences: int) -> None:
    """Generate offline-search training data for the NLVR semantic parser.

    Reads sentences, worlds, and labels (JSONL) from ``input_file``, decodes a
    k-best list of action sequences with an archived coverage parser, keeps only
    the sequences whose logical forms execute to the correct denotation in every
    world, and writes the surviving sequences to ``output_file`` as JSONL.

    Parameters
    ----------
    input_file : path to the ungrouped NLVR data (one JSON object per line).
    output_file : path where filtered examples are written.
    archived_model_file : archive of a trained ``NlvrCoverageSemanticParser``.
    max_num_decoded_sequences : keep at most this many consistent sequences
        per sentence.

    Raises
    ------
    RuntimeError : if the archived model is not an ``NlvrCoverageSemanticParser``.
    """
    reader = NlvrDatasetReader(output_agendas=True)
    model = load_archive(archived_model_file).model
    if not isinstance(model, NlvrCoverageSemanticParser):
        model_type = type(model)
        raise RuntimeError(
            f"Expected an archived NlvrCoverageSemanticParser, but found {model_type} instead"
        )
    # Tweaking the decoder trainer to coerce it to generate a k-best list. Setting k to 100
    # here, so that we can filter out the inconsistent ones later.
    model._decoder_trainer._max_num_decoded_sequences = 100
    num_outputs = 0
    num_sentences = 0
    # Both files are managed by the `with` statement so they are closed even on
    # error (the original leaked the input handle and re-closed outfile by hand).
    with open(output_file, "w") as outfile, open(input_file) as infile:
        for line in infile:
            num_sentences += 1
            input_data = json.loads(line)
            sentence = input_data["sentence"]
            structured_representations = input_data["worlds"]
            labels = input_data["labels"]
            instance = reader.text_to_instance(sentence, structured_representations)
            outputs = model.forward_on_instance(instance)
            action_strings = outputs["best_action_strings"]
            logical_forms = outputs["logical_form"]
            correct_sequences = []
            # Checking for consistency: a sequence is kept only if its logical
            # form yields the labeled denotation in every paired world.
            worlds = [
                NlvrWorld(structure) for structure in structured_representations
            ]
            for sequence, logical_form in zip(action_strings, logical_forms):
                denotations = [world.execute(logical_form) for world in worlds]
                denotations_are_correct = [
                    label.lower() == str(denotation).lower()
                    for label, denotation in zip(labels, denotations)
                ]
                if all(denotations_are_correct):
                    correct_sequences.append(sequence)
            correct_sequences = correct_sequences[:max_num_decoded_sequences]
            if not correct_sequences:
                continue
            output_data = {
                "id": input_data["identifier"],
                "sentence": sentence,
                "correct_sequences": correct_sequences,
                "worlds": structured_representations,
                "labels": input_data["labels"],
            }
            json.dump(output_data, outfile)
            outfile.write("\n")
            num_outputs += 1
    sys.stderr.write(
        f"{num_outputs} out of {num_sentences} sentences have outputs.")
def test_agenda_indices_are_correct(self):
    """The agenda indices stored on an instance must map back to exactly the
    actions the world's agenda heuristic produces for that sentence."""
    reader = NlvrDatasetReader()
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" / "sample_ungrouped_data.jsonl")
    first_instance = list(reader.read(test_file))[0]
    sentence = " ".join(
        token.text for token in first_instance.fields["sentence"].tokens
    )
    rules = [field.rule for field in first_instance.fields["actions"].field_list]
    # Resolve each agenda index into its production-rule string.
    agenda_actions = [
        rules[field.sequence_index]
        for field in first_instance.fields["agenda"].field_list
    ]
    world = first_instance.fields["worlds"].field_list[0].as_tensor({})
    expected_agenda_actions = world.get_agenda_for_sentence(sentence, add_paths_to_agenda=False)
    assert expected_agenda_actions == agenda_actions
def test_reader_reads_processed_data(self):
    # Processed data contains action sequences that yield the correct denotations, obtained from
    # an offline search.
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" / "sample_processed_data.jsonl")
    read_instances = list(NlvrDatasetReader().read(test_file))
    assert len(read_instances) == 2
    first = read_instances[0]
    assert first.fields.keys() == {
        "sentence", "target_action_sequences", "worlds", "actions", "labels",
        "identifier", 'metadata'
    }
    target_sequences = first.fields["target_action_sequences"].field_list
    assert len(target_sequences) == 20
    rules = [field.rule for field in first.fields["actions"].field_list]
    # Decode the first target sequence back into production-rule strings.
    decoded_sequence = [
        rules[field.sequence_index] for field in target_sequences[0].field_list
    ]
    assert decoded_sequence == [
        '@start@ -> bool', 'bool -> [<Set[Object]:bool>, Set[Object]]',
        '<Set[Object]:bool> -> object_exists',
        'Set[Object] -> [<Set[Object]:Set[Object]>, Set[Object]]',
        '<Set[Object]:Set[Object]> -> touch_corner',
        'Set[Object] -> [<Set[Object]:Set[Object]>, Set[Object]]',
        '<Set[Object]:Set[Object]> -> circle', 'Set[Object] -> all_objects'
    ]
def test_reader_reads_processed_data(self):
    # Processed data contains action sequences that yield the correct denotations, obtained from
    # an offline search.
    test_file = unicode(self.FIXTURES_ROOT / u"data" / u"nlvr" / u"sample_processed_data.jsonl")
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 2
    instance = instances[0]
    # BUG FIX: the original asserted `list(...) == set([...])`; a list is never
    # equal to a set, so the assertion could never pass. Compare sets instead.
    assert set(instance.fields.keys()) == set([
        u"sentence", u"target_action_sequences", u"worlds", u"actions",
        u"labels", u"identifier"
    ])
    all_action_sequence_indices = instance.fields[
        u"target_action_sequences"].field_list
    assert len(all_action_sequence_indices) == 20
    action_sequence_indices = [
        item.sequence_index
        for item in all_action_sequence_indices[0].field_list
    ]
    actions = [
        action.rule for action in instance.fields[u"actions"].field_list
    ]
    action_sequence = [
        actions[rule_id] for rule_id in action_sequence_indices
    ]
    assert action_sequence == [
        u'@start@ -> t', u't -> [<o,t>, o]', u'<o,t> -> object_exists',
        u'o -> [<o,o>, o]', u'<o,o> -> touch_corner', u'o -> [<o,o>, o]',
        u'<o,o> -> circle', u'o -> all_objects'
    ]
def test_reader_reads_ungrouped_data(self):
    """Ungrouped data pairs each sentence with a single world and label."""
    test_file = "tests/fixtures/data/nlvr/sample_ungrouped_data.jsonl"
    read_instances = list(NlvrDatasetReader().read(test_file))
    assert len(read_instances) == 3
    first = read_instances[0]
    assert first.fields.keys() == {
        'sentence', 'agenda', 'worlds', 'actions', 'labels', 'identifier'
    }
    token_texts = [token.text for token in first.fields["sentence"].tokens]
    assert token_texts == [
        'There', 'is', 'a', 'circle', 'closely', 'touching', 'a', 'corner',
        'of', 'a', 'box', '.'
    ]
    rules = [field.rule for field in first.fields["actions"].field_list]
    assert len(rules) == 115
    # Resolve agenda indices into their production-rule strings.
    agenda_rules = {
        rules[field.sequence_index]
        for field in first.fields["agenda"].field_list
    }
    assert agenda_rules == set([
        '<o,o> -> circle', '<o,t> -> object_exists', '<o,o> -> touch_corner'
    ])
    deserialized_worlds = [
        field.as_tensor({}) for field in first.fields["worlds"].field_list
    ]
    assert isinstance(deserialized_worlds[0], NlvrWorld)
    assert first.fields["labels"].field_list[0].label == "true"
def test_reader_reads_processed_data(self):
    # Processed data contains action sequences that yield the correct denotations, obtained from
    # an offline search.
    test_file = "tests/fixtures/data/nlvr/sample_processed_data.jsonl"
    read_instances = list(NlvrDatasetReader().read(test_file))
    assert len(read_instances) == 2
    first = read_instances[0]
    assert first.fields.keys() == {
        "sentence", "target_action_sequences", "worlds", "actions", "labels",
        "identifier"
    }
    target_sequences = first.fields["target_action_sequences"].field_list
    assert len(target_sequences) == 20
    rules = [field.rule for field in first.fields["actions"].field_list]
    # Decode the first target sequence back into production-rule strings.
    decoded_sequence = [
        rules[field.sequence_index] for field in target_sequences[0].field_list
    ]
    assert decoded_sequence == [
        '@start@ -> t', 't -> [<o,t>, o]', '<o,t> -> object_exists',
        'o -> [<o,o>, o]', '<o,o> -> touch_corner', 'o -> [<o,o>, o]',
        '<o,o> -> circle', 'o -> all_objects'
    ]
def test_reader_reads_grouped_data(self):
    """Grouped data pairs one sentence with several worlds and labels."""
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" / "sample_grouped_data.jsonl")
    read_instances = list(NlvrDatasetReader().read(test_file))
    assert len(read_instances) == 2
    first = read_instances[0]
    assert first.fields.keys() == {
        "sentence",
        "agenda",
        "worlds",
        "actions",
        "labels",
        "identifier",
        "metadata",
    }
    token_texts = [token.text for token in first.fields["sentence"].tokens]
    assert token_texts == [
        "There",
        "is",
        "a",
        "circle",
        "closely",
        "touching",
        "a",
        "corner",
        "of",
        "a",
        "box",
        ".",
    ]
    rules = [field.rule for field in first.fields["actions"].field_list]
    assert len(rules) == 115
    # Resolve agenda indices into their production-rule strings.
    agenda_rules = {
        rules[field.sequence_index]
        for field in first.fields["agenda"].field_list
    }
    assert agenda_rules == {
        "<Set[Object]:Set[Object]> -> circle",
        "<Set[Object]:Set[Object]> -> touch_corner",
        "<Set[Object]:bool> -> object_exists",
    }
    deserialized_worlds = [
        field.as_tensor({}) for field in first.fields["worlds"].field_list
    ]
    assert all(isinstance(world, NlvrLanguage) for world in deserialized_worlds)
    instance_labels = [
        field.label for field in first.fields["labels"].field_list
    ]
    assert instance_labels == ["true", "false", "true", "false"]
def test_agenda_indices_are_correct(self):
    """The agenda indices stored on an instance must map back to exactly the
    actions the world's agenda heuristic produces for that sentence."""
    reader = NlvrDatasetReader()
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" / "sample_ungrouped_data.jsonl")
    first_instance = list(reader.read(test_file))[0]
    sentence = " ".join(
        token.text for token in first_instance.fields["sentence"].tokens
    )
    rules = [field.rule for field in first_instance.fields["actions"].field_list]
    # Resolve each agenda index into its production-rule string.
    agenda_actions = [
        rules[field.sequence_index]
        for field in first_instance.fields["agenda"].field_list
    ]
    world = first_instance.fields["worlds"].field_list[0].as_tensor({})
    expected_agenda_actions = world.get_agenda_for_sentence(sentence)
    assert expected_agenda_actions == agenda_actions
def test_reader_reads_grouped_data(self):
    test_file = unicode(self.FIXTURES_ROOT / u"data" / u"nlvr" / u"sample_grouped_data.jsonl")
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 2
    instance = instances[0]
    # BUG FIX: the original asserted `list(...) == set([...])`; a list is never
    # equal to a set, so the assertion could never pass. Compare sets instead.
    assert set(instance.fields.keys()) == set([
        u'sentence', u'agenda', u'worlds', u'actions', u'labels',
        u'identifier'
    ])
    sentence_tokens = instance.fields[u"sentence"].tokens
    expected_tokens = [
        u'There', u'is', u'a', u'circle', u'closely', u'touching', u'a',
        u'corner', u'of', u'a', u'box', u'.'
    ]
    assert [t.text for t in sentence_tokens] == expected_tokens
    actions = [
        action.rule for action in instance.fields[u"actions"].field_list
    ]
    assert len(actions) == 115
    agenda = [
        item.sequence_index
        for item in instance.fields[u"agenda"].field_list
    ]
    agenda_strings = [actions[rule_id] for rule_id in agenda]
    assert set(agenda_strings) == set([
        u'<o,o> -> circle', u'<o,o> -> touch_corner',
        u'<o,t> -> object_exists'
    ])
    worlds = [
        world_field.as_tensor({})
        for world_field in instance.fields[u"worlds"].field_list
    ]
    assert all([isinstance(world, NlvrWorld) for world in worlds])
    labels = [
        label.label for label in instance.fields[u"labels"].field_list
    ]
    assert labels == [u"true", u"false", u"true", u"false"]
def test_reader_reads_ungrouped_data(self):
    """Ungrouped data pairs each sentence with a single world and label."""
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" / "sample_ungrouped_data.jsonl")
    read_instances = list(NlvrDatasetReader().read(test_file))
    assert len(read_instances) == 3
    first = read_instances[0]
    assert first.fields.keys() == {
        'sentence', 'agenda', 'worlds', 'actions', 'labels', 'identifier',
        'metadata'
    }
    token_texts = [token.text for token in first.fields["sentence"].tokens]
    assert token_texts == [
        'There', 'is', 'a', 'circle', 'closely', 'touching', 'a', 'corner',
        'of', 'a', 'box', '.'
    ]
    rules = [field.rule for field in first.fields["actions"].field_list]
    assert len(rules) == 115
    # Resolve agenda indices into their production-rule strings.
    agenda_rules = {
        rules[field.sequence_index]
        for field in first.fields["agenda"].field_list
    }
    assert agenda_rules == {
        '<Set[Object]:Set[Object]> -> circle',
        '<Set[Object]:bool> -> object_exists',
        '<Set[Object]:Set[Object]> -> touch_corner'
    }
    deserialized_worlds = [
        field.as_tensor({}) for field in first.fields["worlds"].field_list
    ]
    assert isinstance(deserialized_worlds[0], NlvrLanguage)
    assert first.fields["labels"].field_list[0].label == "true"