import json
import sys

# AllenNLP imports assumed for this snippet; the module paths below are as of
# allennlp 0.8.x and may differ in other versions.
from allennlp.data.dataset_readers import NlvrDatasetReader
from allennlp.models.archival import load_archive
from allennlp.models.semantic_parsing.nlvr import NlvrCoverageSemanticParser
from allennlp.semparse.worlds import NlvrWorld


def make_data(input_file: str, output_file: str, archived_model_file: str,
              max_num_decoded_sequences: int) -> None:
    reader = NlvrDatasetReader(output_agendas=True)
    model = load_archive(archived_model_file).model
    if not isinstance(model, NlvrCoverageSemanticParser):
        model_type = type(model)
        raise RuntimeError(
            f"Expected an archived NlvrCoverageSemanticParser, but found {model_type} instead"
        )
    # Tweak the decoder trainer to coerce it into generating a k-best list. We set k to 100
    # here so that we can filter out the inconsistent sequences later.
    model._decoder_trainer._max_num_decoded_sequences = 100
    num_outputs = 0
    num_sentences = 0
    with open(output_file, "w") as outfile, open(input_file) as infile:
        for line in infile:
            num_sentences += 1
            input_data = json.loads(line)
            sentence = input_data["sentence"]
            structured_representations = input_data["worlds"]
            labels = input_data["labels"]
            instance = reader.text_to_instance(sentence,
                                               structured_representations)
            outputs = model.forward_on_instance(instance)
            action_strings = outputs["best_action_strings"]
            logical_forms = outputs["logical_form"]
            correct_sequences = []
            # Checking for consistency
            worlds = [
                NlvrWorld(structure)
                for structure in structured_representations
            ]
            for sequence, logical_form in zip(action_strings, logical_forms):
                denotations = [world.execute(logical_form) for world in worlds]
                denotations_are_correct = [
                    label.lower() == str(denotation).lower()
                    for label, denotation in zip(labels, denotations)
                ]
                if all(denotations_are_correct):
                    correct_sequences.append(sequence)
            correct_sequences = correct_sequences[:max_num_decoded_sequences]
            if not correct_sequences:
                continue
            output_data = {
                "id": input_data["identifier"],
                "sentence": sentence,
                "correct_sequences": correct_sequences,
                "worlds": structured_representations,
                "labels": input_data["labels"],
            }
            json.dump(output_data, outfile)
            outfile.write("\n")
            num_outputs += 1
    sys.stderr.write(
        f"{num_outputs} out of {num_sentences} sentences have outputs.\n")
Example #2
def test_agenda_indices_are_correct(self):
    reader = NlvrDatasetReader()
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" /
                    "sample_ungrouped_data.jsonl")
    dataset = reader.read(test_file)
    instances = list(dataset)
    instance = instances[0]
    sentence_tokens = instance.fields["sentence"].tokens
    sentence = " ".join([t.text for t in sentence_tokens])
    agenda = [item.sequence_index for item in instance.fields["agenda"].field_list]
    actions = [action.rule for action in instance.fields["actions"].field_list]
    agenda_actions = [actions[i] for i in agenda]
    world = instance.fields["worlds"].field_list[0].as_tensor({})
    expected_agenda_actions = world.get_agenda_for_sentence(sentence, add_paths_to_agenda=False)
    assert expected_agenda_actions == agenda_actions
Example #3
def test_reader_reads_processed_data(self):
    # Processed data contains action sequences that yield the correct denotations, obtained from
    # an offline search.
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" /
                    "sample_processed_data.jsonl")
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 2
    instance = instances[0]
    assert instance.fields.keys() == {
        "sentence", "target_action_sequences", "worlds", "actions",
        "labels", "identifier", "metadata"
    }
    all_action_sequence_indices = instance.fields[
        "target_action_sequences"].field_list
    assert len(all_action_sequence_indices) == 20
    action_sequence_indices = [
        item.sequence_index
        for item in all_action_sequence_indices[0].field_list
    ]
    actions = [
        action.rule for action in instance.fields["actions"].field_list
    ]
    action_sequence = [
        actions[rule_id] for rule_id in action_sequence_indices
    ]
    assert action_sequence == [
        '@start@ -> bool', 'bool -> [<Set[Object]:bool>, Set[Object]]',
        '<Set[Object]:bool> -> object_exists',
        'Set[Object] -> [<Set[Object]:Set[Object]>, Set[Object]]',
        '<Set[Object]:Set[Object]> -> touch_corner',
        'Set[Object] -> [<Set[Object]:Set[Object]>, Set[Object]]',
        '<Set[Object]:Set[Object]> -> circle', 'Set[Object] -> all_objects'
    ]
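
For reference, each line of sample_processed_data.jsonl has the shape produced by make_data in Example #1. A hypothetical record, with placeholder values wherever the real content is not shown in this section, would look like:

# Hypothetical processed-data record. The field names come from the output_data
# dict in make_data (Example #1); every value below is an illustrative placeholder.
processed_line = {
    "id": "some-sentence-identifier",
    "sentence": "There is a circle closely touching a corner of a box.",
    "correct_sequences": [
        ["@start@ -> bool", "bool -> [<Set[Object]:bool>, Set[Object]]"],  # truncated
    ],
    "worlds": [[], [], [], []],  # one structured representation per image; contents elided
    "labels": ["true", "false", "true", "false"],
}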
Example #4
def test_reader_reads_processed_data(self):
    # Processed data contains action sequences that yield the correct denotations, obtained from
    # an offline search. (Python 2 variant of the test above.)
    test_file = unicode(self.FIXTURES_ROOT / u"data" / u"nlvr" /
                        u"sample_processed_data.jsonl")
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 2
    instance = instances[0]
    assert set(instance.fields.keys()) == set([
        u"sentence", u"target_action_sequences", u"worlds", u"actions",
        u"labels", u"identifier"
    ])
    all_action_sequence_indices = instance.fields[
        u"target_action_sequences"].field_list
    assert len(all_action_sequence_indices) == 20
    action_sequence_indices = [
        item.sequence_index
        for item in all_action_sequence_indices[0].field_list
    ]
    actions = [
        action.rule for action in instance.fields[u"actions"].field_list
    ]
    action_sequence = [
        actions[rule_id] for rule_id in action_sequence_indices
    ]
    assert action_sequence == [
        u'@start@ -> t', u't -> [<o,t>, o]', u'<o,t> -> object_exists',
        u'o -> [<o,o>, o]', u'<o,o> -> touch_corner', u'o -> [<o,o>, o]',
        u'<o,o> -> circle', u'o -> all_objects'
    ]
Example #5
def test_reader_reads_ungrouped_data(self):
    test_file = "tests/fixtures/data/nlvr/sample_ungrouped_data.jsonl"
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 3
    instance = instances[0]
    assert instance.fields.keys() == {
        'sentence', 'agenda', 'worlds', 'actions', 'labels', 'identifier'
    }
    sentence_tokens = instance.fields["sentence"].tokens
    expected_tokens = [
        'There', 'is', 'a', 'circle', 'closely', 'touching', 'a', 'corner',
        'of', 'a', 'box', '.'
    ]
    assert [t.text for t in sentence_tokens] == expected_tokens
    actions = [
        action.rule for action in instance.fields["actions"].field_list
    ]
    assert len(actions) == 115
    agenda = [
        item.sequence_index
        for item in instance.fields["agenda"].field_list
    ]
    agenda_strings = [actions[rule_id] for rule_id in agenda]
    assert set(agenda_strings) == {
        '<o,o> -> circle', '<o,t> -> object_exists',
        '<o,o> -> touch_corner'
    }
    worlds = [
        world_field.as_tensor({})
        for world_field in instance.fields["worlds"].field_list
    ]
    assert isinstance(worlds[0], NlvrWorld)
    label = instance.fields["labels"].field_list[0].label
    assert label == "true"
Example #6
def test_reader_reads_processed_data(self):
    # Processed data contains action sequences that yield the correct denotations, obtained from
    # an offline search.
    test_file = "tests/fixtures/data/nlvr/sample_processed_data.jsonl"
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 2
    instance = instances[0]
    assert instance.fields.keys() == {
        "sentence", "target_action_sequences", "worlds", "actions",
        "labels", "identifier"
    }
    all_action_sequence_indices = instance.fields[
        "target_action_sequences"].field_list
    assert len(all_action_sequence_indices) == 20
    action_sequence_indices = [
        item.sequence_index
        for item in all_action_sequence_indices[0].field_list
    ]
    actions = [
        action.rule for action in instance.fields["actions"].field_list
    ]
    action_sequence = [
        actions[rule_id] for rule_id in action_sequence_indices
    ]
    assert action_sequence == [
        '@start@ -> t', 't -> [<o,t>, o]', '<o,t> -> object_exists',
        'o -> [<o,o>, o]', '<o,o> -> touch_corner', 'o -> [<o,o>, o]',
        '<o,o> -> circle', 'o -> all_objects'
    ]
Example #7
def test_reader_reads_grouped_data(self):
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" /
                    "sample_grouped_data.jsonl")
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 2
    instance = instances[0]
    assert instance.fields.keys() == {
        "sentence",
        "agenda",
        "worlds",
        "actions",
        "labels",
        "identifier",
        "metadata",
    }
    sentence_tokens = instance.fields["sentence"].tokens
    expected_tokens = [
        "There",
        "is",
        "a",
        "circle",
        "closely",
        "touching",
        "a",
        "corner",
        "of",
        "a",
        "box",
        ".",
    ]
    assert [t.text for t in sentence_tokens] == expected_tokens
    actions = [
        action.rule for action in instance.fields["actions"].field_list
    ]
    assert len(actions) == 115
    agenda = [
        item.sequence_index
        for item in instance.fields["agenda"].field_list
    ]
    agenda_strings = [actions[rule_id] for rule_id in agenda]
    assert set(agenda_strings) == {
        "<Set[Object]:Set[Object]> -> circle",
        "<Set[Object]:Set[Object]> -> touch_corner",
        "<Set[Object]:bool> -> object_exists",
    }
    worlds = [
        world_field.as_tensor({})
        for world_field in instance.fields["worlds"].field_list
    ]
    assert all(isinstance(world, NlvrLanguage) for world in worlds)
    labels = [
        label.label for label in instance.fields["labels"].field_list
    ]
    assert labels == ["true", "false", "true", "false"]
Example #8
def test_agenda_indices_are_correct(self):
    reader = NlvrDatasetReader()
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" /
                    "sample_ungrouped_data.jsonl")
    dataset = reader.read(test_file)
    instances = list(dataset)
    instance = instances[0]
    sentence_tokens = instance.fields["sentence"].tokens
    sentence = " ".join([t.text for t in sentence_tokens])
    agenda = [
        item.sequence_index
        for item in instance.fields["agenda"].field_list
    ]
    actions = [
        action.rule for action in instance.fields["actions"].field_list
    ]
    agenda_actions = [actions[i] for i in agenda]
    world = instance.fields["worlds"].field_list[0].as_tensor({})
    expected_agenda_actions = world.get_agenda_for_sentence(sentence)
    assert expected_agenda_actions == agenda_actions
Example #9
def test_reader_reads_grouped_data(self):
    # Python 2 variant of the grouped-data test above.
    test_file = unicode(self.FIXTURES_ROOT / u"data" / u"nlvr" /
                        u"sample_grouped_data.jsonl")
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 2
    instance = instances[0]
    assert set(instance.fields.keys()) == set([
        u'sentence', u'agenda', u'worlds', u'actions', u'labels',
        u'identifier'
    ])
    sentence_tokens = instance.fields[u"sentence"].tokens
    expected_tokens = [
        u'There', u'is', u'a', u'circle', u'closely', u'touching', u'a',
        u'corner', u'of', u'a', u'box', u'.'
    ]
    assert [t.text for t in sentence_tokens] == expected_tokens
    actions = [
        action.rule for action in instance.fields[u"actions"].field_list
    ]
    assert len(actions) == 115
    agenda = [
        item.sequence_index
        for item in instance.fields[u"agenda"].field_list
    ]
    agenda_strings = [actions[rule_id] for rule_id in agenda]
    assert set(agenda_strings) == set([
        u'<o,o> -> circle', u'<o,o> -> touch_corner',
        u'<o,t> -> object_exists'
    ])
    worlds = [
        world_field.as_tensor({})
        for world_field in instance.fields[u"worlds"].field_list
    ]
    assert all(isinstance(world, NlvrWorld) for world in worlds)
    labels = [
        label.label for label in instance.fields[u"labels"].field_list
    ]
    assert labels == [u"true", u"false", u"true", u"false"]
Example #10
def test_reader_reads_ungrouped_data(self):
    test_file = str(self.FIXTURES_ROOT / "data" / "nlvr" /
                    "sample_ungrouped_data.jsonl")
    dataset = NlvrDatasetReader().read(test_file)
    instances = list(dataset)
    assert len(instances) == 3
    instance = instances[0]
    assert instance.fields.keys() == {
        'sentence', 'agenda', 'worlds', 'actions', 'labels', 'identifier',
        'metadata'
    }
    sentence_tokens = instance.fields["sentence"].tokens
    expected_tokens = [
        'There', 'is', 'a', 'circle', 'closely', 'touching', 'a', 'corner',
        'of', 'a', 'box', '.'
    ]
    assert [t.text for t in sentence_tokens] == expected_tokens
    actions = [
        action.rule for action in instance.fields["actions"].field_list
    ]
    assert len(actions) == 115
    agenda = [
        item.sequence_index
        for item in instance.fields["agenda"].field_list
    ]
    agenda_strings = [actions[rule_id] for rule_id in agenda]
    assert set(agenda_strings) == {
        '<Set[Object]:Set[Object]> -> circle',
        '<Set[Object]:bool> -> object_exists',
        '<Set[Object]:Set[Object]> -> touch_corner'
    }
    worlds = [
        world_field.as_tensor({})
        for world_field in instance.fields["worlds"].field_list
    ]
    assert isinstance(worlds[0], NlvrLanguage)
    label = instance.fields["labels"].field_list[0].label
    assert label == "true"