 def test_knowledge_graph_has_correct_neighbors(self):
     question = "when was the attendance greater than 5000?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     knowledge_graph = table_question_context.get_table_knowledge_graph()
     neighbors = knowledge_graph.neighbors
     # '5000' is a neighbor of all number and date columns. '-1' is among the
     # entities because there is a date column, which is its only neighbor.
     assert set(neighbors.keys()) == {
         'date_column:year', 'number_column:year', 'string_column:year',
         'number_column:division', 'string_column:division',
         'string_column:league', 'string_column:regular_season',
         'number_column:regular_season', 'string_column:playoffs',
         'string_column:open_cup', 'number_column:open_cup',
         'string_column:avg_attendance', 'number_column:avg_attendance',
         '5000', '-1'
     }
     assert set(neighbors['date_column:year']) == {'5000', '-1'}
     assert neighbors['number_column:division'] == ['5000']
     assert neighbors['string_column:league'] == []
     assert neighbors['string_column:regular_season'] == []
     assert neighbors['string_column:playoffs'] == []
     assert neighbors['string_column:open_cup'] == []
     assert neighbors['number_column:avg_attendance'] == ['5000']
     assert set(neighbors['5000']) == {
         'date_column:year', 'number_column:division',
         'number_column:avg_attendance', 'number_column:regular_season',
         'number_column:year', 'number_column:open_cup'
     }
     assert neighbors['-1'] == ['date_column:year']
 def test_rank_number_extraction(self):
     question = "what was the first tamil-language film in 1943?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-1.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     _, numbers = table_question_context.get_entities_from_question()
     assert numbers == [("1", 3), ("1943", 9)]
 def test_entity_extraction_from_question_with_quotes(self):
     question = "how many times does \"friendly\" appear in the competition column?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/wikitables/tables/346.tagged'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     entities, _ = table_question_context.get_entities_from_question()
     assert entities == [('string:friendly', ['string_column:competition'])]
 def test_date_column_type_extraction_1(self):
     question = "how many were elected?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-5.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     data = table_question_context.table_data[0]
     assert "date_column:first_elected" in data
 def test_multiword_entity_extraction(self):
     question = "was the positioning better the year of the france venue or the year of the south korea venue?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-3.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     entities, _ = table_question_context.get_entities_from_question()
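     # Multi-word cell values match as single entities, with spaces replaced
     # by underscores ("south korea" -> "string:south_korea").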
     assert entities == [("string:france", ["string_column:venue"]),
                         ("string:south_korea", ["string_column:venue"])]
def search(tables_directory: str, data: JsonDict, output_path: str,
           max_path_length: int, max_num_logical_forms: int, use_agenda: bool,
           output_separate_files: bool, conservative_agenda: bool) -> None:
    """
    Searches the space of logical forms for each instance in ``data``, keeping
    those that evaluate to the instance's target values, and writes the results
    either to one gzipped file per question or to a single output file.
    """
    print(f"Starting search with {len(data)} instances", file=sys.stderr)
    executor_logger = logging.getLogger(
        'weak_supervision.semparse.executors.wikitables_variable_free_executor'
    )
    executor_logger.setLevel(logging.ERROR)
    tokenizer = WordTokenizer()
    if output_separate_files and not os.path.exists(output_path):
        os.makedirs(output_path)
    if not output_separate_files:
        output_file_pointer = open(output_path, "w")
    for instance_data in data:
        utterance = instance_data["question"]
        question_id = instance_data["id"]
        if utterance.startswith('"') and utterance.endswith('"'):
            utterance = utterance[1:-1]
        # For example: csv/200-csv/47.csv -> tagged/200-tagged/47.tagged
        table_file = instance_data["table_filename"].replace("csv", "tagged")
        target_list = instance_data["target_values"]
        tokenized_question = tokenizer.tokenize(utterance)
        table_file = f"{tables_directory}/{table_file}"
        context = TableQuestionContext.read_from_file(table_file,
                                                      tokenized_question)
        world = WikiTablesVariableFreeWorld(context)
        walker = ActionSpaceWalker(world, max_path_length=max_path_length)
        correct_logical_forms = []
        if use_agenda:
            agenda = world.get_agenda(conservative=conservative_agenda)
            allow_partial_match = not conservative_agenda
            all_logical_forms = walker.get_logical_forms_with_agenda(
                agenda=agenda,
                max_num_logical_forms=10000,
                allow_partial_match=allow_partial_match)
        else:
            all_logical_forms = walker.get_all_logical_forms(
                max_num_logical_forms=10000)
        for logical_form in all_logical_forms:
            if world.evaluate_logical_form(logical_form, target_list):
                correct_logical_forms.append(logical_form)
        if output_separate_files and correct_logical_forms:
            with gzip.open(f"{output_path}/{question_id}.gz",
                           "wt") as output_file_pointer:
                for logical_form in correct_logical_forms:
                    print(logical_form, file=output_file_pointer)
        elif not output_separate_files:
            print(f"{question_id} {utterance}", file=output_file_pointer)
            if use_agenda:
                print(f"Agenda: {agenda}", file=output_file_pointer)
            if not correct_logical_forms:
                print("NO LOGICAL FORMS FOUND!", file=output_file_pointer)
            for logical_form in correct_logical_forms[:max_num_logical_forms]:
                print(logical_form, file=output_file_pointer)
            print(file=output_file_pointer)
    if not output_separate_files:
        output_file_pointer.close()
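
# A minimal, hypothetical driver for ``search``, assuming the input is a
# JSON-lines file in which each record carries the "question", "id",
# "table_filename", and "target_values" fields read above. All paths and
# argument values here are illustrative placeholders, not part of the
# original script.
if __name__ == "__main__":
    import json
    with open("data/train_examples.jsonl") as data_file:
        instances = [json.loads(line) for line in data_file]
    search(tables_directory="data/WikiTableQuestions",
           data=instances,
           output_path="searched_logical_forms.txt",
           max_path_length=10,
           max_num_logical_forms=100,
           use_agenda=True,
           output_separate_files=False,
           conservative_agenda=False)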
 def setUp(self):
     super().setUp()
     question_tokens = [Token(x) for x in ['what', 'was', 'the', 'last', 'year', '2013', '?']]
     self.table_file = self.FIXTURES_ROOT / 'data' / 'wikitables' / 'sample_table.tagged'
     self.table_context = TableQuestionContext.read_from_file(self.table_file, question_tokens)
     self.world_with_2013 = WikiTablesVariableFreeWorld(self.table_context)
     usl_league_tokens = [Token(x) for x in ['what', 'was', 'the', 'last', 'year', 'with', 'usl',
                                             'a', 'league', '?']]
     self.world_with_usl_a_league = self._get_world_with_question_tokens(usl_league_tokens)
 def test_number_comparison_works(self):
     # TableQuestionContext normalizes all strings according to some rules. We
     # want to ensure that the original numerical values of number cells are
     # being processed correctly here.
     tokens = WordTokenizer().tokenize("when was the attendance the highest?")
     tagged_file = self.FIXTURES_ROOT / "data" / "corenlp_processed_tables" / "TEST-2.table"
     context = TableQuestionContext.read_from_file(tagged_file, tokens)
     executor = WikiTablesVariableFreeExecutor(context.table_data)
     result = executor.execute("(select_date (argmax all_rows number_column:attendance) date_column:date)")
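     # In the Date representation, -1 marks an unspecified field, so the
     # expected Date(-1, 11, 10) is November 10 with no year given.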
     assert result == Date(-1, 11, 10)
 def test_date_extraction(self):
     question = "how many laps did matt kenset complete on february 26, 2006."
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-8.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     _, number_entities = table_question_context.get_entities_from_question()
     assert number_entities == [("2", 8), ("26", 9), ("2006", 11)]
 def test_date_extraction_2(self):
     question = """how many different players scored for the san jose earthquakes during their
                   1979 home opener against the timbers?"""
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-6.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     _, number_entities = table_question_context.get_entities_from_question()
     assert number_entities == [("1979", 12)]
 def test_null_extraction(self):
     question = "on what date did the eagles score the least points?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-2.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     entities, numbers = table_question_context.get_entities_from_question()
     # "Eagles" does not appear in the table.
     assert entities == []
     assert numbers == []
 def test_number_extraction(self):
     question = """how many players on the 191617 illinois fighting illini men's basketball team
                   had more than 100 points scored?"""
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     _, number_entities = table_question_context.get_entities_from_question()
     assert number_entities == [("191617", 5), ("100", 16)]
 def test_string_column_types_extraction(self):
     question = "how many were elected?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-10.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     data = table_question_context.table_data[0]
     assert "string_column:birthplace" in data
     assert "string_column:advocate" in data
     assert "string_column:notability" in data
     assert "string_column:name" in data
 def test_number_and_entity_extraction(self):
     question = "other than m1 how many notations have 1 in them?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     string_entities, number_entities = table_question_context.get_entities_from_question()
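     # A number is extracted from within the token "m1" (index 2) as well as
     # from the standalone token "1" (index 7).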
     assert string_entities == [("string:m1", ["string_column:notation"]),
                                ("string:1", ["string_column:position"])]
     assert number_entities == [("1", 2), ("1", 7)]
 def test_numerical_column_type_extraction(self):
     question = """how many players on the 191617 illinois fighting illini men's basketball team
                   had more than 100 points scored?"""
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     data = table_question_context.table_data[0]
     assert "number_column:games_played" in data
     assert "number_column:field_goals" in data
     assert "number_column:free_throws" in data
     assert "number_column:points" in data
 def test_get_knowledge_graph(self):
     question = "other than m1 how many notations have 1 in them?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
     knowledge_graph = table_question_context.get_table_knowledge_graph()
     entities = knowledge_graph.entities
     # -1 is not in entities because there are no date columns in the table.
     assert sorted(entities) == [
         '1', 'number_column:notation', 'number_column:position',
         'string:1', 'string:m1', 'string_column:mnemonic',
         'string_column:notation', 'string_column:position',
         'string_column:short_name', 'string_column:swara'
     ]
     neighbors = knowledge_graph.neighbors
     # Each number extracted from the question has all number and date columns
     # as neighbors. Each string entity extracted from the question has only
     # the corresponding column as its neighbor.
     assert set(neighbors['1']) == {
         'number_column:notation', 'number_column:position'
     }
     assert neighbors['string_column:mnemonic'] == []
     assert neighbors['string_column:short_name'] == []
     assert neighbors['string_column:swara'] == []
     assert neighbors['number_column:position'] == ['1']
     assert neighbors['number_column:notation'] == ['1']
     assert neighbors['string_column:position'] == ['string:1']
     assert neighbors['string:1'] == ['string_column:position']
     assert neighbors['string:m1'] == ['string_column:notation']
     assert neighbors['string_column:notation'] == ['string:m1']
     entity_text = knowledge_graph.entity_text
     assert entity_text == {
         '1': '1',
         'string:m1': 'm1',
         'string:1': '1',
         'number_column:notation': 'notation',
         'string_column:notation': 'notation',
         'string_column:mnemonic': 'mnemonic',
         'string_column:short_name': 'short name',
         'string_column:swara': 'swara',
         'string_column:position': 'position',
         'number_column:position': 'position'
     }
 def _get_world_with_question_tokens(self, tokens: List[Token]) -> WikiTablesVariableFreeWorld:
     table_context = TableQuestionContext.read_from_file(self.table_file, tokens)
     world = WikiTablesVariableFreeWorld(table_context)
     return world
 def test_table_data(self):
     question = "what was the attendance when usl a league played?"
     question_tokens = self.tokenizer.tokenize(question)
     test_file = f'{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged'
     table_question_context = TableQuestionContext.read_from_file(
         test_file, question_tokens)
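     # Each cell appears in several typed variants: string values are
     # normalized (e.g., an original cell like "7,169" presumably becomes
     # "7_169"), numbers are parsed as floats, and dates use -1 for
     # unspecified fields.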
     assert table_question_context.table_data == [
         {'date_column:year': Date(2001, -1, -1),
          'string_column:year': '2001',
          'number_column:year': 2001.0,
          'number_column:division': 2.0,
          'string_column:division': '2',
          'string_column:league': 'usl_a_league',
          'string_column:regular_season': '4th_western',
          'number_column:regular_season': 4.0,
          'string_column:playoffs': 'quarterfinals',
          'string_column:open_cup': 'did_not_qualify',
          'number_column:open_cup': None,
          'string_column:avg_attendance': '7_169',
          'number_column:avg_attendance': 7169.0},
         {'date_column:year': Date(2005, -1, -1),
          'string_column:year': '2005',
          'number_column:year': 2005.0,
          'number_column:division': 2.0,
          'string_column:division': '2',
          'string_column:league': 'usl_first_division',
          'string_column:regular_season': '5th',
          'number_column:regular_season': 5.0,
          'string_column:playoffs': 'quarterfinals',
          'string_column:open_cup': '4th_round',
          'number_column:open_cup': 4.0,
          'string_column:avg_attendance': '6_028',
          'number_column:avg_attendance': 6028.0}]