def test_knowledge_graph_has_correct_neighbors(self):
    question = "when was the attendance greater than 5000?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    knowledge_graph = table_question_context.get_table_knowledge_graph()
    neighbors = knowledge_graph.neighbors
    # '5000' neighbors the number and date columns. '-1' is in entities because there is a
    # date column, which is its only neighbor.
    assert set(neighbors.keys()) == {'date_column:year', 'number_column:year', 'string_column:year',
                                     'number_column:division', 'string_column:division',
                                     'string_column:league', 'string_column:regular_season',
                                     'number_column:regular_season', 'string_column:playoffs',
                                     'string_column:open_cup', 'number_column:open_cup',
                                     'string_column:avg_attendance', 'number_column:avg_attendance',
                                     '5000', '-1'}
    assert set(neighbors['date_column:year']) == {'5000', '-1'}
    assert neighbors['number_column:division'] == ['5000']
    assert neighbors['string_column:league'] == []
    assert neighbors['string_column:regular_season'] == []
    assert neighbors['string_column:playoffs'] == []
    assert neighbors['string_column:open_cup'] == []
    assert neighbors['number_column:avg_attendance'] == ['5000']
    assert set(neighbors['5000']) == {'date_column:year', 'number_column:division',
                                      'number_column:avg_attendance', 'number_column:regular_season',
                                      'number_column:year', 'number_column:open_cup'}
    assert neighbors['-1'] == ['date_column:year']
def test_rank_number_extraction(self):
    question = "what was the first tamil-language film in 1943?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-1.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    _, numbers = table_question_context.get_entities_from_question()
    assert numbers == [("1", 3), ("1943", 9)]
def test_entity_extraction_from_question_with_quotes(self):
    question = "how many times does \"friendly\" appear in the competition column?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/wikitables/tables/346.tagged'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    entities, _ = table_question_context.get_entities_from_question()
    assert entities == [('string:friendly', ['string_column:competition'])]
def test_date_column_type_extraction_1(self):
    question = "how many were elected?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-5.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    data = table_question_context.table_data[0]
    assert "date_column:first_elected" in data
def test_multiword_entity_extraction(self):
    question = "was the positioning better the year of the france venue or the year of the south korea venue?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-3.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    entities, _ = table_question_context.get_entities_from_question()
    assert entities == [("string:france", ["string_column:venue"]),
                        ("string:south_korea", ["string_column:venue"])]
def search(tables_directory: str,
           data: JsonDict,
           output_path: str,
           max_path_length: int,
           max_num_logical_forms: int,
           use_agenda: bool,
           output_separate_files: bool,
           conservative_agenda: bool) -> None:
    print(f"Starting search with {len(data)} instances", file=sys.stderr)
    executor_logger = logging.getLogger('weak_supervision.semparse.executors.wikitables_variable_free_executor')
    executor_logger.setLevel(logging.ERROR)
    tokenizer = WordTokenizer()
    if output_separate_files and not os.path.exists(output_path):
        os.makedirs(output_path)
    if not output_separate_files:
        output_file_pointer = open(output_path, "w")
    for instance_data in data:
        utterance = instance_data["question"]
        question_id = instance_data["id"]
        if utterance.startswith('"') and utterance.endswith('"'):
            utterance = utterance[1:-1]
        # For example: csv/200-csv/47.csv -> tagged/200-tagged/47.tagged
        table_file = instance_data["table_filename"].replace("csv", "tagged")
        target_list = instance_data["target_values"]
        tokenized_question = tokenizer.tokenize(utterance)
        table_file = f"{tables_directory}/{table_file}"
        context = TableQuestionContext.read_from_file(table_file, tokenized_question)
        world = WikiTablesVariableFreeWorld(context)
        walker = ActionSpaceWalker(world, max_path_length=max_path_length)
        correct_logical_forms = []
        if use_agenda:
            agenda = world.get_agenda(conservative=conservative_agenda)
            allow_partial_match = not conservative_agenda
            all_logical_forms = walker.get_logical_forms_with_agenda(agenda=agenda,
                                                                     max_num_logical_forms=10000,
                                                                     allow_partial_match=allow_partial_match)
        else:
            all_logical_forms = walker.get_all_logical_forms(max_num_logical_forms=10000)
        for logical_form in all_logical_forms:
            if world.evaluate_logical_form(logical_form, target_list):
                correct_logical_forms.append(logical_form)
        if output_separate_files and correct_logical_forms:
            with gzip.open(f"{output_path}/{question_id}.gz", "wt") as output_file_pointer:
                for logical_form in correct_logical_forms:
                    print(logical_form, file=output_file_pointer)
        elif not output_separate_files:
            print(f"{question_id} {utterance}", file=output_file_pointer)
            if use_agenda:
                print(f"Agenda: {agenda}", file=output_file_pointer)
            if not correct_logical_forms:
                print("NO LOGICAL FORMS FOUND!", file=output_file_pointer)
            for logical_form in correct_logical_forms[:max_num_logical_forms]:
                print(logical_form, file=output_file_pointer)
            print(file=output_file_pointer)
    if not output_separate_files:
        output_file_pointer.close()
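# A minimal usage sketch for `search` (the paths, question id, and data values below are
# hypothetical; the keys on each instance dict mirror the ones read in the loop above):
if __name__ == "__main__":
    example_data = [{
        "question": '"what was the attendance when usl a league played?"',
        "id": "nt-0",  # hypothetical question id
        "table_filename": "csv/590-csv/2.csv",  # hypothetical table path
        "target_values": ["7169"],
    }]
    search(tables_directory="/path/to/tables",  # hypothetical root of the .tagged tables
           data=example_data,
           output_path="searched_logical_forms.txt",
           max_path_length=10,
           max_num_logical_forms=100,
           use_agenda=True,
           output_separate_files=False,
           conservative_agenda=False)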
def setUp(self):
    super().setUp()
    question_tokens = [Token(x) for x in ['what', 'was', 'the', 'last', 'year', '2013', '?']]
    self.table_file = self.FIXTURES_ROOT / 'data' / 'wikitables' / 'sample_table.tagged'
    self.table_context = TableQuestionContext.read_from_file(self.table_file, question_tokens)
    self.world_with_2013 = WikiTablesVariableFreeWorld(self.table_context)
    usl_league_tokens = [Token(x) for x in ['what', 'was', 'the', 'last', 'year', 'with',
                                            'usl', 'a', 'league', '?']]
    self.world_with_usl_a_league = self._get_world_with_question_tokens(usl_league_tokens)
def test_number_comparison_works(self):
    # TableQuestionContext normalizes all strings according to some rules. We want to ensure
    # that the original numerical values of number cells are being correctly processed here.
    tokens = WordTokenizer().tokenize("when was the attendance the highest?")
    tagged_file = self.FIXTURES_ROOT / "data" / "corenlp_processed_tables" / "TEST-2.table"
    context = TableQuestionContext.read_from_file(tagged_file, tokens)
    executor = WikiTablesVariableFreeExecutor(context.table_data)
    result = executor.execute("(select_date (argmax all_rows number_column:attendance) date_column:date)")
    assert result == Date(-1, 11, 10)
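# A brief note on the `Date` convention assumed in the assertion above (my reading of the
# fixtures, not something this test asserts directly): -1 marks an unspecified field, so
# Date(-1, 11, 10) is "November 10, year unknown", just as Date(2001, -1, -1) elsewhere in
# this file is "2001, month and day unknown". Under that convention one would expect, e.g.:
#
#     Date(-1, 11, 10) == Date(-1, 11, 10)   # identical fields compare equal
#     Date(2006, 2, 26) > Date(2006, 2, 1)   # same year and month, later day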
def test_date_extraction(self):
    question = "how many laps did matt kenset complete on february 26, 2006."
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-8.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    _, number_entities = table_question_context.get_entities_from_question()
    assert number_entities == [("2", 8), ("26", 9), ("2006", 11)]
def test_date_extraction_2(self):
    question = """how many different players scored for the san jose earthquakes
                  during their 1979 home opener against the timbers?"""
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-6.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    _, number_entities = table_question_context.get_entities_from_question()
    assert number_entities == [("1979", 12)]
def test_null_extraction(self):
    question = "on what date did the eagles score the least points?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-2.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    entities, numbers = table_question_context.get_entities_from_question()
    # "Eagles" does not appear in the table.
    assert entities == []
    assert numbers == []
def test_number_extraction(self):
    question = """how many players on the 191617 illinois fighting illini men's basketball team
                  had more than 100 points scored?"""
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    _, number_entities = table_question_context.get_entities_from_question()
    assert number_entities == [("191617", 5), ("100", 16)]
def test_string_column_types_extraction(self):
    question = "how many were elected?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-10.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    data = table_question_context.table_data[0]
    assert "string_column:birthplace" in data
    assert "string_column:advocate" in data
    assert "string_column:notability" in data
    assert "string_column:name" in data
def test_number_and_entity_extraction(self):
    question = "other than m1 how many notations have 1 in them?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    string_entities, number_entities = table_question_context.get_entities_from_question()
    assert string_entities == [("string:m1", ["string_column:notation"]),
                               ("string:1", ["string_column:position"])]
    assert number_entities == [("1", 2), ("1", 7)]
def test_numerical_column_type_extraction(self):
    question = """how many players on the 191617 illinois fighting illini men's basketball team
                  had more than 100 points scored?"""
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-7.table'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    data = table_question_context.table_data[0]
    assert "number_column:games_played" in data
    assert "number_column:field_goals" in data
    assert "number_column:free_throws" in data
    assert "number_column:points" in data
def test_get_knowledge_graph(self):
    question = "other than m1 how many notations have 1 in them?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f"{self.FIXTURES_ROOT}/data/corenlp_processed_tables/TEST-11.table"
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    knowledge_graph = table_question_context.get_table_knowledge_graph()
    entities = knowledge_graph.entities
    # '-1' is not in entities because there are no date columns in the table.
    assert sorted(entities) == ['1', 'number_column:notation', 'number_column:position', 'string:1',
                                'string:m1', 'string_column:mnemonic', 'string_column:notation',
                                'string_column:position', 'string_column:short_name',
                                'string_column:swara']
    neighbors = knowledge_graph.neighbors
    # Each number extracted from the question will have all number and date columns as
    # neighbors. Each string entity extracted from the question will only have the
    # corresponding column as a neighbor.
    assert set(neighbors['1']) == {'number_column:notation', 'number_column:position'}
    assert neighbors['string_column:mnemonic'] == []
    assert neighbors['string_column:short_name'] == []
    assert neighbors['string_column:swara'] == []
    assert neighbors['number_column:position'] == ['1']
    assert neighbors['number_column:notation'] == ['1']
    assert neighbors['string_column:position'] == ['string:1']
    assert neighbors['string:1'] == ['string_column:position']
    assert neighbors['string:m1'] == ['string_column:notation']
    assert neighbors['string_column:notation'] == ['string:m1']
    entity_text = knowledge_graph.entity_text
    assert entity_text == {'1': '1',
                           'string:m1': 'm1',
                           'string:1': '1',
                           'number_column:notation': 'notation',
                           'string_column:notation': 'notation',
                           'string_column:mnemonic': 'mnemonic',
                           'string_column:short_name': 'short name',
                           'string_column:swara': 'swara',
                           'string_column:position': 'position',
                           'number_column:position': 'position'}
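# A minimal sketch of the neighbor rule described in the comments above (an illustration of
# the expected structure only, not the library's actual implementation; `build_neighbors` is
# a hypothetical helper). Inputs follow the shapes returned by get_entities_from_question():
# number entities are (number, token_index) pairs, string entities are (entity, columns) pairs.
def build_neighbors(number_entities, string_entities, columns):
    neighbors = {column: [] for column in columns}
    # Numbers from the question neighbor every number and date column.
    numeric_columns = [column for column in columns
                       if column.startswith(('number_column:', 'date_column:'))]
    for number in {number for number, _ in number_entities}:
        neighbors[number] = list(numeric_columns)
        for column in numeric_columns:
            neighbors[column].append(number)
    # String entities neighbor only the columns they were matched against.
    for entity, entity_columns in string_entities:
        neighbors[entity] = list(entity_columns)
        for column in entity_columns:
            neighbors[column].append(entity)
    return neighbors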
def _get_world_with_question_tokens(self, tokens: List[Token]) -> WikiTablesVariableFreeWorld:
    table_context = TableQuestionContext.read_from_file(self.table_file, tokens)
    world = WikiTablesVariableFreeWorld(table_context)
    return world
def test_table_data(self):
    question = "what was the attendance when usl a league played?"
    question_tokens = self.tokenizer.tokenize(question)
    test_file = f'{self.FIXTURES_ROOT}/data/wikitables/sample_table.tagged'
    table_question_context = TableQuestionContext.read_from_file(test_file, question_tokens)
    assert table_question_context.table_data == [{'date_column:year': Date(2001, -1, -1),
                                                  'string_column:year': '2001',
                                                  'number_column:year': 2001.0,
                                                  'number_column:division': 2.0,
                                                  'string_column:division': '2',
                                                  'string_column:league': 'usl_a_league',
                                                  'string_column:regular_season': '4th_western',
                                                  'number_column:regular_season': 4.0,
                                                  'string_column:playoffs': 'quarterfinals',
                                                  'string_column:open_cup': 'did_not_qualify',
                                                  'number_column:open_cup': None,
                                                  'string_column:avg_attendance': '7_169',
                                                  'number_column:avg_attendance': 7169.0},
                                                 {'date_column:year': Date(2005, -1, -1),
                                                  'string_column:year': '2005',
                                                  'number_column:year': 2005.0,
                                                  'number_column:division': 2.0,
                                                  'string_column:division': '2',
                                                  'string_column:league': 'usl_first_division',
                                                  'string_column:regular_season': '5th',
                                                  'number_column:regular_season': 5.0,
                                                  'string_column:playoffs': 'quarterfinals',
                                                  'string_column:open_cup': '4th_round',
                                                  'number_column:open_cup': 4.0,
                                                  'string_column:avg_attendance': '6_028',
                                                  'number_column:avg_attendance': 6028.0}]