def get_table_data_from_untagged_lines(
    cls, lines: List[List[str]]
) -> Tuple[List[Dict[str, Dict[str, str]]], Dict[str, Set[str]]]:
    """
    Extract typed cell data from a table that has no CoreNLP tagging.

    This is only called when tagged information from CoreNLP is unavailable, that is, when the
    parser is run on data outside the WikiTableQuestions dataset. It tries to mirror the
    processing CoreNLP does for WTQ, but may not be as effective.

    ``lines[0]`` holds the column names and every following entry is one row of cell strings.

    Returns a pair: a list with one dict per row mapping each normalized column name to that
    cell's data (keys among "date", "number", "num2" and "string"), and a mapping from each
    normalized column name to the set of types seen in that column.
    """
    normalized_headers = {
        position: cls.normalize_string(raw_name) for position, raw_name in enumerate(lines[0])
    }
    column_name_type_mapping: Dict[str, Set[str]] = defaultdict(set)
    table_data: List[Dict[str, Dict[str, str]]] = []

    for row in lines[1:]:
        row_cells: Dict[str, Dict[str, str]] = {}
        table_data.append(row_cells)
        for position, cell_value in enumerate(row):
            column_name = normalized_headers[position]
            seen_types = column_name_type_mapping[column_name]
            cell_data: Dict[str, str] = {}

            # Date reading: anything whose parsed form prints as "-1" is not really a date.
            try:
                looks_like_date = str(Date.make_date(cell_value)) != "-1"
            except ValueError:
                looks_like_date = False
            if looks_like_date:
                cell_data["date"] = cell_value
                seen_types.add("date")

            # Number reading: keep the raw string when it parses as a float.
            try:
                float(cell_value)
            except ValueError:
                pass
            else:
                cell_data["number"] = cell_value
                seen_types.add("number")

            # Range / score reading: exactly two float-parseable halves around one "-" give
            # both a number and a num2.
            halves = cell_value.split("-")
            if len(halves) == 2:
                first_half, second_half = halves
                try:
                    float(first_half)
                    float(second_half)
                except ValueError:
                    pass
                else:
                    cell_data["number"] = first_half
                    cell_data["num2"] = second_half
                    seen_types.add("number")
                    seen_types.add("num2")

            # Every cell is also available as a plain string.
            cell_data["string"] = cell_value
            seen_types.add("string")
            row_cells[column_name] = cell_data

    return table_data, column_name_type_mapping
def test_number_comparison_works(self):
    # TableQuestionContext normalizes all strings according to some rules. This checks that the
    # original numerical values of number cells are still being processed correctly.
    question_tokens = WordTokenizer().tokenize("when was the attendance the highest?")
    table_path = self.FIXTURES_ROOT / "data" / "corenlp_processed_tables" / "TEST-2.table"
    world = self._get_world_with_question_tokens_and_table_file(question_tokens, table_path)
    logical_form = "(select_date (argmax all_rows number_column:attendance) date_column:date)"
    assert world.execute(logical_form) == Date(-1, 11, 10)
def select_date(self, rows: List[Row], column: DateColumn) -> Date:
    """
    Looks up ``column`` in every given row and returns the first value that is a ``Date``.
    When no row holds a ``Date`` under that column (or ``rows`` is empty), ``Date(-1, -1, -1)``
    is returned instead.
    """
    column_values = [row.values[column.name] for row in rows]
    found_dates: List[Date] = [value for value in column_values if isinstance(value, Date)]
    if found_dates:
        return found_dates[0]  # type: ignore
    return Date(-1, -1, -1)  # type: ignore
def min_date(self, rows: List[Row], column: DateColumn) -> Date:
    """
    Returns the smallest value found under ``column`` across ``rows``. An empty ``rows`` list
    gives ``Date(-1, -1, -1)``. An ``ExecutionError`` is raised if any of the values is not a
    ``Date``.
    """
    cell_values = [row.values[column.name] for row in rows]
    if cell_values:
        if any(not isinstance(value, Date) for value in cell_values):
            raise ExecutionError(f"Invalid values for date selection function: {cell_values}")
        return min(cell_values)  # type: ignore
    return Date(-1, -1, -1)
def read_from_lines(cls, lines: List, question_tokens: List[Token]) -> 'TableQuestionContext':
    """
    Build a context from table lines and the question tokens.

    When the first line is a list starting with the tagged-file header fields, the lines are
    handed to ``get_table_data_from_tagged_lines``. Otherwise each line is assumed to be one
    table row with tab-separated columns and is handed, after splitting, to
    ``get_table_data_from_untagged_lines``.

    Each column is then split into one typed column per type seen in it
    (``<type>_column:<name>``): "number" and "num2" cells become floats, "date" cells become
    ``Date`` objects, and all others become normalized strings. A cell with no usable value
    for a type is stored as ``None``.
    """
    tagged_header_prefix = ['row', 'col', 'id', 'content', 'tokens', 'lemmaTokens']
    first_line = lines[0]
    if isinstance(first_line, list) and first_line[:6] == tagged_header_prefix:
        # These lines are from the tagged table file from the official dataset.
        table_data, column_name_type_mapping = cls.get_table_data_from_tagged_lines(lines)
    else:
        # Plain table data: rows are newline separated and columns are tab-separated.
        split_rows = [line.split('\t') for line in lines]  # type: ignore
        table_data, column_name_type_mapping = cls.get_table_data_from_untagged_lines(split_rows)

    # At this point each row maps column names to cell data, and cell data is a dict keyed by
    # "string", "number", "num2" and "date". Below, every column is expanded into one typed
    # column per type it has.
    table_data_with_column_types: List[Dict[str, CellValueType]] = []
    all_column_names: Set[str] = set()
    for table_row in table_data:
        typed_row = {}
        table_data_with_column_types.append(typed_row)
        for column_name, cell_data in table_row.items():
            for column_type in column_name_type_mapping[column_name]:
                typed_column_name = f"{column_type}_column:{column_name}"
                all_column_names.add(typed_column_name)
                raw_value = cell_data.get(column_type)
                if column_type in ("number", "num2"):
                    # A missing value (None) or an unparseable string both end up as None.
                    try:
                        typed_value = float(raw_value)
                    except (ValueError, TypeError):
                        typed_value = None
                elif raw_value is None:
                    typed_value = None
                elif column_type == "date":
                    typed_value = Date.make_date(raw_value)
                else:
                    typed_value = cls.normalize_string(raw_value)
                typed_row[typed_column_name] = typed_value

    return cls(table_data_with_column_types,
               column_name_type_mapping,
               all_column_names,
               question_tokens)
def mode_date(self, rows: List[Row], column: DateColumn) -> Date:
    """
    Returns the first entry of the most-frequent values under ``column`` in ``rows``, as
    computed by ``_get_most_frequent_values``. If there are no such values,
    ``Date(-1, -1, -1)`` is returned. An ``ExecutionError`` is raised when that entry is not a
    ``Date``.
    """
    most_frequent_list = self._get_most_frequent_values(rows, column)
    if most_frequent_list:
        most_frequent_value = most_frequent_list[0]
        if isinstance(most_frequent_value, Date):
            return most_frequent_value
        raise ExecutionError(f"Invalid valus for mode_date: {most_frequent_value}")
    return Date(-1, -1, -1)
def _make_date(cell_string: str) -> Date:
    """
    Build a ``Date`` from an underscore-separated string.

    Each part is inspected in turn: an all-digit part of length four sets the year, any other
    all-digit part sets the day, and a part found in ``MONTH_NUMBERS`` sets the month. Fields
    that are never set stay at -1, and later parts overwrite earlier ones of the same kind.
    """
    year = month = day = -1
    for part in cell_string.split("_"):
        if not part.isdigit():
            if part in MONTH_NUMBERS:
                month = MONTH_NUMBERS[part]
            continue
        if len(part) == 4:
            year = int(part)
        else:
            day = int(part)
    return Date(year, month, day)
def test_execute_works_with_argmin(self):
    # Selecting the date under the year column of the argmin row over avg_attendance should
    # give March 2005 with no day.
    executed = self.language.execute(
        "(select_date (argmin all_rows number_column:avg_attendance) date_column:year)"
    )
    assert executed == Date(2005, 3, -1)
def test_date_comparison_works(self):
    type_mismatch = 'only compare Dates with Dates'

    # Fully or partially specified dates within the same year.
    assert Date(2013, 12, 31) > Date(2013, 12, 30)
    assert Date(2013, 12, 31) == Date(2013, 12, -1)
    assert Date(2013, -1, -1) >= Date(2013, 12, 31)
    # pylint: disable=singleton-comparison
    assert (Date(2013, 12, -1) > Date(2013, 12, 31)) == False

    # Comparing a Date against something that is not a Date must raise.
    with pytest.raises(ExecutionError, match=type_mismatch):
        assert (Date(2013, 12, 31) > 2013) == False
    with pytest.raises(ExecutionError, match=type_mismatch):
        assert (Date(2013, 12, 31) >= 2013) == False
    with pytest.raises(ExecutionError, match=type_mismatch):
        assert Date(2013, 12, 31) != 2013

    # The year is known on one side only: neither >= nor < holds.
    assert (Date(2018, 1, 1) >= Date(-1, 2, 1)) == False
    assert (Date(2018, 1, 1) < Date(-1, 2, 1)) == False

    # The year is unknown on both sides, so months and days can still be compared.
    assert Date(-1, 2, 1) < Date(-1, 2, 3)

    # Neither year nor month is known on either side: the comparison is undefined, and both
    # < and >= return False.
    assert (Date(-1, -1, 1) < Date(-1, -1, 3)) == False
    assert (Date(-1, -1, 1) >= Date(-1, -1, 3)) == False

    # The same holds when the year is known but the months are not.
    assert (Date(2018, -1, 1) < Date(2018, -1, 3)) == False
    assert (Date(2018, -1, 1) >= Date(2018, -1, 3)) == False
def date(self, year: Number, month: Number, day: Number) -> Date:
    """
    Builds a ``Date`` from three numbers, which are passed to the ``Date`` constructor as the
    year, the month, and the day, in that order.
    """
    constructed = Date(year, month, day)  # type: ignore
    return constructed
def read_from_lines(cls, lines: List[List[str]], question_tokens: List[Token]) -> 'TableQuestionContext':
    """
    Build a context from the lines of a tagged table file and the question tokens.

    ``lines[0]`` is the header naming the fields of every following line. Lines whose row index
    is ``'-1'`` directly after the header give the column names (prefixed with
    ``fb:row.row.``); all other lines describe one cell each, identified by row and column
    index.

    Each column is split into one typed column per type seen in it
    (``<type>_column:<name>``): "number" and "num2" cells become floats, "date" cells become
    ``Date`` objects, and all others become normalized strings. A cell with no usable value
    for a type is stored as ``None``.

    A file that holds only the header and column-name lines yields a context with an empty
    table rather than failing.
    """
    header = lines[0]  # the first line is the header

    # Column names start with fb:row.row. and sit on the '-1' rows right after the header.
    # Walking the lines directly (rather than indexing until a non-header row shows up) keeps
    # this safe when there are no data rows at all.
    column_index_to_name = {}
    for current_line in lines[1:]:
        if current_line[0] != '-1':
            break
        column_name_sempre = current_line[2]
        column_index = int(current_line[1])
        column_index_to_name[column_index] = column_name_sempre.replace('fb:row.row.', '')

    # Each row is a mapping from column names to cell data. Cell data is a dict, where keys are
    # "string", "number", "num2" and "date", and the values are the corresponding values
    # extracted by CoreNLP.
    table_data: List[Dict[str, Dict[str, str]]] = []
    column_name_type_mapping: Dict[str, Set[str]] = defaultdict(set)
    last_row_index = -1
    for current_line in lines[1:]:
        row_index = int(current_line[0])
        if row_index == -1:
            continue  # header row
        column_index = int(current_line[1])
        if row_index != last_row_index:
            # A change of row index starts a new table row.
            table_data.append({})
        node_info = dict(zip(header, current_line))
        cell_data: Dict[str, str] = {}
        column_name = column_index_to_name[column_index]
        for value_type in ("date", "number", "num2"):
            # Empty fields mean that no value of this type was extracted for the cell.
            if node_info[value_type]:
                column_name_type_mapping[column_name].add(value_type)
                cell_data[value_type] = node_info[value_type]
        if node_info['content'] != '—':
            column_name_type_mapping[column_name].add("string")
            cell_data['string'] = node_info["content"]
        table_data[-1][column_name] = cell_data
        last_row_index = row_index

    # Table data with each column split into different ones, depending on the types they have.
    table_data_with_column_types: List[Dict[str, CellValueType]] = []
    all_column_names: Set[str] = set()
    for table_row in table_data:
        typed_row = {}
        table_data_with_column_types.append(typed_row)
        for column_name, cell_data in table_row.items():
            for column_type in column_name_type_mapping[column_name]:
                typed_column_name = f"{column_type}_column:{column_name}"
                all_column_names.add(typed_column_name)
                cell_value_string = cell_data.get(column_type)
                if column_type in ("number", "num2"):
                    # A missing value (None) or an unparseable string both end up as None.
                    try:
                        typed_value = float(cell_value_string)
                    except (ValueError, TypeError):
                        typed_value = None
                elif cell_value_string is None:
                    typed_value = None
                elif column_type == "date":
                    typed_value = Date.make_date(cell_value_string)
                else:
                    typed_value = cls.normalize_string(cell_value_string)
                typed_row[typed_column_name] = typed_value

    return cls(table_data_with_column_types,
               column_name_type_mapping,
               all_column_names,
               question_tokens)