def test_number_comparison_works(self):
    # TableQuestionContext normalizes all strings according to some rules. This test makes sure
    # that the original numerical values of number cells are still processed correctly, by
    # running an argmax over a number column and checking the date that gets selected.
    question_tokens = WordTokenizer().tokenize("when was the attendance the highest?")
    table_path = self.FIXTURES_ROOT / "data" / "corenlp_processed_tables" / "TEST-2.table"
    world = self._get_world_with_question_tokens_and_table_file(question_tokens, table_path)
    logical_form = "(select_date (argmax all_rows number_column:attendance) date_column:date)"
    selected_date = world.execute(logical_form)
    assert selected_date == Date(-1, 11, 10)
def min_date(self, rows: List[Row], column: DateColumn) -> Date:
    """
    Returns the smallest value found under ``column`` across ``rows``.

    If ``rows`` is empty, ``Date(-1, -1, -1)`` is returned. If any of the collected values is
    not a ``Date``, an ``ExecutionError`` is raised.
    """
    column_values = [row.values[column.name] for row in rows]
    if not column_values:
        # Nothing to compare.
        return Date(-1, -1, -1)
    if any(not isinstance(value, Date) for value in column_values):
        raise ExecutionError(f"Invalid values for date selection function: {column_values}")
    return min(column_values)  # type: ignore
def select_date(self, rows: List[Row], column: DateColumn) -> Date:
    """
    Returns the first ``Date`` value found under ``column`` in ``rows``.

    Values that are not ``Date`` instances are skipped. If no row holds a ``Date`` in that
    column (including when ``rows`` is empty), ``Date(-1, -1, -1)`` is returned.
    """
    # Every row is read up front, so a missing column fails the same way for any row.
    column_values = [row.values[column.name] for row in rows]
    found_dates: List[Date] = [value for value in column_values if isinstance(value, Date)]
    if not found_dates:
        return Date(-1, -1, -1)
    return found_dates[0]  # type: ignore
def mode_date(self, rows: List[Row], column: DateColumn) -> Date:
    """
    Returns the most frequent value under ``column`` in ``rows``.

    The candidates come from ``self._get_most_frequent_values``; the first one is used. If
    there are no candidates, ``Date(-1, -1, -1)`` is returned. If the chosen candidate is not
    a ``Date``, an ``ExecutionError`` is raised.
    """
    candidates = self._get_most_frequent_values(rows, column)
    if not candidates:
        return Date(-1, -1, -1)
    top_value = candidates[0]
    if isinstance(top_value, Date):
        return top_value
    raise ExecutionError(f"Invalid valus for mode_date: {top_value}")
def _make_date(cell_string: str) -> Date:
    """
    Builds a ``Date`` from an underscore-separated cell string.

    The string is split on ``"_"`` and each part is inspected in order:

    - a numeric part of exactly four characters sets the year;
    - any other numeric part sets the day;
    - a part that is a key of ``MONTH_NUMBERS`` sets the month;
    - anything else is ignored.

    Fields that are never set stay at ``-1``. When several parts target the same field, the
    last one wins.
    """
    year = -1
    month = -1
    day = -1
    for part in cell_string.split("_"):
        # ``isdecimal`` is used rather than ``isdigit``: ``isdigit`` is also true for
        # characters such as superscript digits ("²") that ``int`` cannot parse, which would
        # make the conversions below raise ``ValueError``. Such parts are now ignored like any
        # other unrecognized text.
        if part.isdecimal():
            if len(part) == 4:
                year = int(part)
            else:
                day = int(part)
        elif part in MONTH_NUMBERS:
            month = MONTH_NUMBERS[part]
    return Date(year, month, day)
def test_date_comparison_works(self):
    # Checks the comparison operators of ``Date``, including dates with unknown (-1) fields
    # and comparisons against values that are not dates.
    full_date = Date(2013, 12, 31)
    assert full_date > Date(2013, 12, 30)
    assert full_date == Date(2013, 12, -1)
    assert Date(2013, -1, -1) >= full_date
    # pylint: disable=singleton-comparison
    assert (Date(2013, 12, -1) > full_date) == False

    # Comparing a Date with a non-Date is expected to raise, whichever operator is used.
    error_pattern = 'only compare Dates with Dates'
    with pytest.raises(ExecutionError, match=error_pattern):
        assert (full_date > 2013) == False
    with pytest.raises(ExecutionError, match=error_pattern):
        assert (full_date >= 2013) == False
    with pytest.raises(ExecutionError, match=error_pattern):
        assert full_date != 2013

    known_year = Date(2018, 1, 1)
    unknown_year = Date(-1, 2, 1)
    assert (known_year >= unknown_year) == False
    assert (known_year < unknown_year) == False
    # When year is unknown in both cases, we can compare months and days.
    assert unknown_year < Date(-1, 2, 3)

    # If both year and month are not known in both cases, the comparison is undefined, and
    # both < and >= return False. Same when year is known, but months are not.
    undefined_pairs = (
        (Date(-1, -1, 1), Date(-1, -1, 3)),
        (Date(2018, -1, 1), Date(2018, -1, 3)),
    )
    for first, second in undefined_pairs:
        assert (first < second) == False
        assert (first >= second) == False
def test_execute_works_with_argmin(self):
    # Executes an argmin over a number column and checks the date selected from the result.
    selected_date = self.language.execute(
        "(select_date (argmin all_rows number_column:avg_attendance) date_column:year)"
    )
    assert selected_date == Date(2005, 3, -1)
def read_from_lines(cls,
                    lines: List[List[str]],
                    question_tokens: List[Token]) -> 'TableQuestionContext':
    """
    Builds a context from the already-split lines of a tagged table file.

    ``lines[0]`` is the header: it names the fields of every following line, and must include
    ``content``, ``number``, ``date`` and ``num2``. In every other line, field 0 is the row
    index and field 1 is the column index. The leading lines whose row index is ``'-1'``
    declare the columns; their field 2 is the column name prefixed with ``fb:row.row.``. All
    remaining lines describe one table cell each.

    For every cell, each non-empty ``date``, ``number`` and ``num2`` field is recorded, and
    ``content`` is recorded as the ``string`` value unless it is ``'—'``. Each column is then
    split into one typed column per value type seen anywhere in it, named
    ``"{type}_column:{name}"``. Values are converted as follows:

    - ``number`` / ``num2``: ``float``, or ``None`` if missing or not parseable;
    - ``date``: ``Date.make_date(...)``, or ``None`` if missing;
    - ``string``: ``cls.normalize_string(...)``, or ``None`` if missing.

    A table that declares columns but has no cell lines yields a context with no rows.

    Returns ``cls(typed_rows, column_name_type_mapping, all_column_names, question_tokens)``.
    """
    header = lines[0]  # the first line is the header
    column_index_to_name = {}
    index = 1
    # The bounds check stops the scan at the end of ``lines`` when the file contains column
    # declarations but no cell lines (this used to raise an IndexError).
    while index < len(lines) and lines[index][0] == '-1':
        # column names start with fb:row.row.
        current_line = lines[index]
        column_name_sempre = current_line[2]
        column_index = int(current_line[1])
        column_index_to_name[column_index] = column_name_sempre.replace('fb:row.row.', '')
        index += 1

    # Each row is a mapping from column names to cell data. Cell data is a dict, where keys are
    # "string", "number", "num2" and "date", and the values are the corresponding values
    # extracted by CoreNLP.
    table_data: List[Dict[str, Dict[str, str]]] = []
    column_name_type_mapping: Dict[str, Set[str]] = defaultdict(set)
    last_row_index = -1
    for current_line in lines[1:]:
        row_index = int(current_line[0])
        if row_index == -1:
            continue  # header row
        column_index = int(current_line[1])
        if row_index != last_row_index:
            # First cell of a new row.
            table_data.append({})
        node_info = dict(zip(header, current_line))
        cell_data: Dict[str, str] = {}
        column_name = column_index_to_name[column_index]
        for value_type in ("date", "number", "num2"):
            if node_info[value_type]:
                column_name_type_mapping[column_name].add(value_type)
                cell_data[value_type] = node_info[value_type]
        if node_info['content'] != '—':
            column_name_type_mapping[column_name].add("string")
            cell_data['string'] = node_info["content"]
        table_data[-1][column_name] = cell_data
        last_row_index = row_index

    # Table data with each column split into different ones, depending on the types they have.
    table_data_with_column_types: List[Dict[str, CellValueType]] = []
    all_column_names: Set[str] = set()
    for table_row in table_data:
        typed_row: Dict[str, CellValueType] = {}
        table_data_with_column_types.append(typed_row)
        for column_name, cell_data in table_row.items():
            for column_type in column_name_type_mapping[column_name]:
                typed_column_name = f"{column_type}_column:{column_name}"
                all_column_names.add(typed_column_name)
                # A cell may lack a type that other cells of the same column have.
                cell_value_string = cell_data.get(column_type, None)
                if column_type in ["number", "num2"]:
                    try:
                        cell_number = float(cell_value_string)
                    except (ValueError, TypeError):
                        # Missing (None) or unparseable number.
                        cell_number = None
                    typed_row[typed_column_name] = cell_number
                elif column_type == "date":
                    cell_date = None
                    if cell_value_string is not None:
                        cell_date = Date.make_date(cell_value_string)
                    typed_row[typed_column_name] = cell_date
                else:
                    normalized_string = None
                    if cell_value_string is not None:
                        normalized_string = cls.normalize_string(cell_value_string)
                    typed_row[typed_column_name] = normalized_string
    return cls(table_data_with_column_types,
               column_name_type_mapping,
               all_column_names,
               question_tokens)
def date(self, year: Number, month: Number, day: Number) -> Date:
    """
    Builds a ``Date`` from three numbers, passed to the constructor positionally as year,
    month and day, and returns it.
    """
    constructed = Date(year, month, day)  # type: ignore
    return constructed