def test_number_comparison_works(self):
     """
     Runs an ``argmax`` over a number column and checks which date gets selected.
     """
     # ``TableQuestionContext`` normalizes all strings according to some rules. This test makes
     # sure the original numerical values of number cells are still processed correctly.
     logical_form = "(select_date (argmax all_rows number_column:attendance) date_column:date)"
     question_tokens = WordTokenizer().tokenize("when was the attendance the highest?")
     table_path = self.FIXTURES_ROOT / "data" / "corenlp_processed_tables" / "TEST-2.table"
     world = self._get_world_with_question_tokens_and_table_file(question_tokens, table_path)
     assert world.execute(logical_form) == Date(-1, 11, 10)
 def min_date(self, rows: List[Row], column: DateColumn) -> Date:
     """
     Returns the smallest of the values stored under ``column`` in ``rows``. When there are no
     values, ``Date(-1, -1, -1)`` is returned. An ``ExecutionError`` is raised if any of the
     values is not a ``Date``.
     """
     values = [row.values[column.name] for row in rows]
     if values:
         # Every value has to be a ``Date`` before ``min`` is applied.
         if any(not isinstance(value, Date) for value in values):
             raise ExecutionError(f"Invalid values for date selection function: {values}")
         return min(values)  # type: ignore
     return Date(-1, -1, -1)
    def select_date(self, rows: List[Row], column: DateColumn) -> Date:
        """
        Returns the first ``Date`` found under ``column`` when going through ``rows`` in order.
        Cells that do not hold a ``Date`` are skipped, and ``Date(-1, -1, -1)`` is returned when
        none of the rows has one.
        """
        cells = [row.values[column.name] for row in rows]
        dates = [cell for cell in cells if isinstance(cell, Date)]
        if not dates:
            return Date(-1, -1, -1)
        return dates[0]
 def mode_date(self, rows: List[Row], column: DateColumn) -> Date:
     """
     Returns the first of the most frequent values under ``column`` in ``rows``, as computed by
     ``_get_most_frequent_values``. When that list is empty, ``Date(-1, -1, -1)`` is returned.
     An ``ExecutionError`` is raised if the chosen value is not a ``Date``.
     """
     candidates = self._get_most_frequent_values(rows, column)
     if candidates:
         top_value = candidates[0]
         if isinstance(top_value, Date):
             return top_value
         raise ExecutionError(f"Invalid valus for mode_date: {top_value}")
     return Date(-1, -1, -1)
 def _make_date(cell_string: str) -> Date:
     """
     Builds a ``Date`` from the underscore-separated parts of ``cell_string``. A four-character
     all-digit part becomes the year, any other all-digit part becomes the day, and a part found
     in ``MONTH_NUMBERS`` becomes the month. Fields that no part provides stay at ``-1``.
     """
     year = month = day = -1
     for token in cell_string.split("_"):
         if not token.isdigit():
             # Non-numeric parts only matter when they name a month.
             if token in MONTH_NUMBERS:
                 month = MONTH_NUMBERS[token]
             continue
         if len(token) == 4:
             year = int(token)
         else:
             day = int(token)
     return Date(year, month, day)
Example #6
0
 def test_date_comparison_works(self):
     # The explicit ``== False`` checks below are intentional.
     # pylint: disable=singleton-comparison
     # Message expected whenever a ``Date`` is compared with something that is not a ``Date``.
     mismatch_message = 'only compare Dates with Dates'

     # Comparisons between dates that share a known year.
     assert Date(2013, 12, 31) > Date(2013, 12, 30)
     assert Date(2013, 12, 31) == Date(2013, 12, -1)
     assert Date(2013, -1, -1) >= Date(2013, 12, 31)
     assert (Date(2013, 12, -1) > Date(2013, 12, 31)) == False

     # Comparing a ``Date`` with a plain number is expected to raise instead of returning.
     with pytest.raises(ExecutionError, match=mismatch_message):
         assert (Date(2013, 12, 31) > 2013) == False
     with pytest.raises(ExecutionError, match=mismatch_message):
         assert (Date(2013, 12, 31) >= 2013) == False
     with pytest.raises(ExecutionError, match=mismatch_message):
         assert Date(2013, 12, 31) != 2013

     # The year is known on one side only: neither >= nor < is expected to hold.
     assert (Date(2018, 1, 1) >= Date(-1, 2, 1)) == False
     assert (Date(2018, 1, 1) < Date(-1, 2, 1)) == False

     # When the year is unknown on both sides, months and days can still be compared.
     assert Date(-1, 2, 1) < Date(-1, 2, 3)

     # If neither the year nor the month is known on either side, the comparison is undefined,
     # and both < and >= return False.
     assert (Date(-1, -1, 1) < Date(-1, -1, 3)) == False
     assert (Date(-1, -1, 1) >= Date(-1, -1, 3)) == False

     # The same holds when the year is known but the months are not.
     assert (Date(2018, -1, 1) < Date(2018, -1, 3)) == False
     assert (Date(2018, -1, 1) >= Date(2018, -1, 3)) == False
 def test_execute_works_with_argmin(self):
     """
     Executes an ``argmin`` over a number column and checks the date selected from the result.
     """
     selected_date = self.language.execute(
         "(select_date (argmin all_rows number_column:avg_attendance) date_column:year)"
     )
     assert selected_date == Date(2005, 3, -1)
Example #8
0
    def read_from_lines(
            cls, lines: List[List[str]],
            question_tokens: List[Token]) -> 'TableQuestionContext':
        """
        Builds a context from the already-split lines of a tagged table file.

        ``lines[0]`` is the header; its entries name the fields of every following line. Lines
        whose first field is ``-1`` describe columns: the second field is the column index and
        the third is the column name prefixed with ``fb:row.row.``. Every other line describes
        one cell: the first field is its row index and the second its column index. The fields
        named ``content``, ``number``, ``num2`` and ``date`` in the header are read from it.

        Each column is split into one typed column per kind of value seen anywhere in it, named
        ``{type}_column:{name}`` where type is ``string``, ``number``, ``num2`` or ``date``.
        A cell that lacks a value of a given type gets ``None`` in that typed column.

        A table that has column-description lines but no cell lines yields an empty table
        instead of raising ``IndexError``.

        Returns the result of calling ``cls`` with the typed table data, the mapping from
        column name to the set of types seen, the set of typed column names, and
        ``question_tokens``.
        """
        column_index_to_name = {}

        header = lines[0]  # the first line is the header
        index = 1
        # Each row is a mapping from column names to cell data. Cell data is a dict, where keys are
        # "string", "number", "num2" and "date", and the values are the corresponding values
        # extracted by CoreNLP.
        table_data: List[Dict[str, Dict[str, str]]] = []
        # The bounds check matters when the table has no cell lines at all: without it the scan
        # over the leading column-description lines would run past the end of ``lines``.
        while index < len(lines) and lines[index][0] == '-1':
            # column names start with fb:row.row.
            current_line = lines[index]
            column_name_sempre = current_line[2]
            column_index = int(current_line[1])
            column_name = column_name_sempre.replace('fb:row.row.', '')
            column_index_to_name[column_index] = column_name
            index += 1
        column_name_type_mapping: Dict[str, Set[str]] = defaultdict(set)
        last_row_index = -1
        for current_line in lines[1:]:
            row_index = int(current_line[0])
            if row_index == -1:
                continue  # header row
            column_index = int(current_line[1])
            # A change in row index marks the first cell of a new table row.
            if row_index != last_row_index:
                table_data.append({})
            node_info = dict(zip(header, current_line))
            cell_data: Dict[str, str] = {}
            column_name = column_index_to_name[column_index]
            if node_info['date']:
                column_name_type_mapping[column_name].add("date")
                cell_data["date"] = node_info["date"]

            if node_info['number']:
                column_name_type_mapping[column_name].add("number")
                cell_data["number"] = node_info["number"]

            if node_info['num2']:
                column_name_type_mapping[column_name].add("num2")
                cell_data["num2"] = node_info["num2"]

            # A lone em dash is not recorded as string content.
            if node_info['content'] != '—':
                column_name_type_mapping[column_name].add("string")
                cell_data['string'] = node_info["content"]

            table_data[-1][column_name] = cell_data
            last_row_index = row_index
        # Table data with each column split into different ones, depending on the types they have.
        table_data_with_column_types: List[Dict[str, CellValueType]] = []
        all_column_names: Set[str] = set()
        for table_row in table_data:
            table_data_with_column_types.append({})
            for column_name, cell_data in table_row.items():
                for column_type in column_name_type_mapping[column_name]:
                    typed_column_name = f"{column_type}_column:{column_name}"
                    all_column_names.add(typed_column_name)
                    cell_value_string = cell_data.get(column_type)
                    if column_type in ("number", "num2"):
                        # A missing value (``None``) or an unparsable string both become ``None``.
                        try:
                            cell_number = float(cell_value_string)
                        except (ValueError, TypeError):
                            cell_number = None
                        table_data_with_column_types[-1][
                            typed_column_name] = cell_number
                    elif column_type == "date":
                        cell_date = None
                        if cell_value_string is not None:
                            cell_date = Date.make_date(cell_value_string)
                        table_data_with_column_types[-1][
                            typed_column_name] = cell_date
                    else:
                        if cell_value_string is None:
                            normalized_string = None
                        else:
                            normalized_string = cls.normalize_string(
                                cell_value_string)
                        table_data_with_column_types[-1][
                            typed_column_name] = normalized_string
        return cls(table_data_with_column_types, column_name_type_mapping,
                   all_column_names, question_tokens)
 def date(self, year: Number, month: Number, day: Number) -> Date:
     """
     Takes three numbers and returns a ``Date`` object whose year, month, and day are the three
     numbers in that order. The arguments are passed to ``Date`` positionally and unchanged; no
     validation or conversion happens here.
     """
     # NOTE(review): the ``type: ignore`` presumably covers a mismatch between ``Number`` and
     # the argument types ``Date`` declares - confirm against the ``Date`` constructor.
     return Date(year, month, day)  # type: ignore