def get_table_data_from_untagged_lines(
        cls, lines: List[List[str]]
    ) -> Tuple[List[Dict[str, Dict[str, str]]], Dict[str, Set[str]]]:
        """
        Extracts typed cell data from a table for which no CoreNLP tags are available, i.e. when the
        parser is run on data outside the WikiTableQuestions dataset. The processing imitates what
        CoreNLP does for WTQ, but may not be as effective.

        ``lines[0]`` is the header row, and its entries are normalized with ``cls.normalize_string``.
        Every following entry is a data row whose cells are matched to the header by position.

        Returns a pair ``(table_data, column_name_type_mapping)``. ``table_data`` has one dict per data
        row, mapping a normalized column name to a dict with the key "string" and, when the cell could
        be interpreted that way, "date", "number" and "num2". ``column_name_type_mapping`` maps each
        column name to the set of type names seen in that column.
        """
        names_by_position = {
            position: cls.normalize_string(raw_name) for position, raw_name in enumerate(lines[0])
        }
        types_by_column: Dict[str, Set[str]] = defaultdict(set)
        parsed_rows: List[Dict[str, Dict[str, str]]] = []

        for raw_row in lines[1:]:
            parsed_row: Dict[str, Dict[str, str]] = {}
            parsed_rows.append(parsed_row)
            for position, text in enumerate(raw_row):
                name = names_by_position[position]
                seen_types = types_by_column[name]
                cell: Dict[str, str] = {}

                # Date: ``Date.make_date`` either raises ``ValueError`` or gives something that prints
                # as "-1" when the text is not a date.
                try:
                    is_date = str(Date.make_date(text)) != "-1"
                except ValueError:
                    is_date = False
                if is_date:
                    cell["date"] = text
                    seen_types.add("date")

                # Plain number: anything ``float`` accepts.
                try:
                    float(text)
                except ValueError:
                    pass
                else:
                    cell["number"] = text
                    seen_types.add("number")

                # Range or score such as "3-5": exactly two numeric pieces around a single "-". This
                # overrides "number" with the first piece and stores the second one as "num2".
                if "-" in text:
                    pieces = text.split("-")
                    if len(pieces) == 2:
                        first_piece, second_piece = pieces
                        try:
                            float(first_piece)
                            float(second_piece)
                        except ValueError:
                            pass
                        else:
                            cell["number"] = first_piece
                            cell["num2"] = second_piece
                            seen_types.update(("number", "num2"))

                # Every cell is also available as a string.
                cell["string"] = text
                seen_types.add("string")
                parsed_row[name] = cell

        return parsed_rows, types_by_column
 def test_number_comparison_works(self):
     """
     Checks that ``argmax`` over a number column works on the original numerical cell values.
     """
     # TableQuestionContext normalizes all strings according to some rules, so this test makes
     # sure the original numerical values of number cells are still processed correctly.
     question_tokens = WordTokenizer().tokenize("when was the attendance the highest?")
     table_path = self.FIXTURES_ROOT / "data" / "corenlp_processed_tables" / "TEST-2.table"
     world = self._get_world_with_question_tokens_and_table_file(question_tokens, table_path)
     logical_form = "(select_date (argmax all_rows number_column:attendance) date_column:date)"
     assert world.execute(logical_form) == Date(-1, 11, 10)
Esempio n. 3
0
    def select_date(self, rows: List[Row], column: DateColumn) -> Date:
        """
        Returns the first ``Date`` value found under the given column in the given rows, looking at
        the rows in order. Values that are not ``Date`` instances are skipped. If no row holds a
        ``Date`` in that column, ``Date(-1, -1, -1)`` is returned.
        """
        column_values = [row.values[column.name] for row in rows]
        date_values = [value for value in column_values if isinstance(value, Date)]
        if date_values:
            return date_values[0]  # type: ignore
        return Date(-1, -1, -1)  # type: ignore
Esempio n. 4
0
 def min_date(self, rows: List[Row], column: DateColumn) -> Date:
     """
     Returns the smallest value found under the given column in the given rows.

     With no rows, ``Date(-1, -1, -1)`` is returned. If any of the values is not a ``Date``, an
     ``ExecutionError`` listing all the values is raised.
     """
     found_values = [row.values[column.name] for row in rows]
     if found_values:
         for candidate in found_values:
             if not isinstance(candidate, Date):
                 raise ExecutionError(f"Invalid values for date selection function: {found_values}")
         return min(found_values)  # type: ignore
     return Date(-1, -1, -1)
Esempio n. 5
0
    def read_from_lines(
            cls, lines: List,
            question_tokens: List[Token]) -> 'TableQuestionContext':
        """
        Builds a context from the lines of a table and the tokens of a question.

        If the first line is a list starting with the tagged-file header fields, the lines are handed
        to ``get_table_data_from_tagged_lines``. Otherwise every line is taken to be one table row
        with tab-separated cells and is handed to ``get_table_data_from_untagged_lines``.

        Each column is then split into one typed column per type seen in it, named
        ``"<type>_column:<name>"``. Cells become floats for "number" and "num2", the result of
        ``Date.make_date`` for "date", and normalized strings otherwise. A cell that lacks a value
        for a type (or whose number cannot be parsed) is stored as ``None``.
        """
        tagged_header_prefix = ['row', 'col', 'id', 'content', 'tokens', 'lemmaTokens']
        first_line = lines[0]
        if isinstance(first_line, list) and first_line[:6] == tagged_header_prefix:
            # These lines are from the tagged table file from the official dataset.
            table_data, column_name_type_mapping = cls.get_table_data_from_tagged_lines(lines)
        else:
            # Plain table data: rows are newline separated, columns are tab-separated.
            split_rows = [line.split('\t') for line in lines]  # type: ignore
            table_data, column_name_type_mapping = cls.get_table_data_from_untagged_lines(split_rows)

        def convert(type_name, raw_value):
            # Turns the raw string stored for one type of a cell into its typed value.
            if type_name in ("number", "num2"):
                try:
                    return float(raw_value)
                except (ValueError, TypeError):
                    return None
            if raw_value is None:
                return None
            if type_name == "date":
                return Date.make_date(raw_value)
            return cls.normalize_string(raw_value)

        # Each row of ``table_data`` maps column names to cell data. Cell data is a dict whose keys
        # are "string", "number", "num2" and "date". Below, each column is split into different
        # ones, depending on the types it has.
        typed_rows: List[Dict[str, CellValueType]] = []
        typed_column_names: Set[str] = set()
        for untyped_row in table_data:
            typed_row: Dict[str, CellValueType] = {}
            typed_rows.append(typed_row)
            for column_name, cell_data in untyped_row.items():
                for type_name in column_name_type_mapping[column_name]:
                    typed_name = f"{type_name}_column:{column_name}"
                    typed_column_names.add(typed_name)
                    typed_row[typed_name] = convert(type_name, cell_data.get(type_name))
        return cls(typed_rows, column_name_type_mapping, typed_column_names, question_tokens)
Esempio n. 6
0
 def mode_date(self, rows: List[Row], column: DateColumn) -> Date:
     """
     Returns the most frequent value under the given column in the given rows, as computed by
     ``_get_most_frequent_values`` (its first entry is used).

     If that helper returns nothing, ``Date(-1, -1, -1)`` is returned. If the chosen value is not
     a ``Date``, an ``ExecutionError`` is raised.
     """
     candidates = self._get_most_frequent_values(rows, column)
     if candidates:
         top_value = candidates[0]
         if isinstance(top_value, Date):
             return top_value
         raise ExecutionError(f"Invalid valus for mode_date: {top_value}")
     return Date(-1, -1, -1)
Esempio n. 7
0
 def _make_date(cell_string: str) -> Date:
     """
     Builds a ``Date`` from an underscore-separated cell string.

     Each part of the string is inspected in turn: a four-character decimal number is taken as the
     year, any other decimal number as the day, and a key of ``MONTH_NUMBERS`` as the month. Parts
     matching none of these are ignored. Fields that are never set stay at ``-1``, and when several
     parts fit the same field the last one wins.
     """
     year = -1
     month = -1
     day = -1
     for part in cell_string.split("_"):
         # ``isdecimal`` instead of ``isdigit``: ``isdigit`` is also true for characters such as
         # superscript digits, which ``int`` cannot parse and would turn into a ``ValueError``.
         if part.isdecimal():
             if len(part) == 4:
                 year = int(part)
             else:
                 day = int(part)
         elif part in MONTH_NUMBERS:
             month = MONTH_NUMBERS[part]
     return Date(year, month, day)
Esempio n. 8
0
 def test_execute_works_with_argmin(self):
     """
     Checks that selecting the date of the ``argmin`` row over a number column gives the expected
     ``Date``.
     """
     selected_date = self.language.execute(
         "(select_date (argmin all_rows number_column:avg_attendance) date_column:year)"
     )
     assert selected_date == Date(2005, 3, -1)
Esempio n. 9
0
 def test_date_comparison_works(self):
     """
     Pins down how ``Date`` objects compare with each other and with non-``Date`` values, including
     dates in which the year, month or day is given as ``-1``.
     """
     # Fully specified dates, and dates where one side has ``-1`` for the day or the month.
     assert Date(2013, 12, 31) > Date(2013, 12, 30)
     assert Date(2013, 12, 31) == Date(2013, 12, -1)
     assert Date(2013, -1, -1) >= Date(2013, 12, 31)
     # pylint: disable=singleton-comparison
     assert (Date(2013, 12, -1) > Date(2013, 12, 31)) == False
     # Comparing a Date with something that is not a Date must raise an ExecutionError.
     with pytest.raises(ExecutionError, match='only compare Dates with Dates'):
         assert (Date(2013, 12, 31) > 2013) == False
     with pytest.raises(ExecutionError, match='only compare Dates with Dates'):
         assert (Date(2013, 12, 31) >= 2013) == False
     with pytest.raises(ExecutionError, match='only compare Dates with Dates'):
         assert Date(2013, 12, 31) != 2013
     # When the year is known on one side only, both >= and < return False.
     assert (Date(2018, 1, 1) >= Date(-1, 2, 1)) == False
     assert (Date(2018, 1, 1) < Date(-1, 2, 1)) == False
     # When year is unknown in both cases, we can compare months and days.
     assert Date(-1, 2, 1) < Date(-1, 2, 3)
     # If both year and month are not known in both cases, the comparison is undefined, and both
     # < and >= return False.
     assert (Date(-1, -1, 1) < Date(-1, -1, 3)) == False
     assert (Date(-1, -1, 1) >= Date(-1, -1, 3)) == False
     # Same when year is known, but months are not.
     assert (Date(2018, -1, 1) < Date(2018, -1, 3)) == False
     assert (Date(2018, -1, 1) >= Date(2018, -1, 3)) == False
Esempio n. 10
0
 def date(self, year: Number, month: Number, day: Number) -> Date:
     """
     Builds a ``Date`` from three numbers, which are used as its year, month and day, in that
     order.
     """
     components = (year, month, day)
     return Date(*components)  # type: ignore
Esempio n. 11
0
    def read_from_lines(cls,
                        lines: List[List[str]],
                        question_tokens: List[Token]) -> 'TableQuestionContext':
        """
        Builds a context from the already-split lines of a tagged table file and the tokens of a
        question.

        ``lines[0]`` is the header naming the fields of every other line; the fields "content",
        "number", "num2" and "date" are read by name. In every other line, the first entry is the
        row index and the second the column index. The lines directly after the header whose row
        index is ``'-1'`` define the columns: their third entry is the column name, from which the
        ``fb:row.row.`` prefix is removed. A file holding only such lines gives an empty table.

        Each column is split into one typed column per type seen in it, named
        ``"<type>_column:<name>"``. Cells become floats for "number" and "num2", the result of
        ``Date.make_date`` for "date", and normalized strings otherwise. A cell that lacks a value
        for a type (or whose number cannot be parsed) is stored as ``None``.
        """
        header = lines[0]  # the first line is the header

        # Collect the column names from the leading run of '-1' lines. Iterating over the lines
        # (rather than advancing an index until a data line shows up) keeps this from running off
        # the end of ``lines`` when the file has no data lines at all.
        column_index_to_name = {}
        for current_line in lines[1:]:
            if current_line[0] != '-1':
                break
            # column names start with fb:row.row.
            column_name_sempre = current_line[2]
            column_index = int(current_line[1])
            column_index_to_name[column_index] = column_name_sempre.replace('fb:row.row.', '')

        # Each row is a mapping from column names to cell data. Cell data is a dict, where keys are
        # "string", "number", "num2" and "date", and the values are the corresponding values
        # extracted by CoreNLP.
        table_data: List[Dict[str, Dict[str, str]]] = []
        column_name_type_mapping: Dict[str, Set[str]] = defaultdict(set)
        last_row_index = -1
        for current_line in lines[1:]:
            row_index = int(current_line[0])
            if row_index == -1:
                continue  # header row
            column_index = int(current_line[1])
            if row_index != last_row_index:
                # A change in the row index marks the start of a new table row.
                table_data.append({})
            node_info = dict(zip(header, current_line))
            cell_data: Dict[str, str] = {}
            column_name = column_index_to_name[column_index]
            # A type is recorded for the cell (and the column) only when its field is non-empty.
            for type_name in ("date", "number", "num2"):
                if node_info[type_name]:
                    column_name_type_mapping[column_name].add(type_name)
                    cell_data[type_name] = node_info[type_name]

            # Cells whose content is '—' get no string value.
            if node_info['content'] != '—':
                column_name_type_mapping[column_name].add("string")
                cell_data['string'] = node_info["content"]

            table_data[-1][column_name] = cell_data
            last_row_index = row_index

        # Table data with each column split into different ones, depending on the types they have.
        table_data_with_column_types: List[Dict[str, CellValueType]] = []
        all_column_names: Set[str] = set()
        for table_row in table_data:
            typed_row: Dict[str, CellValueType] = {}
            table_data_with_column_types.append(typed_row)
            for column_name, cell_data in table_row.items():
                for column_type in column_name_type_mapping[column_name]:
                    typed_column_name = f"{column_type}_column:{column_name}"
                    all_column_names.add(typed_column_name)
                    cell_value_string = cell_data.get(column_type, None)
                    if column_type in ["number", "num2"]:
                        try:
                            typed_row[typed_column_name] = float(cell_value_string)
                        except (ValueError, TypeError):
                            # Missing (None) or unparseable number.
                            typed_row[typed_column_name] = None
                    elif cell_value_string is None:
                        typed_row[typed_column_name] = None
                    elif column_type == "date":
                        typed_row[typed_column_name] = Date.make_date(cell_value_string)
                    else:
                        typed_row[typed_column_name] = cls.normalize_string(cell_value_string)
        return cls(table_data_with_column_types, column_name_type_mapping, all_column_names, question_tokens)