def add_dates_to_number_linking_scores(
            self, number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
            current_tokenized_utterance: List[Token]) -> None:
        """
        Adds the year, month and day of every date in ``self.dates`` to ``number_linking_scores``.

        For each date three entries are written, keyed by the action string returned from
        ``format_action`` for the ``year_number``, ``month_number`` and ``day_number``
        nonterminals. Each value is ``(nonterminal, number_as_string, linking)``, where
        ``linking`` holds one 0/1 flag per token in ``current_tokenized_utterance``:

        - year: the token text equals ``str(date.year)``;
        - month: the token text equals the ``MONTH_NUMBERS`` key whose value is the month;
        - day: the token text, or two adjacent token texts joined by a space, equals the
          ``DAY_NUMBERS`` key whose value is the day.

        ``number_linking_scores`` is modified in place; nothing is added when ``self.dates``
        is empty or ``None``.
        """
        # Invert the name -> number tables so a name can be found from a date's numeric field.
        names_by_month = {
            str(value): name
            for name, value in MONTH_NUMBERS.items()
        }
        names_by_day = {
            str(value): name
            for name, value in DAY_NUMBERS.items()
        }

        if not self.dates:
            return

        for date in self.dates:
            # Year: a token links when it is exactly the year's digits.
            year_links = [
                1 if token.text == str(date.year) else 0
                for token in current_tokenized_utterance
            ]
            year_action = format_action(nonterminal='year_number',
                                        right_hand_side=str(date.year),
                                        is_number=True,
                                        keywords_to_uppercase=KEYWORDS)
            number_linking_scores[year_action] = ('year_number',
                                                  str(date.year),
                                                  year_links)

            # Month: the name lookup is kept inside the comparison so it is only
            # attempted when there is at least one token to compare against.
            month_links = [
                1 if token.text == names_by_month[str(date.month)] else 0
                for token in current_tokenized_utterance
            ]
            month_action = format_action(nonterminal='month_number',
                                         right_hand_side=str(date.month),
                                         is_number=True,
                                         keywords_to_uppercase=KEYWORDS)
            number_linking_scores[month_action] = ('month_number',
                                                   str(date.month),
                                                   month_links)

            # Day: single-token matches first, then names spread over two tokens.
            day_links = [
                1 if token.text == names_by_day[str(date.day)] else 0
                for token in current_tokenized_utterance
            ]
            token_texts = [token.text for token in current_tokenized_utterance]
            for start, pair in enumerate(bigrams(token_texts)):
                if ' '.join(pair) == names_by_day[str(date.day)]:
                    # Both halves of the bigram are linked to the day.
                    day_links[start] = 1
                    day_links[start + 1] = 1
            day_action = format_action(nonterminal='day_number',
                                       right_hand_side=str(date.day),
                                       is_number=True,
                                       keywords_to_uppercase=KEYWORDS)
            number_linking_scores[day_action] = ('day_number',
                                                 str(date.day),
                                                 day_links)
 def add_to_number_linking_scores(
         self, all_numbers: Set[str],
         number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
         get_number_linking_dict: Callable[[str, List[Token]],
                                           Dict[str, List[int]]],
         current_tokenized_utterance: List[Token],
         nonterminal: str) -> None:
     """
     Helper that registers one kind of number (eg. starting time ranges) as entities.

     ``get_number_linking_dict`` is run over every utterance of the interaction and each
     number it reports is added to ``all_numbers`` (which the caller may have pre-filled with
     defaults). Numbers from the whole interaction are kept, not only those of the current
     turn, because a query may use a number that was mentioned earlier.

     Every number in ``all_numbers`` then gets an entry in ``number_linking_scores`` keyed by
     its ``format_action`` string, with the value ``(nonterminal, number, linking)``. ``linking``
     has one 0/1 flag per token of ``current_tokenized_utterance``. Only the token indices
     reported for the last utterance are used to set flags, so a number mentioned only in
     earlier utterances ends up with all zeros.

     Both ``all_numbers`` and ``number_linking_scores`` are modified in place.
     """
     # After the loop this holds the result for the final utterance only.
     latest_linking: Dict[str, List[int]] = {}
     for raw_utterance, utterance_tokens in zip(self.utterances,
                                                self.tokenized_utterances):
         latest_linking = get_number_linking_dict(raw_utterance,
                                                  utterance_tokens)
         all_numbers.update(latest_linking.keys())

     for number in sorted(all_numbers, reverse=True):
         linking_row = [0 for _ in current_tokenized_utterance]
         for position in latest_linking.get(number, []):
             # Skip indices that fall beyond the end of the current utterance.
             if position < len(linking_row):
                 linking_row[position] = 1
         action = format_action(nonterminal,
                                number,
                                is_number=True,
                                keywords_to_uppercase=KEYWORDS)
         number_linking_scores[action] = (nonterminal, number, linking_row)
Exemple #3
0
    def create_grammar_dict_and_strings(
            self) -> Tuple[Dict[str, List[str]], List[Tuple[str, str]]]:
        """
        Builds the grammar dictionary and the list of string actions for this database.

        Starts from a deep copy of ``GRAMMAR_DICTIONARY`` and then:

        - when ``self.all_tables`` is non-empty, fills in the ``table_name``, ``col_ref`` and
          ``col`` rules from its table and column names;
        - when ``self.tables_with_strings`` is non-empty, queries ``self.cursor`` for the
          distinct values of each listed column, adds a ``{table}_{column}_string`` rule with
          those values, and records one ``(action, value)`` pair per value;
        - always sets the ``biexpr`` rule to the per-column expressions followed by the two
          generic ones.

        Every generated rule is sorted in reverse order.

        Returns the grammar dictionary and the list of ``(action, value)`` pairs.
        """
        grammar = deepcopy(GRAMMAR_DICTIONARY)
        string_actions = []

        if self.all_tables:
            quoted_tables = [f'"{name}"' for name in self.all_tables.keys()]
            grammar['table_name'] = sorted(quoted_tables, reverse=True)

            column_refs = ['"*"', 'agg']
            column_names = []
            for table, columns in self.all_tables.items():
                for column in columns:
                    column_refs.append(f'("{table}" ws "." ws "{column}")')
                column_names.extend(columns)
            grammar['col_ref'] = sorted(column_refs, reverse=True)

            quoted_columns = [f'"{column}"' for column in column_names]
            grammar['col'] = sorted(quoted_columns, reverse=True)

        binary_expressions = []
        if self.tables_with_strings:
            for table, columns in self.tables_with_strings.items():
                for column in columns:
                    binary_expressions.append(
                        f'("{table}" ws "." ws "{column}" ws binaryop ws {table}_{column}_string)'
                    )

                for column in columns:
                    rule_name = f"{table}_{column}_string"
                    self.cursor.execute(
                        f'SELECT DISTINCT {table} . {column} FROM {table}')
                    rows = self.cursor.fetchall()

                    # Almost all the query values are in the database, we hardcode the rare case here.
                    if table == 'flight' and column == 'airline_code':
                        rows.append(('EA', ))

                    values = [str(row[0]) for row in rows]

                    # The action flags depend on 'number' appearing anywhere in the
                    # column name, whereas the quoting below depends on the suffix.
                    mentions_number = 'number' in column
                    for value in values:
                        action = format_action(rule_name,
                                               value,
                                               is_string=not mentions_number,
                                               is_number=mentions_number)
                        string_actions.append((action, value))

                    if column.endswith('number'):
                        productions = [f'"{value}"' for value in values]
                    else:
                        # Non-numeric values carry an extra pair of single quotes.
                        productions = [f'"\'{value}\'"' for value in values]
                    grammar[rule_name] = sorted(productions, reverse=True)

        generic_expressions = ['( col_ref ws binaryop ws value)',
                               '(value ws binaryop ws value)']
        grammar['biexpr'] = sorted(binary_expressions,
                                   reverse=True) + generic_expressions
        return grammar, string_actions
    def create_grammar_dict_and_strings(
            self) -> Tuple[Dict[str, List[str]], List[Tuple[str, str]]]:
        """
        Returns the grammar dictionary for this database together with its string actions.

        The grammar is a deep copy of ``GRAMMAR_DICTIONARY`` with these rules replaced:

        - ``table_name``, ``col_ref`` and ``col``, built from ``self.all_tables`` when it is
          non-empty;
        - one ``{table}_{column}_string`` rule per column in ``self.tables_with_strings``,
          holding the distinct values fetched through ``self.cursor``;
        - ``biexpr``, which is always set: the per-column comparison expressions followed by
          the two generic forms.

        The second element of the result lists ``(action, value)`` pairs, one for each
        fetched value, where ``action`` comes from ``format_action``. All generated rules
        are sorted in reverse order.
        """
        grammar = deepcopy(GRAMMAR_DICTIONARY)
        action_value_pairs = []

        if self.all_tables:
            grammar["table_name"] = sorted(
                (f'"{table}"' for table in self.all_tables.keys()),
                reverse=True)

            references = ['"*"', "agg"]
            every_column = []
            for table, columns in self.all_tables.items():
                references += [
                    f'("{table}" ws "." ws "{column}")' for column in columns
                ]
                every_column += list(columns)
            grammar["col_ref"] = sorted(references, reverse=True)
            grammar["col"] = sorted(
                (f'"{column}"' for column in every_column), reverse=True)

        column_comparisons = []
        # ``or {}`` keeps the loop a no-op when no string tables are configured.
        for table, columns in (self.tables_with_strings or {}).items():
            column_comparisons += [
                f'("{table}" ws "." ws "{column}" ws binaryop ws {table}_{column}_string)'
                for column in columns
            ]
            for column in columns:
                self.cursor.execute(
                    f"SELECT DISTINCT {table} . {column} FROM {table}")
                fetched = self.cursor.fetchall()

                # Almost all the query values are in the database, we hardcode the rare case here.
                if table == "flight" and column == "airline_code":
                    fetched.append(("EA", ))

                rule = f"{table}_{column}_string"
                texts = [str(row[0]) for row in fetched]

                # Actions are flagged as numbers when "number" occurs anywhere in the name.
                numeric_action = "number" in column
                action_value_pairs += [(
                    format_action(
                        rule,
                        text,
                        is_string=not numeric_action,
                        is_number=numeric_action,
                    ),
                    text,
                ) for text in texts]

                # Only columns whose name ends in "number" are emitted without the
                # inner single quotes.
                inner_quote = "" if column.endswith("number") else "'"
                grammar[rule] = sorted(
                    (f'"{inner_quote}{text}{inner_quote}"' for text in texts),
                    reverse=True)

        biexpr_rule = sorted(column_comparisons, reverse=True)
        biexpr_rule.append("( col_ref ws binaryop ws value)")
        biexpr_rule.append("(value ws binaryop ws value)")
        grammar["biexpr"] = biexpr_rule
        return grammar, action_value_pairs