Exemple #1
0
    def add_dates_to_number_linking_scores(
            self, number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
            current_tokenized_utterance: List[Token]) -> None:
        """
        Adds a ``year_number``, ``month_number`` and ``day_number`` entity for every date in
        ``self.dates`` to ``number_linking_scores``, which is modified in place.

        Each entry maps the action built by ``format_action`` to a tuple of the nonterminal, the
        number as a string, and a list holding one 0/1 flag per token of
        ``current_tokenized_utterance``. A year links to tokens whose text equals the year. A
        month or a day links to tokens whose text equals the word found for that number in the
        reversed ``MONTH_NUMBERS`` / ``DAY_NUMBERS`` table. A day is additionally compared with
        every pair of adjacent tokens joined by a single space; a match flags both tokens.

        Nothing is added when ``self.dates`` is empty. A ``KeyError`` is raised when a date's
        month or day has no entry in the corresponding table.
        """
        if not self.dates:
            return

        month_reverse_lookup = {
            str(number): string
            for string, number in MONTH_NUMBERS.items()
        }
        day_reverse_lookup = {
            str(number): string
            for string, number in DAY_NUMBERS.items()
        }

        # The utterance is the same for every date, so the token texts and the joined bigrams
        # are computed once instead of once per date.
        token_texts = [token.text for token in current_tokenized_utterance]
        bigram_texts = [' '.join(bigram) for bigram in bigrams(token_texts)]

        def link_tokens(target: str) -> List[int]:
            # One flag per token: 1 where the token text equals ``target``, otherwise 0.
            return [1 if text == target else 0 for text in token_texts]

        def add_entity(nonterminal: str, value: str,
                       entity_linking: List[int]) -> None:
            # Registers one number entity under the action string that produces it.
            action = format_action(nonterminal=nonterminal,
                                   right_hand_side=value,
                                   is_number=True,
                                   keywords_to_uppercase=KEYWORDS)
            number_linking_scores[action] = (nonterminal, value,
                                             entity_linking)

        for date in self.dates:
            year = str(date.year)
            add_entity('year_number', year, link_tokens(year))

            month = str(date.month)
            add_entity('month_number', month,
                       link_tokens(month_reverse_lookup[month]))

            day = str(date.day)
            day_word = day_reverse_lookup[day]
            day_linking = link_tokens(day_word)
            # Bigram ``i`` covers tokens ``i`` and ``i + 1``, so both positions are flagged.
            for bigram_index, bigram_text in enumerate(bigram_texts):
                if bigram_text == day_word:
                    day_linking[bigram_index] = 1
                    day_linking[bigram_index + 1] = 1
            add_entity('day_number', day, day_linking)
Exemple #2
0
 def add_to_number_linking_scores(
         self, all_numbers: Set[str],
         number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
         get_number_linking_dict: Callable[[str, List[Token]],
                                           Dict[str, List[int]]],
         current_tokenized_utterance: List[Token],
         nonterminal: str) -> None:
     """
     Helper that registers one kind of number (eg. starting time ranges) as entities.

     ``get_number_linking_dict`` is applied to every utterance of the interaction and each
     number it reports is added to ``all_numbers``, which the caller seeds with default values
     and which is updated in place. Numbers from the whole interaction are collected, not only
     those of the current turn, because the query may use a number that was triggered earlier.
     Every collected number is then stored in ``number_linking_scores`` under its formatted
     action, together with ``nonterminal``, the number itself and a 0/1 linking list that has
     one slot per token of ``current_tokenized_utterance``.
     """
     latest_linking: Dict[str, List[int]] = {}
     for raw_utterance, utterance_tokens in zip(self.utterances,
                                                self.tokenized_utterances):
         latest_linking = get_number_linking_dict(raw_utterance,
                                                  utterance_tokens)
         all_numbers.update(latest_linking.keys())

     utterance_length = len(current_tokenized_utterance)
     for number in sorted(all_numbers, reverse=True):
         entity_linking = [0] * utterance_length
         # After the loop above ``latest_linking`` describes the last utterance only, so a
         # number that was triggered earlier keeps an all-zero linking list.
         for position in latest_linking.get(number, []):
             if position < utterance_length:
                 entity_linking[position] = 1
         action = format_action(nonterminal,
                                number,
                                is_number=True,
                                keywords_to_uppercase=KEYWORDS)
         number_linking_scores[action] = (nonterminal, number,
                                          entity_linking)
Exemple #3
0
    def create_grammar_dict_and_strings(
            self) -> Tuple[Dict[str, List[str]], List[Tuple[str, str]]]:
        """
        Builds the grammar dictionary for this database together with its string entities.

        Starts from a deep copy of ``GRAMMAR_DICTIONARY``. When ``self.all_tables`` is
        non-empty, the ``table_name``, ``col_ref`` and ``col`` rules are replaced by the quoted
        table and column names. For every column listed in ``self.tables_with_strings`` the
        distinct values are read through ``self.cursor``; they become a
        ``{table}_{column}_string`` rule and one ``(action, value)`` pair each in the returned
        list. The ``biexpr`` rule is always rewritten. All rule alternatives are sorted in
        reverse order.

        Returns the grammar dictionary and the list of ``(action, value)`` pairs.
        """
        grammar_dictionary = deepcopy(GRAMMAR_DICTIONARY)
        strings_list = []

        if self.all_tables:
            grammar_dictionary['table_name'] = sorted(
                (f'"{table}"' for table in self.all_tables.keys()),
                reverse=True)

            column_references = ['"*"', 'agg']
            column_names = []
            for table, columns in self.all_tables.items():
                for column in columns:
                    column_references.append(
                        f'("{table}" ws "." ws "{column}")')
                    column_names.append(column)
            grammar_dictionary['col_ref'] = sorted(column_references,
                                                   reverse=True)
            grammar_dictionary['col'] = sorted(
                (f'"{column}"' for column in column_names), reverse=True)

        biexprs = []
        if self.tables_with_strings:
            for table, columns in self.tables_with_strings.items():
                for column in columns:
                    biexprs.append(
                        f'("{table}" ws "." ws "{column}" ws binaryop ws {table}_{column}_string)'
                    )

                for column in columns:
                    rule_name = f"{table}_{column}_string"
                    self.cursor.execute(
                        f'SELECT DISTINCT {table} . {column} FROM {table}')
                    results = self.cursor.fetchall()

                    # Almost all the query values are in the database, we hardcode the rare case here.
                    if table == 'flight' and column == 'airline_code':
                        results.append(('EA', ))

                    values = [str(row[0]) for row in results]
                    holds_number = 'number' in column
                    for value in values:
                        action = format_action(rule_name,
                                               value,
                                               is_string=not holds_number,
                                               is_number=holds_number)
                        strings_list.append((action, value))

                    # Columns whose name ends in ``number`` get bare terminals; every other
                    # column gets its values wrapped in single quotes.
                    if column.endswith('number'):
                        terminals = [f'"{value}"' for value in values]
                    else:
                        terminals = [f'"\'{value}\'"' for value in values]
                    grammar_dictionary[rule_name] = sorted(terminals,
                                                           reverse=True)

        generic_biexprs = [
            '( col_ref ws binaryop ws value)', '(value ws binaryop ws value)'
        ]
        grammar_dictionary['biexpr'] = sorted(biexprs,
                                              reverse=True) + generic_biexprs
        return grammar_dictionary, strings_list
    def create_grammar_dict_and_strings(self) -> Tuple[Dict[str, List[str]], List[Tuple[str, str]]]:
        """
        Returns the database-specific grammar dictionary and the list of string entities.

        The grammar is a deep copy of ``GRAMMAR_DICTIONARY`` in which ``table_name``,
        ``col_ref`` and ``col`` are filled from ``self.all_tables`` (when it is non-empty), one
        ``{table}_{column}_string`` rule is added per column of ``self.tables_with_strings``
        from the distinct values fetched through ``self.cursor``, and ``biexpr`` is rebuilt.
        The second element of the result holds one ``(action, value)`` tuple per fetched value.
        """
        grammar_dictionary = deepcopy(GRAMMAR_DICTIONARY)
        strings_list = []

        if self.all_tables:
            table_names = [f'"{table}"' for table in self.all_tables.keys()]
            table_names.sort(reverse=True)
            grammar_dictionary['table_name'] = table_names

            col_refs = ['"*"', 'agg']
            col_refs += [f'("{table}" ws "." ws "{column}")'
                         for table, columns in self.all_tables.items()
                         for column in columns]
            col_refs.sort(reverse=True)
            grammar_dictionary['col_ref'] = col_refs

            cols = [f'"{column}"'
                    for columns in self.all_tables.values()
                    for column in columns]
            cols.sort(reverse=True)
            grammar_dictionary['col'] = cols

        biexprs = []
        # A missing or empty ``tables_with_strings`` simply contributes no rules.
        for table, columns in (self.tables_with_strings or {}).items():
            biexprs += [f'("{table}" ws "." ws "{column}" ws binaryop ws {table}_{column}_string)'
                        for column in columns]
            for column in columns:
                self.cursor.execute(f'SELECT DISTINCT {table} . {column} FROM {table}')
                results = self.cursor.fetchall()

                # Almost all the query values are in the database, we hardcode the rare case here.
                if table == 'flight' and column == 'airline_code':
                    results.append(('EA',))

                nonterminal = f"{table}_{column}_string"
                for row in results:
                    action = format_action(nonterminal,
                                           str(row[0]),
                                           is_string='number' not in column,
                                           is_number='number' in column)
                    strings_list.append((action, str(row[0])))

                # Only columns whose name ends in ``number`` are emitted without single quotes.
                if column.endswith('number'):
                    terminals = [f'"{str(row[0])}"' for row in results]
                else:
                    terminals = [f'"\'{str(row[0])}\'"' for row in results]
                terminals.sort(reverse=True)
                grammar_dictionary[nonterminal] = terminals

        biexprs.sort(reverse=True)
        grammar_dictionary['biexpr'] = biexprs + \
                ['( col_ref ws binaryop ws value)', '(value ws binaryop ws value)']
        return grammar_dictionary, strings_list
    def create_grammar_dict_and_strings(
            self) -> Tuple[Dict[str, List[str]], List[Tuple[str, str]]]:
        """
        Derives the grammar rules and string entities for the current database.

        A deep copy of ``GRAMMAR_DICTIONARY`` is specialised as follows: ``table_name``,
        ``col_ref`` and ``col`` come from ``self.all_tables`` if it is non-empty; every column
        in ``self.tables_with_strings`` gets a ``{table}_{column}_string`` rule listing its
        distinct values as read through ``self.cursor``; ``biexpr`` lists the per-column
        comparisons followed by two generic ones. Alternatives are sorted in reverse order.

        Returns a pair: the grammar dictionary, and a list of ``(action, value)`` tuples with
        one tuple per distinct column value.
        """
        grammar_dictionary = deepcopy(GRAMMAR_DICTIONARY)
        strings_list = []

        def distinct_values(table, column):
            # Reads the distinct values of one column and returns them as strings.
            self.cursor.execute(
                f"SELECT DISTINCT {table} . {column} FROM {table}")
            rows = self.cursor.fetchall()
            # Almost all the query values are in the database, we hardcode the rare case here.
            if table == "flight" and column == "airline_code":
                rows.append(("EA", ))
            return [str(row[0]) for row in rows]

        schema = self.all_tables
        if schema:
            quoted_tables = [f'"{table}"' for table in schema.keys()]
            references = ['"*"', "agg"]
            quoted_columns = []
            for table, columns in schema.items():
                for column in columns:
                    references.append(f'("{table}" ws "." ws "{column}")')
                    quoted_columns.append(f'"{column}"')
            grammar_dictionary["table_name"] = sorted(quoted_tables,
                                                      reverse=True)
            grammar_dictionary["col_ref"] = sorted(references, reverse=True)
            grammar_dictionary["col"] = sorted(quoted_columns, reverse=True)

        biexprs = []
        string_tables = self.tables_with_strings
        if string_tables:
            for table, columns in string_tables.items():
                for column in columns:
                    biexprs.append(
                        f'("{table}" ws "." ws "{column}" ws binaryop ws {table}_{column}_string)'
                    )

                for column in columns:
                    values = distinct_values(table, column)
                    rule = f"{table}_{column}_string"
                    number_like = "number" in column

                    strings_list.extend(
                        (format_action(rule,
                                       value,
                                       is_string=not number_like,
                                       is_number=number_like), value)
                        for value in values)

                    # Single quotes are added unless the column name ends in ``number``.
                    if column.endswith("number"):
                        alternatives = [f'"{value}"' for value in values]
                    else:
                        alternatives = [f"\"'{value}'\"" for value in values]
                    grammar_dictionary[rule] = sorted(alternatives,
                                                      reverse=True)

        grammar_dictionary["biexpr"] = [
            *sorted(biexprs, reverse=True),
            "( col_ref ws binaryop ws value)",
            "(value ws binaryop ws value)",
        ]
        return grammar_dictionary, strings_list
Exemple #6
0
    def _get_linked_entities(
            self) -> Dict[str, Dict[str, Tuple[str, str, List[int]]]]:
        """
        Collects the entities of the interaction and the tokens of the current utterance they
        are linked to.

        The result has two keys, ``number`` and ``string``. Each maps an action to a tuple of
        the nonterminal, the entity's string value and a 0/1 linking list with one slot per
        token of the last tokenized utterance (an empty list when there are no utterances).
        These entities are relied on later for updating the valid actions and the grammar.
        """
        if self.tokenized_utterances:
            current_tokenized_utterance = self.tokenized_utterances[-1]
        else:
            current_tokenized_utterance = []

        number_linking_scores: Dict[str, Tuple[str, str, List[int]]] = {}
        string_linking_scores: Dict[str, Tuple[str, str, List[int]]] = {}

        # One row per kind of number: the default values that are always available, the
        # function extracting that kind from an utterance, and the nonterminal it belongs to.
        number_sources = [
            ({'0'}, get_time_range_start_from_utterance, 'time_range_start'),
            ({"1200"}, get_time_range_end_from_utterance, 'time_range_end'),
            ({'0', '1'}, get_numbers_from_utterance, 'number'),
        ]
        for default_numbers, extractor, nonterminal in number_sources:
            self.add_to_number_linking_scores(default_numbers,
                                              number_linking_scores,
                                              extractor,
                                              current_tokenized_utterance,
                                              nonterminal)

        # Only the dictionary computed for the final utterance is kept.
        # NOTE(review): the calls for earlier utterances look redundant; they could presumably
        # be skipped if ``get_strings_from_utterance`` has no side effects - confirm.
        string_linking_dict: Dict[str, List[int]] = {}
        for tokenized_utterance in self.tokenized_utterances:
            string_linking_dict = get_strings_from_utterance(
                tokenized_utterance)

        # Every distinct value of every string column becomes a candidate string entity.
        strings_list = []
        if self.tables_with_strings:
            for table, columns in self.tables_with_strings.items():
                for column in columns:
                    self.cursor.execute(
                        f'SELECT DISTINCT {table} . {column} FROM {table}')
                    number_column = 'number' in column
                    for row in self.cursor.fetchall():
                        value = str(row[0])
                        action = format_action(f"{table}_{column}_string",
                                               value,
                                               is_string=not number_column,
                                               is_number=number_column,
                                               keywords_to_uppercase=KEYWORDS)
                        strings_list.append((action, value))

        # A string that does not occur in the last utterance keeps an all-zero linking list.
        for action, string_value in strings_list:
            entity_linking = [0 for _ in current_tokenized_utterance]
            for token_index in string_linking_dict.get(string_value, []):
                entity_linking[token_index] = 1
            # The nonterminal is the part of the action before the `` -> `` separator.
            string_linking_scores[action] = (action.split(' -> ')[0],
                                             string_value, entity_linking)

        return {
            'number': number_linking_scores,
            'string': string_linking_scores,
        }