Example #1
 def test_one_of(self):
     len_eq(OneOf(Literal('aaa'), Literal('bb')).match('aaa'),
            3)  # first alternative
     len_eq(OneOf(Literal('aaa'), Literal('bb')).match('bbaaa'),
            2)  # second
     assert_raises(ParseError,
                   OneOf(Literal('aaa'), Literal('bb')).match,
                   'aa')  # no match
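OneOf is a PEG ordered choice: alternatives are tried left to right and the first one that matches wins, even if a later alternative could consume more text. A minimal sketch (not part of the original test suite) of that behaviour:

from parsimonious.expressions import Literal, OneOf

# The shorter first alternative wins even though the second could match more.
choice = OneOf(Literal('a'), Literal('aaa'), name='greedy_trap')
node = choice.match('aaa')
print(len(node.text))  # 1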
Example #2
    def _expressions_from_rules(self, rule_syntax):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        ws = Regex(r'\s+', name='ws')
        _ = Regex(r'[ \t]+', name='_')
        label = Regex(r'[a-zA-Z_][a-zA-Z_0-9]*', name='label')
        quantifier = Regex(r'[*+?]', name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                        ignore_case=True,
                        dot_all=True,
                        name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsux]*', ignore_case=True),
                         name='regex')
        atom = OneOf(label, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')
        term = OneOf(quantified, atom, name='term')
        another_term = Sequence(_, term, name='another_term')
        sequence = Sequence(term, OneOrMore(another_term), name='sequence')
        or_term = Sequence(_, Literal('/'), another_term, name='or_term')
        ored = Sequence(term, OneOrMore(or_term), name='ored')
        and_term = Sequence(_, Literal('&'), another_term, name='and_term')
        anded = Sequence(term, OneOrMore(and_term), name='anded')
        poly_term = OneOf(anded, ored, sequence, name='poly_term')
        rhs = OneOf(poly_term, term, name='rhs')
        eol = Regex(r'[\r\n$]', name='eol')  # TODO: Support $.
        rule = Sequence(Optional(ws),
                        label,
                        Optional(_),
                        Literal('='),
                        Optional(_),
                        rhs,
                        Optional(_),
                        eol,
                        name='rule')
        rules = Sequence(OneOrMore(rule), Optional(ws), name='rules')

        # Use those hard-coded rules to parse the (possibly more extensive)
        # rule syntax. (For example, unless I start using parentheses in the
        # rule language definition itself, I should never have to hard-code
        # expressions for those above.)
        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)
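These hard-coded expressions exist only to bootstrap the public ``Grammar`` class, which then parses grammars written in the same rule syntax. A small usage sketch (the two-rule grammar here is made up, and the current public ``Grammar`` API is assumed):

from parsimonious.grammar import Grammar

grammar = Grammar('''
    greeting = "hello" " " name
    name     = ~"[a-z]+"
''')
print(grammar.parse('hello world'))  # prints the parse tree rooted at 'greeting'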
Example #3
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsuxa]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, term, name='or_term')
        ored = Sequence(term, OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)

        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)
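The ``term.members`` reassignment above is how a cycle is introduced after construction: ``term`` has to exist before ``not_term`` can refer to it, and ``not_term`` is then spliced back into ``term``'s alternatives. A toy-scale sketch of the same trick (names made up):

from parsimonious.expressions import Literal, OneOf, Sequence

term = OneOf(Literal('a'), name='term')                    # built first
not_term = Sequence(Literal('!'), term, name='not_term')   # refers to term
term.members = (not_term,) + term.members                  # splice in the cycle
print(term.parse('!a'))  # '!a' now parses as a term too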
Example #4
 def _add_numeric_nonterminal_to_grammar(self, nonterminal: str,
                                         new_grammar: Grammar) -> None:
     numbers = self._get_numeric_database_values(nonterminal)
     number_literals = [Literal(number) for number in numbers]
     if number_literals:
         new_grammar[nonterminal] = OneOf(*number_literals,
                                          name=nonterminal)
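Because ``OneOf`` is an ordered choice, a sketch like the following (with hypothetical stand-in values) would normally sort the literals longest-first, so that a short value such as '1' cannot shadow a longer one such as '1200':

from parsimonious.expressions import Literal, OneOf

numbers = sorted(['1', '1200', '0'], key=len, reverse=True)  # stand-ins for database values
number_literals = [Literal(number) for number in numbers]
if number_literals:  # OneOf needs at least one alternative
    nonterminal = OneOf(*number_literals, name='number')
    print(nonterminal.match('1200').text)  # '1200', not '1'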
Example #5
def test_parse_classes():
    """Make sure we recognize character classes."""

    class_or_inverted = OneOf(regex_grammar['inverted_class'],
                              regex_grammar['class'],
                              name='class_or_inverted')

    def parse_class(pattern):
        class_or_inverted.parse(pattern)

    def dont_parse_class(pattern):
        assert_raises(ParseError, class_or_inverted.parse, pattern)

    def assert_matches(pattern, text):
        eq_(class_or_inverted.match(pattern).text, text)

    # These should match all the way to the end:
    for pattern in [
            '[]]', '[^]]', r'[\d-]', r'[a\]]', r'[()[\]{}]', '[]()[{}]',
            '[a-zA-Z0-9]', '[abcde]'
    ]:
        yield parse_class, pattern

    # These shouldn't match:
    for pattern in ['[]', '[^]', '[']:
        yield dont_parse_class, pattern

    # Make sure we don't go too far:
    for pattern, text in [('[abc]]', '[abc]'), ('[[0-9]qux', '[[0-9]'),
                          (r'[^\a\f\]]abc', r'[^\a\f\]]')]:
        yield assert_matches, pattern, text
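Trying ``inverted_class`` before ``class`` matters in an ordered choice: if a plain ``class`` rule also happened to match '[^...]', it would win and the inverted alternative would never be reached. A toy reconstruction of that ordering (these expressions are stand-ins, not the real ``regex_grammar`` rules):

from parsimonious.expressions import Literal, OneOf, Regex, Sequence

klass = Sequence(Literal('['), Regex(r'[^\]]*'), Literal(']'), name='class')
inverted = Sequence(Literal('[^'), Regex(r'[^\]]*'), Literal(']'),
                    name='inverted_class')
class_or_inverted = OneOf(inverted, klass, name='class_or_inverted')
print(class_or_inverted.parse('[^abc]').children[0].expr_name)  # 'inverted_class'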
Example #6
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r"#[^\r\n]*", name="comment")
        meaninglessness = OneOf(Regex(r"\s+"), comment, name="meaninglessness")
        _ = ZeroOrMore(meaninglessness, name="_")
        equals = Sequence(Literal("="), _, name="equals")
        label = Sequence(Regex(r"[a-zA-Z_][a-zA-Z_0-9]*"), _, name="label")
        reference = Sequence(label, Not(equals), name="reference")
        quantifier = Sequence(Regex(r"[*+?]"), _, name="quantifier")
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(
            r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', ignore_case=True, dot_all=True, name="spaceless_literal"
        )
        literal = Sequence(spaceless_literal, _, name="literal")
        regex = Sequence(Literal("~"), literal, Regex("[ilmsux]*", ignore_case=True), _, name="regex")
        atom = OneOf(reference, literal, regex, name="atom")
        quantified = Sequence(atom, quantifier, name="quantified")

        term = OneOf(quantified, atom, name="term")
        not_term = Sequence(Literal("!"), term, _, name="not_term")
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name="sequence")
        or_term = Sequence(Literal("/"), _, term, name="or_term")
        ored = Sequence(term, OneOrMore(or_term), name="ored")
        expression = OneOf(ored, sequence, term, name="expression")
        rule = Sequence(label, equals, expression, name="rule")
        rules = Sequence(_, OneOrMore(rule), name="rules")

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)

        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)
Example #7
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsuxa]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
        term.members = (not_term, ) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, term, name='or_term')
        ored = Sequence(term, OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)

        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)
Example #8
 def test_one_of(self):
     """``OneOf`` should return its own node, wrapping the child that succeeds."""
     o = OneOf(Literal('a', name='lit'), name='one_of')
     text = 'aa'
     eq_(o.match(text),
         Node('one_of', text, 0, 1, children=[Node('lit', text, 0, 1)]))
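A small follow-up sketch (assuming the ``Node`` attributes the test already relies on, plus ``expr_name``) showing how to inspect that wrapping node:

from parsimonious.expressions import Literal, OneOf

o = OneOf(Literal('a', name='lit'), name='one_of')
node = o.match('aa')
print(node.expr_name)              # 'one_of' -- the OneOf's own node
print(node.children[0].expr_name)  # 'lit'    -- the child that succeeded
print(node.end)                    # 1        -- match() only consumes the prefix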
Example #9
    def _update_grammar(self):
        """
        Create a new ``Grammar`` object from the one in ``AtisSqlTableContext`` that also
        includes the new entities extracted from the utterance. Stitching the expressions together
        to form the grammar is a little tedious here, but it is worth it because we don't have to
        create a new grammar from scratch; that is expensive, since many production rules list every
        database value for a column on their right-hand side. We update the expressions bottom-up,
        since higher-level expressions may refer to lower-level ones. For example, the ternary
        expression refers to the start and end times.
        """

        # This will give us a shallow copy. We have to be careful here because the ``Grammar`` object
        # contains ``Expression`` objects that have tuples containing the members of that expression.
        # We have to create new sub-expression objects so that the original grammar is not mutated.
        new_grammar = copy(AtisWorld.sql_table_context.grammar)

        for numeric_nonterminal in NUMERIC_NONTERMINALS:
            self._add_numeric_nonterminal_to_grammar(numeric_nonterminal,
                                                     new_grammar)
        self._update_expression_reference(new_grammar, 'pos_value', 'number')

        ternary_expressions = [
            self._get_sequence_with_spacing(new_grammar, [
                new_grammar['col_ref'],
                Literal('BETWEEN'), new_grammar['time_range_start'],
                Literal(f'AND'), new_grammar['time_range_end']
            ]),
            self._get_sequence_with_spacing(new_grammar, [
                new_grammar['col_ref'],
                Literal('NOT'),
                Literal('BETWEEN'), new_grammar['time_range_start'],
                Literal(f'AND'), new_grammar['time_range_end']
            ]),
            self._get_sequence_with_spacing(new_grammar, [
                new_grammar['col_ref'],
                Literal('not'),
                Literal('BETWEEN'), new_grammar['time_range_start'],
                Literal(f'AND'), new_grammar['time_range_end']
            ])
        ]

        new_grammar['ternaryexpr'] = OneOf(*ternary_expressions,
                                           name='ternaryexpr')
        self._update_expression_reference(new_grammar, 'condition',
                                          'ternaryexpr')

        new_binary_expressions = []

        fare_round_trip_cost_expression = \
                    self._get_sequence_with_spacing(new_grammar,
                                                    [Literal('fare'),
                                                     Literal('.'),
                                                     Literal('round_trip_cost'),
                                                     new_grammar['binaryop'],
                                                     new_grammar['fare_round_trip_cost']])
        new_binary_expressions.append(fare_round_trip_cost_expression)

        fare_one_direction_cost_expression = \
                    self._get_sequence_with_spacing(new_grammar,
                                                    [Literal('fare'),
                                                     Literal('.'),
                                                     Literal('one_direction_cost'),
                                                     new_grammar['binaryop'],
                                                     new_grammar['fare_one_direction_cost']])

        new_binary_expressions.append(fare_one_direction_cost_expression)

        flight_number_expression = \
                    self._get_sequence_with_spacing(new_grammar,
                                                    [Literal('flight'),
                                                     Literal('.'),
                                                     Literal('flight_number'),
                                                     new_grammar['binaryop'],
                                                     new_grammar['flight_number']])
        new_binary_expressions.append(flight_number_expression)

        if self.dates:
            year_binary_expression = self._get_sequence_with_spacing(
                new_grammar, [
                    Literal('date_day'),
                    Literal('.'),
                    Literal('year'), new_grammar['binaryop'],
                    new_grammar['year_number']
                ])
            month_binary_expression = self._get_sequence_with_spacing(
                new_grammar, [
                    Literal('date_day'),
                    Literal('.'),
                    Literal('month_number'), new_grammar['binaryop'],
                    new_grammar['month_number']
                ])
            day_binary_expression = self._get_sequence_with_spacing(
                new_grammar, [
                    Literal('date_day'),
                    Literal('.'),
                    Literal('day_number'), new_grammar['binaryop'],
                    new_grammar['day_number']
                ])
            new_binary_expressions.extend([
                year_binary_expression, month_binary_expression,
                day_binary_expression
            ])

        new_binary_expressions = new_binary_expressions + list(
            new_grammar['biexpr'].members)
        new_grammar['biexpr'] = OneOf(*new_binary_expressions, name='biexpr')
        self._update_expression_reference(new_grammar, 'condition', 'biexpr')
        return new_grammar
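``copy`` only gives a shallow copy, so member tuples inside existing expressions still point at the old sub-expressions; that is why references have to be patched after a nonterminal is replaced. A toy illustration (a plain dict stands in for the real ``Grammar``, and the rebuild mimics roughly what a helper like ``_update_expression_reference`` has to do):

from copy import copy
from parsimonious.expressions import Literal, OneOf, Sequence

old_number = OneOf(Literal('0'), name='number')
pos_value = Sequence(Literal('+'), old_number, name='pos_value')
grammar = {'number': old_number, 'pos_value': pos_value}

new_grammar = copy(grammar)
new_grammar['number'] = OneOf(Literal('0'), Literal('1'), name='number')

# pos_value still embeds old_number; rebuild it rather than mutating the
# shared object, so the original grammar stays untouched.
new_grammar['pos_value'] = Sequence(
    *(new_grammar['number'] if member is old_number else member
      for member in pos_value.members),
    name='pos_value')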
Example #10
    def visit_not_term(self, not_term, (exclamation, term, _)):
        return Not(term)

    def visit_rule(self, rule, (label, equals, expression)):
        """Assign a name to the Expression and return it."""
        expression.name = label  # Assign a name to the expr.
        return expression

    def visit_sequence(self, sequence, (term, other_terms)):
        """A parsed Sequence looks like [term node, OneOrMore node of
        ``another_term``s]. Flatten it out."""
        return Sequence(term, *other_terms)

    def visit_ored(self, ored, (first_term, other_terms)):
        return OneOf(first_term, *other_terms)

    def visit_or_term(self, or_term, (slash, _, term)):
        """Return just the term from an ``or_term``.

        We already know it's going to be ored, from the containing ``ored``.

        """
        return term

    def visit_label(self, label, (name, _)):
        """Turn a label into a unicode string."""
        return name.text

    def visit_reference(self, reference, (label, not_equals)):
        """Stick a :class:`LazyReference` in the tree as a placeholder.
Example #11
    def _update_grammar(self):
        """
        Create a new ``Grammar`` object from the one in ``AtisSqlTableContext`` that also
        includes the new entities extracted from the utterance. Stitching the expressions together
        to form the grammar is a little tedious here, but it is worth it because we don't have to
        create a new grammar from scratch; that is expensive, since many production rules list every
        database value for a column on their right-hand side. We update the expressions bottom-up,
        since higher-level expressions may refer to lower-level ones. For example, the ternary
        expression refers to the start and end times.
        """

        # This will give us a shallow copy, but that's OK because everything
        # inside is immutable so we get a new copy of it.
        new_grammar = copy(AtisWorld.sql_table_context.grammar)

        numbers = self._get_numeric_database_values('number')
        number_literals = [Literal(number) for number in numbers]
        new_grammar['number'] = OneOf(*number_literals, name='number')
        self._update_expression_reference(new_grammar, 'pos_value', 'number')

        time_range_start = self._get_numeric_database_values(
            'time_range_start')
        time_range_start_literals = [
            Literal(time) for time in time_range_start
        ]
        new_grammar['time_range_start'] = OneOf(*time_range_start_literals,
                                                name='time_range_start')

        time_range_end = self._get_numeric_database_values('time_range_end')
        time_range_end_literals = [Literal(time) for time in time_range_end]
        new_grammar['time_range_end'] = OneOf(*time_range_end_literals,
                                              name='time_range_end')

        ternary_expressions = [
            self._get_sequence_with_spacing(new_grammar, [
                new_grammar['col_ref'],
                Literal('BETWEEN'), new_grammar['time_range_start'],
                Literal(f'AND'), new_grammar['time_range_end']
            ]),
            self._get_sequence_with_spacing(new_grammar, [
                new_grammar['col_ref'],
                Literal('NOT'),
                Literal('BETWEEN'), new_grammar['time_range_start'],
                Literal(f'AND'), new_grammar['time_range_end']
            ]),
            self._get_sequence_with_spacing(new_grammar, [
                new_grammar['col_ref'],
                Literal('not'),
                Literal('BETWEEN'), new_grammar['time_range_start'],
                Literal(f'AND'), new_grammar['time_range_end']
            ])
        ]

        new_grammar['ternaryexpr'] = OneOf(*ternary_expressions,
                                           name='ternaryexpr')
        self._update_expression_reference(new_grammar, 'condition',
                                          'ternaryexpr')

        if self.dates:
            new_binary_expressions = []
            year_binary_expression = self._get_sequence_with_spacing(
                new_grammar, [
                    Literal('date_day'),
                    Literal('.'),
                    Literal('year'), new_grammar['binaryop'],
                    Literal(f'{self.dates[0].year}')
                ])
            new_binary_expressions.append(year_binary_expression)
            for date in self.dates:
                month_binary_expression = self._get_sequence_with_spacing(
                    new_grammar, [
                        Literal('date_day'),
                        Literal('.'),
                        Literal('month_number'), new_grammar['binaryop'],
                        Literal(f'{date.month}')
                    ])

                day_binary_expression = self._get_sequence_with_spacing(
                    new_grammar, [
                        Literal('date_day'),
                        Literal('.'),
                        Literal('day_number'), new_grammar['binaryop'],
                        Literal(f'{date.day}')
                    ])
                new_binary_expressions.extend(
                    [month_binary_expression, day_binary_expression])

            new_grammar['biexpr'].members = new_grammar[
                'biexpr'].members + tuple(new_binary_expressions)
        return new_grammar
Example #12
 def test_one_of(self):
     """``OneOf`` should return its own node, wrapping the child that succeeds."""
     o = OneOf(Literal("a", name="lit"), name="one_of")
     text = "aa"
     eq_(o.match(text), Node("one_of", text, 0, 1, children=[Node("lit", text, 0, 1)]))
Example #13
 def visit_ored(self, ored, xxx_todo_changeme7):
     (first_term, other_terms) = xxx_todo_changeme7
     return OneOf(first_term, *other_terms)
Example #14
 def visit_ored(self, ored, children):
     first_term, other_terms = children
     return OneOf(first_term, *other_terms)
Example #15
 def visit_ored(self, node, ored):
     first_term, other_terms = ored
     return OneOf(first_term, *other_terms)
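All three versions of visit_ored rely on the same ``NodeVisitor`` calling convention: each ``visit_<rule>`` method receives the node plus its already-visited children. A tiny self-contained visitor (hypothetical grammar and names) demonstrating that convention:

from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor

class BitsVisitor(NodeVisitor):
    def visit_bits(self, node, visited_children):
        return [int(char) for char in node.text]

    def generic_visit(self, node, visited_children):
        return visited_children or node

tree = Grammar('bits = ~"[01]+"').parse('1011')
print(BitsVisitor().visit(tree))  # [1, 0, 1, 1]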
Example #16
 def test_one_of(self):
     """``OneOf`` should return its own node, wrapping the child that succeeds."""
     o = OneOf(Literal('a', name='lit'), name='one_of')
     text = 'aa'
     eq_(o.match(text), Node('one_of', text, 0, 1, children=[
                             Node('lit', text, 0, 1)]))
Example #17
 def test_one_of(self):
     len_eq(OneOf(Literal('aaa'), Literal('bb')).match('aaa'), 3)  # first alternative
     len_eq(OneOf(Literal('aaa'), Literal('bb')).match('bbaaa'), 2)  # second
     len_eq(OneOf(Literal('aaa'), Literal('bb')).match('aa'), None)  # no match