def test_one_of(self):
    """``OneOf`` should try alternatives in order and raise when none fit."""
    expr = OneOf(Literal('aaa'), Literal('bb'))
    # First alternative wins when it matches at position 0.
    len_eq(expr.match('aaa'), 3)
    # Falls through to the second alternative.
    len_eq(expr.match('bbaaa'), 2)
    # No alternative matches: ParseError.
    assert_raises(ParseError, expr.match, 'aa')
def _expressions_from_rules(self, rule_syntax): """Return the rules for parsing the grammar definition syntax. Return a 2-tuple: a dict of rule names pointing to their expressions, and then the top-level expression for the first rule. """ # Hard-code enough of the rules to parse the grammar that describes the # grammar description language, to bootstrap: ws = Regex(r'\s+', name='ws') _ = Regex(r'[ \t]+', name='_') label = Regex(r'[a-zA-Z_][a-zA-Z_0-9]*', name='label') quantifier = Regex(r'[*+?]', name='quantifier') # This pattern supports empty literals. TODO: A problem? literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', ignore_case=True, dot_all=True, name='literal') regex = Sequence(Literal('~'), literal, Regex('[ilmsux]*', ignore_case=True), name='regex') atom = OneOf(label, literal, regex, name='atom') quantified = Sequence(atom, quantifier, name='quantified') term = OneOf(quantified, atom, name='term') another_term = Sequence(_, term, name='another_term') sequence = Sequence(term, OneOrMore(another_term), name='sequence') or_term = Sequence(_, Literal('/'), another_term, name='or_term') ored = Sequence(term, OneOrMore(or_term), name='ored') and_term = Sequence(_, Literal('&'), another_term, name='and_term') anded = Sequence(term, OneOrMore(and_term), name='anded') poly_term = OneOf(anded, ored, sequence, name='poly_term') rhs = OneOf(poly_term, term, name='rhs') eol = Regex(r'[\r\n$]', name='eol') # TODO: Support $. rule = Sequence(Optional(ws), label, Optional(_), Literal('='), Optional(_), rhs, Optional(_), eol, name='rule') rules = Sequence(OneOrMore(rule), Optional(ws), name='rules') # Use those hard-coded rules to parse the (possibly more extensive) # rule syntax. (For example, unless I start using parentheses in the # rule language definition itself, I should never have to hard-code # expressions for those above.) rule_tree = rules.parse(rule_syntax) # Turn the parse tree into a map of expressions: return RuleVisitor().visit(rule_tree)
def _expressions_from_rules(self, rule_syntax, custom_rules): """Return the rules for parsing the grammar definition syntax. Return a 2-tuple: a dict of rule names pointing to their expressions, and then the top-level expression for the first rule. """ # Hard-code enough of the rules to parse the grammar that describes the # grammar description language, to bootstrap: comment = Regex(r'#[^\r\n]*', name='comment') meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness') _ = ZeroOrMore(meaninglessness, name='_') equals = Sequence(Literal('='), _, name='equals') label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label') reference = Sequence(label, Not(equals), name='reference') quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier') # This pattern supports empty literals. TODO: A problem? spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', ignore_case=True, dot_all=True, name='spaceless_literal') literal = Sequence(spaceless_literal, _, name='literal') regex = Sequence(Literal('~'), literal, Regex('[ilmsuxa]*', ignore_case=True), _, name='regex') atom = OneOf(reference, literal, regex, name='atom') quantified = Sequence(atom, quantifier, name='quantified') term = OneOf(quantified, atom, name='term') not_term = Sequence(Literal('!'), term, _, name='not_term') term.members = (not_term,) + term.members sequence = Sequence(term, OneOrMore(term), name='sequence') or_term = Sequence(Literal('/'), _, term, name='or_term') ored = Sequence(term, OneOrMore(or_term), name='ored') expression = OneOf(ored, sequence, term, name='expression') rule = Sequence(label, equals, expression, name='rule') rules = Sequence(_, OneOrMore(rule), name='rules') # Use those hard-coded rules to parse the (more extensive) rule syntax. # (For example, unless I start using parentheses in the rule language # definition itself, I should never have to hard-code expressions for # those above.) 
rule_tree = rules.parse(rule_syntax) # Turn the parse tree into a map of expressions: return RuleVisitor().visit(rule_tree)
def _add_numeric_nonterminal_to_grammar(self, nonterminal: str, new_grammar: Grammar) -> None:
    """Replace ``nonterminal`` in ``new_grammar`` with a ``OneOf`` over the
    numeric database values for that nonterminal.

    If the database yields no values, the grammar is left untouched.
    """
    literals = [Literal(value)
                for value in self._get_numeric_database_values(nonterminal)]
    if not literals:
        return
    new_grammar[nonterminal] = OneOf(*literals, name=nonterminal)
def test_parse_classes():
    """Make sure we recognize character classes."""
    class_or_inverted = OneOf(regex_grammar['inverted_class'],
                              regex_grammar['class'],
                              name='class_or_inverted')

    def parse_class(pattern):
        # Succeeds iff the whole pattern is a (possibly inverted) class.
        class_or_inverted.parse(pattern)

    def dont_parse_class(pattern):
        assert_raises(ParseError, class_or_inverted.parse, pattern)

    def assert_matches(pattern, text):
        eq_(class_or_inverted.match(pattern).text, text)

    full_matches = ['[]]', '[^]]', r'[\d-]', r'[a\]]', r'[()[\]{}]',
                    '[]()[{}]', '[a-zA-Z0-9]', '[abcde]']
    non_matches = ['[]', '[^]', '[']
    partial_matches = [('[abc]]', '[abc]'),
                       ('[[0-9]qux', '[[0-9]'),
                       (r'[^\a\f\]]abc', r'[^\a\f\]]')]

    # These should match all the way to the end:
    for pattern in full_matches:
        yield parse_class, pattern
    # These shouldn't match:
    for pattern in non_matches:
        yield dont_parse_class, pattern
    # Make sure we don't go too far:
    for pattern, text in partial_matches:
        yield assert_matches, pattern, text
def _expressions_from_rules(self, rule_syntax, custom_rules): """Return the rules for parsing the grammar definition syntax. Return a 2-tuple: a dict of rule names pointing to their expressions, and then the top-level expression for the first rule. """ # Hard-code enough of the rules to parse the grammar that describes the # grammar description language, to bootstrap: comment = Regex(r"#[^\r\n]*", name="comment") meaninglessness = OneOf(Regex(r"\s+"), comment, name="meaninglessness") _ = ZeroOrMore(meaninglessness, name="_") equals = Sequence(Literal("="), _, name="equals") label = Sequence(Regex(r"[a-zA-Z_][a-zA-Z_0-9]*"), _, name="label") reference = Sequence(label, Not(equals), name="reference") quantifier = Sequence(Regex(r"[*+?]"), _, name="quantifier") # This pattern supports empty literals. TODO: A problem? spaceless_literal = Regex( r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', ignore_case=True, dot_all=True, name="spaceless_literal" ) literal = Sequence(spaceless_literal, _, name="literal") regex = Sequence(Literal("~"), literal, Regex("[ilmsux]*", ignore_case=True), _, name="regex") atom = OneOf(reference, literal, regex, name="atom") quantified = Sequence(atom, quantifier, name="quantified") term = OneOf(quantified, atom, name="term") not_term = Sequence(Literal("!"), term, _, name="not_term") term.members = (not_term,) + term.members sequence = Sequence(term, OneOrMore(term), name="sequence") or_term = Sequence(Literal("/"), _, term, name="or_term") ored = Sequence(term, OneOrMore(or_term), name="ored") expression = OneOf(ored, sequence, term, name="expression") rule = Sequence(label, equals, expression, name="rule") rules = Sequence(_, OneOrMore(rule), name="rules") # Use those hard-coded rules to parse the (more extensive) rule syntax. # (For example, unless I start using parentheses in the rule language # definition itself, I should never have to hard-code expressions for # those above.) 
rule_tree = rules.parse(rule_syntax) # Turn the parse tree into a map of expressions: return RuleVisitor().visit(rule_tree)
def _expressions_from_rules(self, rule_syntax, custom_rules): """Return the rules for parsing the grammar definition syntax. Return a 2-tuple: a dict of rule names pointing to their expressions, and then the top-level expression for the first rule. """ # Hard-code enough of the rules to parse the grammar that describes the # grammar description language, to bootstrap: comment = Regex(r'#[^\r\n]*', name='comment') meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness') _ = ZeroOrMore(meaninglessness, name='_') equals = Sequence(Literal('='), _, name='equals') label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label') reference = Sequence(label, Not(equals), name='reference') quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier') # This pattern supports empty literals. TODO: A problem? spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', ignore_case=True, dot_all=True, name='spaceless_literal') literal = Sequence(spaceless_literal, _, name='literal') regex = Sequence(Literal('~'), literal, Regex('[ilmsuxa]*', ignore_case=True), _, name='regex') atom = OneOf(reference, literal, regex, name='atom') quantified = Sequence(atom, quantifier, name='quantified') term = OneOf(quantified, atom, name='term') not_term = Sequence(Literal('!'), term, _, name='not_term') term.members = (not_term, ) + term.members sequence = Sequence(term, OneOrMore(term), name='sequence') or_term = Sequence(Literal('/'), _, term, name='or_term') ored = Sequence(term, OneOrMore(or_term), name='ored') expression = OneOf(ored, sequence, term, name='expression') rule = Sequence(label, equals, expression, name='rule') rules = Sequence(_, OneOrMore(rule), name='rules') # Use those hard-coded rules to parse the (more extensive) rule syntax. # (For example, unless I start using parentheses in the rule language # definition itself, I should never have to hard-code expressions for # those above.) 
rule_tree = rules.parse(rule_syntax) # Turn the parse tree into a map of expressions: return RuleVisitor().visit(rule_tree)
def test_one_of(self):
    """``OneOf`` should return its own node, wrapping the child that succeeds."""
    expr = OneOf(Literal('a', name='lit'), name='one_of')
    text = 'aa'
    expected = Node('one_of', text, 0, 1, children=[Node('lit', text, 0, 1)])
    eq_(expr.match(text), expected)
def _update_grammar(self):
    """
    We create a new ``Grammar`` object from the one in
    ``AtisSqlTableContext``, that also has the new entities that are
    extracted from the utterance. Stitching together the expressions to
    form the grammar is a little tedious here, but it is worth it because
    we don't have to create a new grammar from scratch. Creating a new
    grammar is expensive because we have many production rules that have
    all database values in the column on the right hand side. We update
    the expressions bottom up, since the higher level expressions may
    refer to the lower level ones. For example, the ternary expression
    will refer to the start and end times.
    """
    # This will give us a shallow copy. We have to be careful here because
    # the ``Grammar`` object contains ``Expression`` objects that have
    # tuples containing the members of that expression. We have to create
    # new sub-expression objects so that original grammar is not mutated.
    new_grammar = copy(AtisWorld.sql_table_context.grammar)

    # Rebuild every numeric nonterminal from the values currently in the
    # database, then repoint ``pos_value`` at the refreshed ``number``.
    for numeric_nonterminal in NUMERIC_NONTERMINALS:
        self._add_numeric_nonterminal_to_grammar(numeric_nonterminal, new_grammar)
    self._update_expression_reference(new_grammar, 'pos_value', 'number')

    # ``col_ref [NOT|not] BETWEEN start AND end``, in the three casings
    # that appear in the data. (The ``AND`` literals were pointless
    # f-strings with no placeholders; plain string literals now.)
    ternary_expressions = [
        self._get_sequence_with_spacing(new_grammar,
                                        [new_grammar['col_ref'],
                                         Literal('BETWEEN'),
                                         new_grammar['time_range_start'],
                                         Literal('AND'),
                                         new_grammar['time_range_end']]),
        self._get_sequence_with_spacing(new_grammar,
                                        [new_grammar['col_ref'],
                                         Literal('NOT'),
                                         Literal('BETWEEN'),
                                         new_grammar['time_range_start'],
                                         Literal('AND'),
                                         new_grammar['time_range_end']]),
        self._get_sequence_with_spacing(new_grammar,
                                        [new_grammar['col_ref'],
                                         Literal('not'),
                                         Literal('BETWEEN'),
                                         new_grammar['time_range_start'],
                                         Literal('AND'),
                                         new_grammar['time_range_end']])]
    new_grammar['ternaryexpr'] = OneOf(*ternary_expressions, name='ternaryexpr')
    self._update_expression_reference(new_grammar, 'condition', 'ternaryexpr')

    new_binary_expressions = []

    # fare.round_trip_cost <binaryop> <value>
    fare_round_trip_cost_expression = \
        self._get_sequence_with_spacing(new_grammar,
                                        [Literal('fare'),
                                         Literal('.'),
                                         Literal('round_trip_cost'),
                                         new_grammar['binaryop'],
                                         new_grammar['fare_round_trip_cost']])
    new_binary_expressions.append(fare_round_trip_cost_expression)

    # fare.one_direction_cost <binaryop> <value>
    fare_one_direction_cost_expression = \
        self._get_sequence_with_spacing(new_grammar,
                                        [Literal('fare'),
                                         Literal('.'),
                                         Literal('one_direction_cost'),
                                         new_grammar['binaryop'],
                                         new_grammar['fare_one_direction_cost']])
    new_binary_expressions.append(fare_one_direction_cost_expression)

    # flight.flight_number <binaryop> <value>
    flight_number_expression = \
        self._get_sequence_with_spacing(new_grammar,
                                        [Literal('flight'),
                                         Literal('.'),
                                         Literal('flight_number'),
                                         new_grammar['binaryop'],
                                         new_grammar['flight_number']])
    new_binary_expressions.append(flight_number_expression)

    if self.dates:
        # date_day.{year,month_number,day_number} <binaryop> <value>
        year_binary_expression = self._get_sequence_with_spacing(
            new_grammar,
            [Literal('date_day'),
             Literal('.'),
             Literal('year'),
             new_grammar['binaryop'],
             new_grammar['year_number']])
        month_binary_expression = self._get_sequence_with_spacing(
            new_grammar,
            [Literal('date_day'),
             Literal('.'),
             Literal('month_number'),
             new_grammar['binaryop'],
             new_grammar['month_number']])
        day_binary_expression = self._get_sequence_with_spacing(
            new_grammar,
            [Literal('date_day'),
             Literal('.'),
             Literal('day_number'),
             new_grammar['binaryop'],
             new_grammar['day_number']])
        new_binary_expressions.extend([year_binary_expression,
                                       month_binary_expression,
                                       day_binary_expression])

    # Prepend the new expressions to the existing ``biexpr`` alternatives
    # and rebind a fresh ``OneOf`` rather than mutating the shared one.
    new_binary_expressions = new_binary_expressions + list(
        new_grammar['biexpr'].members)
    new_grammar['biexpr'] = OneOf(*new_binary_expressions, name='biexpr')
    self._update_expression_reference(new_grammar, 'condition', 'biexpr')
    return new_grammar
# NOTE(review): these visitor methods use Python 2 tuple-parameter
# unpacking in their signatures; that syntax was removed in Python 3
# (PEP 3113), so this code only runs on Python 2.
def visit_not_term(self, not_term, (exclamation, term, _)):
    """Turn a parsed ``!term`` into a ``Not`` wrapping the term."""
    return Not(term)

def visit_rule(self, rule, (label, equals, expression)):
    """Assign a name to the Expression and return it."""
    expression.name = label  # Assign a name to the expr.
    return expression

def visit_sequence(self, sequence, (term, other_terms)):
    """A parsed Sequence looks like [term node, OneOrMore node of
    ``another_term``s]. Flatten it out."""
    return Sequence(term, *other_terms)

def visit_ored(self, ored, (first_term, other_terms)):
    """Collapse a parsed alternation into a single ``OneOf``."""
    return OneOf(first_term, *other_terms)

def visit_or_term(self, or_term, (slash, _, term)):
    """Return just the term from an ``or_term``.

    We already know it's going to be ored, from the containing ``ored``.

    """
    return term

def visit_label(self, label, (name, _)):
    """Turn a label into a unicode string."""
    return name.text

def visit_reference(self, reference, (label, not_equals)):
    """Stick a :class:`LazyReference` in the tree as a placeholder.
def _update_grammar(self):
    """
    We create a new ``Grammar`` object from the one in
    ``AtisSqlTableContext``, that also has the new entities that are
    extracted from the utterance. Stitching together the expressions to
    form the grammar is a little tedious here, but it is worth it because
    we don't have to create a new grammar from scratch. Creating a new
    grammar is expensive because we have many production rules that have
    all database values in the column on the right hand side. We update
    the expressions bottom up, since the higher level expressions may
    refer to the lower level ones. For example, the ternary expression
    will refer to the start and end times.
    """
    # This will give us a shallow copy, but that's OK because everything
    # inside is immutable so we get a new copy of it.
    # NOTE(review): the ``biexpr`` branch at the bottom mutates
    # ``.members`` on an Expression reached through this shallow copy,
    # which may mutate the original grammar too — verify.
    new_grammar = copy(AtisWorld.sql_table_context.grammar)

    # Rebuild the numeric terminals from values found for this utterance.
    numbers = self._get_numeric_database_values('number')
    number_literals = [Literal(number) for number in numbers]
    new_grammar['number'] = OneOf(*number_literals, name='number')
    self._update_expression_reference(new_grammar, 'pos_value', 'number')

    time_range_start = self._get_numeric_database_values(
        'time_range_start')
    time_range_start_literals = [
        Literal(time) for time in time_range_start
    ]
    new_grammar['time_range_start'] = OneOf(*time_range_start_literals,
                                            name='time_range_start')

    time_range_end = self._get_numeric_database_values('time_range_end')
    time_range_end_literals = [Literal(time) for time in time_range_end]
    new_grammar['time_range_end'] = OneOf(*time_range_end_literals,
                                          name='time_range_end')

    # ``col_ref [NOT|not] BETWEEN start AND end`` in its three casings.
    ternary_expressions = [
        self._get_sequence_with_spacing(new_grammar,
                                        [new_grammar['col_ref'],
                                         Literal('BETWEEN'),
                                         new_grammar['time_range_start'],
                                         Literal(f'AND'),
                                         new_grammar['time_range_end']]),
        self._get_sequence_with_spacing(new_grammar,
                                        [new_grammar['col_ref'],
                                         Literal('NOT'),
                                         Literal('BETWEEN'),
                                         new_grammar['time_range_start'],
                                         Literal(f'AND'),
                                         new_grammar['time_range_end']]),
        self._get_sequence_with_spacing(new_grammar,
                                        [new_grammar['col_ref'],
                                         Literal('not'),
                                         Literal('BETWEEN'),
                                         new_grammar['time_range_start'],
                                         Literal(f'AND'),
                                         new_grammar['time_range_end']])]
    new_grammar['ternaryexpr'] = OneOf(*ternary_expressions,
                                       name='ternaryexpr')
    self._update_expression_reference(new_grammar, 'condition',
                                      'ternaryexpr')

    if self.dates:
        # Add binary comparisons for the year, month, and day of each
        # date mentioned in the utterance.
        new_binary_expressions = []
        year_binary_expression = self._get_sequence_with_spacing(
            new_grammar, [
                Literal('date_day'),
                Literal('.'),
                Literal('year'),
                new_grammar['binaryop'],
                Literal(f'{self.dates[0].year}')
            ])
        new_binary_expressions.append(year_binary_expression)
        for date in self.dates:
            month_binary_expression = self._get_sequence_with_spacing(
                new_grammar, [
                    Literal('date_day'),
                    Literal('.'),
                    Literal('month_number'),
                    new_grammar['binaryop'],
                    Literal(f'{date.month}')
                ])
            day_binary_expression = self._get_sequence_with_spacing(
                new_grammar, [
                    Literal('date_day'),
                    Literal('.'),
                    Literal('day_number'),
                    new_grammar['binaryop'],
                    Literal(f'{date.day}')
                ])
            new_binary_expressions.extend(
                [month_binary_expression, day_binary_expression])
        # In-place member mutation (see NOTE above).
        new_grammar['biexpr'].members = new_grammar[
            'biexpr'].members + tuple(new_binary_expressions)
    return new_grammar
def test_one_of(self):
    """``OneOf`` should return its own node, wrapping the child that succeeds."""
    one_of = OneOf(Literal("a", name="lit"), name="one_of")
    text = "aa"
    expected = Node("one_of", text, 0, 1, children=[Node("lit", text, 0, 1)])
    eq_(one_of.match(text), expected)
def visit_ored(self, ored, visited_children):
    """Collapse a parsed alternation into a single ``OneOf``.

    ``visited_children`` is a pair: the first term, then the node holding
    the remaining ``or_term`` results; splat the latter into the ``OneOf``.
    (Renames the 2to3 auto-generated ``xxx_todo_changeme7`` parameter.)
    """
    first_term, other_terms = visited_children
    return OneOf(first_term, *other_terms)
def visit_ored(self, ored, children):
    """Build a ``OneOf`` from the first term plus the remaining terms."""
    head, rest = children
    return OneOf(head, *rest)
def visit_ored(self, node, ored):
    """Wrap the alternatives of an ``ored`` result in a single ``OneOf``."""
    leading, trailing = ored
    return OneOf(leading, *trailing)
def test_one_of(self):
    """``OneOf`` should return its own node, wrapping the child that succeeds."""
    one_of_expr = OneOf(Literal('a', name='lit'), name='one_of')
    text = 'aa'
    child = Node('lit', text, 0, 1)
    eq_(one_of_expr.match(text),
        Node('one_of', text, 0, 1, children=[child]))
def test_one_of(self):
    """Alternatives are tried in order; a total miss yields ``None``."""
    expr = OneOf(Literal('aaa'), Literal('bb'))
    len_eq(expr.match('aaa'), 3)    # first alternative
    len_eq(expr.match('bbaaa'), 2)  # second alternative
    len_eq(expr.match('aa'), None)  # no match