def test__parser__grammar_oneof_take_longest_match(seg_list):
    """Test that the OneOf grammar takes the longest match."""
    foo_regex = RegexParser(r"fo{2}", KeywordSegment)
    foo_then_baar = Sequence(
        StringParser("foo", KeywordSegment),
        StringParser("baar", KeywordSegment),
    )
    # Even though the regex is listed first, the sequence consumes more
    # segments, so OneOf should prefer the longer match.
    grammar = OneOf(foo_regex, foo_then_baar)
    with RootParseContext(dialect=None) as ctx:
        # Sanity check: the regex alone only consumes the "foo" segment.
        expected_foo = (KeywordSegment("foo", seg_list[2].pos_marker),)
        assert (
            foo_regex.match(seg_list[2:], parse_context=ctx).matched_segments
            == expected_foo
        )
        # The combined grammar takes the longer, two-segment match.
        assert grammar.match(seg_list[2:], parse_context=ctx).matched_segments == (
            KeywordSegment("foo", seg_list[2].pos_marker),
            KeywordSegment("baar", seg_list[3].pos_marker),
        )
def test__parser__grammar_oneof_take_first(seg_list):
    """Test that the OneOf grammar takes first match in case they are of same length."""
    foo_regex = RegexParser(r"fo{2}", KeywordSegment)
    foo_string = StringParser("foo", KeywordSegment)
    # Both parsers match "foo", so whichever comes first in the OneOf
    # should win; either ordering yields the same matched segments.
    regex_first = OneOf(foo_regex, foo_string)
    string_first = OneOf(foo_string, foo_regex)
    with RootParseContext(dialect=None) as ctx:
        expected = (KeywordSegment("foo", seg_list[2].pos_marker),)
        for grammar in (regex_first, string_first):
            result = grammar.match(seg_list[2:], parse_context=ctx)
            assert result.matched_segments == expected
class SamplingExpressionSegment(BaseSegment):
    """A sampling expression.

    e.g. ``TABLESAMPLE (BUCKET 1 OUT OF 2 ON col)``,
    ``TABLESAMPLE (10 PERCENT)`` or ``TABLESAMPLE (100M)``.
    """

    type = "sample_expression"
    match_grammar = Sequence(
        "TABLESAMPLE",
        Bracketed(
            OneOf(
                # Bucket sampling: BUCKET x OUT OF y [ON col | ON fn(...)]
                Sequence(
                    "BUCKET",
                    Ref("NumericLiteralSegment"),
                    "OUT",
                    "OF",
                    Ref("NumericLiteralSegment"),
                    Sequence(
                        "ON",
                        OneOf(
                            Ref("SingleIdentifierGrammar"),
                            Ref("FunctionSegment"),
                        ),
                        optional=True,
                    ),
                ),
                # Numeric sampling: a bare number, optionally PERCENT or ROWS.
                Sequence(
                    Ref("NumericLiteralSegment"),
                    OneOf("PERCENT", "ROWS", optional=True),
                ),
                # Byte-length sampling, e.g. 100M or 5g.
                RegexParser(
                    r"\d+[bBkKmMgG]",
                    CodeSegment,
                    type="byte_length_literal",
                ),
            ),
        ),
        # The sampled table expression may be aliased.
        Ref(
            "AliasExpressionSegment",
            optional=True,
        ),
    )
        # NOTE(review): these lexer entries belong to a list whose opening is
        # outside this view; they are inserted before the "not_equal" matcher.
        RegexLexer("atsign_literal", r"@[a-zA-Z_][\w]*", CodeSegment),
        RegexLexer("dollar_literal", r"[$][a-zA-Z0-9_.]*", CodeSegment),
    ],
    before="not_equal",
)

exasol_fs_dialect.add(
    FunctionScriptTerminatorSegment=NamedParser(
        "function_script_terminator", CodeSegment, type="statement_terminator"
    ),
    WalrusOperatorSegment=NamedParser(
        "walrus_operator", SymbolSegment, type="assignment_operator"
    ),
    # Upper-case identifiers used as script variable names.
    VariableNameSegment=RegexParser(
        r"[A-Z][A-Z0-9_]*",
        CodeSegment,
        name="function_variable",
        type="variable",
    ),
)

exasol_fs_dialect.replace(
    SemicolonSegment=StringParser(
        ";", SymbolSegment, name="semicolon", type="semicolon"
    ),
)


@exasol_fs_dialect.segment(replace=True)
class StatementSegment(BaseSegment):
    """A generic segment, to any of its child subsegments."""

    type = "statement"
        # NOTE(review): these Sequences are the tail of a grammar whose opening
        # is outside this view.
        Sequence(
            "FOR", "SYSTEM_TIME", "AS", "OF", Ref("ExpressionSegment"), optional=True
        ),
        Sequence("WITH", "OFFSET", "AS", Ref("SingleIdentifierGrammar"), optional=True),
    ),
    FunctionNameIdentifierSegment=RegexParser(
        # In BigQuery struct() has a special syntax, so we don't treat it as a
        # function; the anti_template excludes it from matching here.
        r"[A-Z][A-Z0-9_]*",
        CodeSegment,
        name="function_name_identifier",
        type="function_name_identifier",
        anti_template=r"STRUCT",
    ),
)


@bigquery_dialect.segment(replace=True)
class FunctionDefinitionGrammar(BaseSegment):
    """This is the body of a `CREATE FUNCTION AS` statement."""

    match_grammar = Sequence(
        AnyNumberOf(
            Sequence(
                "LANGUAGE",
                # Not really a parameter, but best fit for now.
        # NOTE(review): tail of a keyword list whose opening is outside this view.
        "SAMPLE",
        "TABLESAMPLE",
        "UNPIVOT",
    ]
)

snowflake_dialect.add(
    # In snowflake, these are case sensitive even though they're not quoted
    # so they need a different `name` and `type` so they're not picked up
    # by other rules.
    ParameterAssignerSegment=StringParser(
        "=>", SymbolSegment, name="parameter_assigner", type="parameter_assigner"
    ),
    # Unquoted element of a semi-structured expression (upper-case/digits/underscore).
    NakedSemiStructuredElementSegment=RegexParser(
        r"[A-Z0-9_]*",
        CodeSegment,
        name="naked_semi_structured_element",
        type="semi_structured_element",
    ),
    # Double-quoted element of a semi-structured expression.
    QuotedSemiStructuredElementSegment=NamedParser(
        "double_quote",
        CodeSegment,
        name="quoted_semi_structured_element",
        type="semi_structured_element",
    ),
    # Positional column references of the form `$1`, `$2`, ...
    ColumnIndexIdentifierSegment=RegexParser(
        r"\$[0-9]+",
        CodeSegment,
        name="column_index_identifier_segment",
        type="identifier",
    ),
)
    # NOTE(review): keyword arguments of a dialect add/replace call whose
    # opening is outside this view.
    # A procedure parameter: optional direction marker, optional name, then a
    # datatype — or a bare datatype on its own.
    ProcedureParameterGrammar=OneOf(
        Sequence(
            OneOf(
                Ref("OutputParameterSegment"),
                Ref("InputParameterSegment"),
                Ref("InputOutputParameterSegment"),
                optional=True,
            ),
            Ref("ParameterNameSegment", optional=True),
            Ref("DatatypeSegment"),
        ),
        Ref("DatatypeSegment"),
    ),
    # Local variable names, optionally backtick-quoted.
    LocalVariableNameSegment=RegexParser(
        r"`?[a-zA-Z0-9_]*`?",
        CodeSegment,
        name="declared_variable",
        type="variable",
    ),
    # Session variables are prefixed with "@".
    SessionVariableNameSegment=RegexParser(
        r"[@][a-zA-Z0-9_]*",
        CodeSegment,
        name="declared_variable",
        type="variable",
    ),
)

mysql_dialect.replace(
    # Statements may be terminated by either a semicolon or a tilde.
    DelimiterSegment=OneOf(Ref("SemicolonSegment"), Ref("TildeSegment")),
    TildeSegment=StringParser(
        "~", SymbolSegment, name="tilde", type="statement_terminator"
    ),
    # NOTE(review): keyword arguments of a dialect add/replace call whose
    # opening is outside this view.
    # Storage file-format keywords.
    RcfileKeywordSegment=StringParser("RCFILE", KeywordSegment, type="file_format"),
    SequencefileKeywordSegment=StringParser(
        "SEQUENCEFILE", KeywordSegment, type="file_format"
    ),
    TextfileKeywordSegment=StringParser("TEXTFILE", KeywordSegment, type="file_format"),
    # A `'key' = 'value'` property assignment.
    PropertyGrammar=Sequence(
        Ref("QuotedLiteralSegment"),
        Ref("EqualsSegment"),
        Ref("QuotedLiteralSegment"),
    ),
    LocationGrammar=Sequence("LOCATION", Ref("S3UrlGrammar")),
    # Matches a single-quoted URL beginning with s3:// — deliberately loose
    # about what follows the scheme.
    S3UrlGrammar=RegexParser(r"^'s3://.*", RawSegment),
    BracketedPropertyListGrammar=Bracketed(Delimited(Ref("PropertyGrammar"))),
    # A `<property> = <literal>` pair allowed in a CTAS statement.
    CTASPropertyGrammar=Sequence(
        OneOf(
            "external_location",
            "format",
            "partitioned_by",
            "bucketed_by",
            "bucket_count",
            "write_compression",
            "orc_compression",
            "parquet_compression",
            "field_delimiter",
        ),
        Ref("EqualsSegment"),
        Ref("LiteralGrammar"),
        # NOTE(review): tail of a grammar whose opening is outside this view.
        allow_trailing=True,
    ),
    QuestionMarkSegment=StringParser(
        "?", SymbolSegment, name="question_mark", type="question_mark"
    ),
    # "@..." literals; trim_chars strips the leading "@" from the raw value.
    AtSignLiteralSegment=NamedParser(
        "atsign_literal",
        CodeSegment,
        name="atsign_literal",
        type="literal",
        trim_chars=("@",),
    ),
    # Add a Full equivalent which also allow keywords
    NakedIdentifierSegmentFull=RegexParser(
        r"[A-Z_][A-Z0-9_]*",
        CodeSegment,
        name="naked_identifier_all",
        type="identifier",
    ),
    SingleIdentifierGrammarFull=OneOf(
        Ref("NakedIdentifierSegment"),
        Ref("QuotedIdentifierSegment"),
        Ref("NakedIdentifierSegmentFull"),
    ),
    # `DEFAULT <value>` options; the value may be a literal, a bracketed
    # SELECT, a function call or an array literal.
    DefaultDeclareOptionsGrammar=Sequence(
        "DEFAULT",
        OneOf(
            Ref("LiteralGrammar"),
            Bracketed(Ref("SelectStatementSegment")),
            Ref("BareFunctionSegment"),
            Ref("FunctionSegment"),
            Ref("ArrayLiteralSegment"),
        # NOTE(review): tail of a OneOf whose opening is outside this view.
        Ref("QuotedIdentifierSegment"),
        Ref("BracketedIdentifierSegment"),
    ),
    LiteralGrammar=OneOf(
        Ref("QuotedLiteralSegment"),
        # NOTE(review): presumably N'...' national string literals — verify.
        Ref("QuotedLiteralSegmentWithN"),
        Ref("NumericLiteralSegment"),
        Ref("BooleanLiteralGrammar"),
        Ref("QualifiedNumericLiteralSegment"),
        # NB: Null is included in the literals, because it is a keyword which
        # can otherwise be easily mistaken for an identifier.
        Ref("NullLiteralSegment"),
        Ref("DateTimeLiteralGrammar"),
    ),
    # Parameter names are prefixed with "@".
    ParameterNameSegment=RegexParser(
        r"[@][A-Za-z0-9_]+", CodeSegment, name="parameter", type="parameter"
    ),
    # Function names may also be [bracket]-quoted.
    FunctionNameIdentifierSegment=RegexParser(
        r"[A-Z][A-Z0-9_]*|\[[A-Z][A-Z0-9_]*\]",
        CodeSegment,
        name="function_name_identifier",
        type="function_name_identifier",
    ),
    DatatypeIdentifierSegment=Ref("SingleIdentifierGrammar"),
    # PRIMARY KEY optionally followed by CLUSTERED/NONCLUSTERED.
    PrimaryKeyGrammar=Sequence(
        "PRIMARY", "KEY", OneOf("CLUSTERED", "NONCLUSTERED", optional=True)
    ),
    # Keywords which terminate a FROM clause.
    FromClauseTerminatorGrammar=OneOf(
        "WHERE",
        "LIMIT",
        "GROUP",
        "ORDER",
            # NOTE(review): tail of a grammar whose opening is outside this view.
            bracket_pairs_set="angle_bracket_pairs",
        ),
    ),
    # BigQuery also supports the special "Struct" construct.
    BaseExpressionElementGrammar=ansi_dialect.get_grammar(
        "BaseExpressionElementGrammar"
    ).copy(insert=[Ref("TypelessStructSegment")]),
    FunctionContentsGrammar=ansi_dialect.get_grammar("FunctionContentsGrammar").copy(
        insert=[Ref("TypelessStructSegment")],
        before=Ref("ExpressionSegment"),
    ),
    # BigQuery allows underscore in parameter names, and also anything if
    # quoted in backticks.
    ParameterNameSegment=OneOf(
        RegexParser(
            r"[A-Z_][A-Z0-9_]*", CodeSegment, name="parameter", type="parameter"
        ),
        RegexParser(r"`[^`]*`", CodeSegment, name="parameter", type="parameter"),
    ),
    # Nothing() removes the inherited datetime literal grammar for BigQuery.
    DateTimeLiteralGrammar=Nothing(),
)

# Set Keywords
# Rebuild the keyword sets from the BigQuery-specific word lists.
bigquery_dialect.sets("unreserved_keywords").clear()
bigquery_dialect.sets("unreserved_keywords").update(
    [n.strip().upper() for n in bigquery_unreserved_keywords.split("\n")]
)
bigquery_dialect.sets("reserved_keywords").clear()