def parse(s):
    """Parse one configuration line `s` against the set of legal grammars.

    Tries each grammar alternative (model-data mapping, single/multiple
    string or numeric values, variable definitions, normalization, action
    dictionaries, mutants) and returns the matched tokens as a list.
    Raises a pyparsing exception if no grammar matches the whole line.

    NOTE(review): relies on module-level key lists (strkeylist, numkeys_int,
    numkeys_float, var_def_keys, b_var_def_keys, multstrkeys,
    var_def_keys_1or2nums, multnumkeys, dictkeys) and the `punctuation` /
    `punctuation_safe` character sets defined elsewhere in this module.
    """
    equals = pp.Suppress('=')
    colon = pp.Suppress(':')
    # Trailing '#'-comments are allowed on most lines and discarded.
    comment = pp.Suppress(
        pp.Optional(pp.Literal('#') - pp.ZeroOrMore(pp.Word(pp.printables))))

    # set up multiple grammars

    # single str value
    strkeys = pp.oneOf(' '.join(strkeylist), caseless=True)
    string = pp.Word(pp.alphanums + punctuation)
    strgram = strkeys - equals - string - comment

    # single num value
    numkeys = pp.oneOf(' '.join(numkeys_int + numkeys_float), caseless=True)
    point = pp.Literal(".")
    e = pp.CaselessLiteral("E")
    # Signed int/float with optional exponent, e.g. -1.5e+3
    num = pp.Combine(
        pp.Word("+-" + pp.nums, pp.nums) +
        pp.Optional(point + pp.Optional(pp.Word(pp.nums))) +
        pp.Optional(e + pp.Word("+-" + pp.nums, pp.nums)))
    numgram = numkeys - equals - num - comment

    # variable definition grammar: <name> <min> <max> [u|b flag]
    strnumkeys = pp.oneOf(' '.join(var_def_keys + b_var_def_keys),
                          caseless=True)
    bng_parameter = pp.Word(pp.alphas, pp.alphanums + "_")
    varnums = bng_parameter - num - num - pp.Optional(pp.Word("ubBU"))
    strnumgram = strnumkeys - equals - varnums - comment

    # multiple string value grammar
    # NOTE(review): this alternative has no trailing `comment`; confirm
    # whether comments are intentionally disallowed on these lines.
    multstrkey = pp.oneOf(' '.join(multstrkeys), caseless=True)
    multstrgram = multstrkey - equals - pp.OneOrMore(string)

    # var and logvar alt grammar (only one number given)
    varkeys = pp.oneOf(' '.join(var_def_keys_1or2nums), caseless=True)
    vargram = varkeys - equals - bng_parameter - num - pp.Optional(
        num) - comment

    # multiple num value
    multnumkey = pp.oneOf(' '.join(multnumkeys), caseless=True)
    multnumgram = multnumkey - equals - pp.OneOrMore(num) - comment

    # model-data mapping grammar: model = file.bngl : exp1, exp2 | none
    mdmkey = pp.CaselessLiteral("model")
    nonetoken = pp.Suppress(pp.CaselessLiteral("none"))
    # Raw strings: the originals were non-raw and `\.` is an invalid
    # escape sequence (SyntaxWarning on Python >= 3.12).
    model_file = pp.Regex(r".*?\.(bngl|xml)")
    exp_file = pp.Regex(r".*?\.(exp|con|prop)")
    mdmgram = mdmkey - equals - model_file - colon - (
        pp.delimitedList(exp_file) ^ nonetoken) - comment

    # normalization mapping grammar
    normkey = pp.CaselessLiteral("normalization")
    anything = pp.Word(pp.alphanums + punctuation + ' ')
    normgram = normkey - equals - anything
    # The set of legal grammars for normalization is too complicated,
    # Will handle with separate code.

    # Grammar for dictionary-like specification of simulation actions
    # We are intentionally over-permissive here, because the Action class
    # will be able to give more helpful error messages than a failed parse.
    dict_entry = pp.Word(
        pp.alphas) - colon - pp.Word(pp.alphanums + punctuation_safe)
    dict_key = pp.oneOf(' '.join(dictkeys), caseless=True)
    dictgram = dict_key - equals - pp.delimitedList(dict_entry) - comment

    # mutant model grammar: mutant = base name op... : exp-files | none
    mutkey = pp.CaselessLiteral('mutant')
    mut_op = pp.Group(
        pp.Word(pp.alphas + '_', pp.alphanums + '_') -
        pp.oneOf('+ - * / =') - num)
    mutgram = mutkey - equals - string - string - \
        pp.Group(pp.OneOrMore(mut_op)) - \
        pp.Group(colon - (pp.delimitedList(exp_file) ^ nonetoken)) - comment

    # check each grammar and output somewhat legible error message
    line = (mdmgram | strgram | numgram | strnumgram | multnumgram
            | multstrgram | vargram | normgram | dictgram
            | mutgram).parseString(s, parseAll=True).asList()
    return line
class StructDefine(object):
    """
    StructDefine is a decorator class used for defining structures
    by parsing a simple intermediate language input decorating
    a StructFormatter class.
    """

    # Registry of every decorated class, keyed by class name.
    All = {}
    # Single-character struct-module style raw type codes.
    rawtypes = (
        "x", "c", "b", "B", "h", "H", "i", "I", "l", "L", "f", "d",
        "s", "n", "N", "p", "P", "q", "Q",
    )
    # Default byte alignment for each raw type code (overridable via kargs).
    alignments = {
        "x": 1, "c": 1, "b": 1, "B": 1, "s": 1,
        "h": 2, "H": 2,
        "i": 4, "I": 4, "l": 4, "L": 4, "f": 4,
        "q": 8, "Q": 8, "d": 8, "P": 8,
    }

    # --- grammar for the mini field-definition language -------------------
    integer = pp.Regex(r"[0-9][0-9]*")
    integer.setParseAction(lambda r: int(r[0]))
    # "#n.m" bit-slice length specifier.
    bitslen = pp.Group(pp.Suppress("#") + integer + pp.Suppress(".") + integer)
    symbol = pp.Regex(r"[A-Za-z_][A-Za-z0-9_]*")
    # ";" starts a per-field comment running to end of line.
    comment = pp.Suppress(";") + pp.restOfLine
    # ": [<|>] name" — optional byte-order marker followed by the field name.
    fieldname = pp.Suppress(":") + pp.Group(
        pp.Optional(pp.Literal(">") | pp.Literal("<"), default=None) + symbol)
    # "~", optionally followed by a count-prefix type code.
    inf = pp.Regex(r"~[bBhHiI]?")
    length = integer | symbol | inf | bitslen
    # "type" or "type*length"; count defaults to 0 (scalar).
    typename = pp.Group(symbol +
                        pp.Optional(pp.Suppress("*") + length, default=0))
    structfmt = pp.OneOrMore(
        pp.Group(typename + fieldname + pp.Optional(comment, default="")))

    def __init__(self, fmt, **kargs):
        """Parse format string `fmt` into a list of field descriptors.

        Keyword args: `packed` (bool), `alignments` (override table),
        `order` (default byte order), plus per-type-name substitutions.
        """
        self.fields = []
        self.source = fmt
        self.packed = kargs.get("packed", False)
        if "alignments" in kargs:
            self.alignments = kargs["alignments"]
        for l in self.structfmt.parseString(fmt, True).asList():
            f_type, f_name, f_comment = l
            f_order, f_name = f_name
            f_type, f_count = f_type
            # A field without an explicit order marker inherits kargs' order.
            if f_order is None and "order" in kargs:
                f_order = kargs["order"]
            if f_type in self.rawtypes:
                f_cls = RawField
                if isinstance(f_count, str) and f_count.startswith("~"):
                    f_cls = VarField
                    # NOTE(review): a bare "~" makes f_count[1:] == "" and
                    # "" in "bBhHiI" is True, so it selects CntField —
                    # confirm that is intended.
                    if f_count[1:] in "bBhHiI":
                        f_cls = CntField
                f_align = self.alignments[f_type]
            else:
                # Non-raw type: user-defined, possibly remapped via kargs.
                f_cls = Field
                f_type = kargs.get(f_type, f_type)
                f_align = 0
            self.fields.append(
                f_cls(f_type, f_count, f_name, f_order, f_align, f_comment))

    def __call__(self, cls):
        """Decorate `cls`: attach parsed fields/metadata and register it."""
        self.All[cls.__name__] = cls
        cls.fields = self.fields
        cls.source = self.source
        cls.packed = self.packed
        cls.fkeys = defaultdict(default_formatter)
        return cls
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import html import pyparsing as pp # Some useful primitives ident = pp.Word(pp.alphas + "_", pp.alphas + pp.nums + "_") intNum = pp.Word(pp.nums) hexNum = pp.Literal("0x") + pp.Word(pp.hexnums) octalNum = pp.Literal("0") + pp.Word("01234567") integer = (hexNum | octalNum | intNum) + \ pp.Optional(pp.Literal("ULL") | pp.Literal("LL") | pp.Literal("L")) floatNum = pp.Regex(r'\d+(\.\d*)?([eE]\d+)?') + pp.Optional(pp.Literal("f")) char = pp.Literal("'") + pp.Word(pp.printables, exact=1) + pp.Literal("'") arrayIndex = integer | ident lbracket = pp.Literal("(").suppress() rbracket = pp.Literal(")").suppress() lbrace = pp.Literal("{").suppress() rbrace = pp.Literal("}").suppress() comma = pp.Literal(",").suppress() equals = pp.Literal("=").suppress() dot = pp.Literal(".").suppress() semicolon = pp.Literal(";").suppress() # initializer := { [member = ] (variable | expression | { initializer } ) } typeName = ident varName = ident
# Keyword and punctuation tokens for the SQL-like DSL.
RETURNS = pp.Keyword("returns")
PUBLIC = pp.Keyword("public")
VIRTUAL = pp.Keyword("virtual")
# Both long and short spellings are accepted for func/proc.
FUNC = pp.oneOf("func function")
PROC = pp.oneOf("proc procedure")
OF = pp.Keyword("of")
SELECT = pp.Keyword("select")
UPDATE = pp.Keyword("update")
FROM = pp.Keyword("from")
AS = pp.Keyword("as")
EQ = pp.Literal("=")
ONE = pp.Keyword("one")
STAR = pp.Literal("*")
UNIQUE = pp.Keyword("unique")
CLASS = pp.Keyword("class")

# "--" comment running to end of line (backslash-newline continuations allowed).
dblDashComment = pp.Regex(r"--(?:\\\n|[^\n])*").setName("-- comment")

# variable/label name
name = pp.Word(pp.alphanums + "_" + ".")

# sql
astuple = AS + name
# Table reference with an optional suppressed "alias =" prefix.
select_tablename = (pp.Optional(name + EQ).suppress() +
                    name).setResultsName("select")
update_tablename = (pp.Optional(name + EQ).suppress() +
                    name).setResultsName("update")
tablefield = pp.Word(pp.alphanums + "_" + ".")
tablefields = pp.Optional(tablefield ^ pp.delimitedList(tablefield, ","))

# class
pptype = pp.Word(pp.alphanums + "." + "_")
# NOTE(review): the lines below are the tail of a constructor call whose
# opening (and the enclosing def) is outside this view — reproduced as-is.
               override = bool(toks.override),
               abstract = bool(toks.abstract),
               is_async = bool(toks.is_async),
               rtype = toks.rtype,
               anotations = toks.anotations,
               throws = list(toks.throws) if toks.throws else [])


def parse_namespace(toks):
    """Build a Namespace node from the parsed name and member list."""
    return Namespace(toks.name, toks.members)


# VAPI Parser Grammar
ident = pp.Word(pp.alphas + '_', pp.alphanums + '_').setName("ident")
# Dotted identifier, e.g. "Gtk.Widget".
dot_ident = pp.Combine(ident + pp.ZeroOrMore(pp.Literal(".") + ident))
integer = pp.Regex(r'[+-]?\d+').setName("integer").setParseAction(
    tokenMap(int))
real = pp.Regex(r'[+-]?\d+\.\d*').setName("real").setParseAction(
    tokenMap(float))
# Scientific/decimal float; tried before plain real/integer below.
sci_real = pp.Regex(
    r'[+-]?\d+([eE][+-]?\d+|\.\d*([eE][+-]?\d+)?)').setName(
        "scireal").setParseAction(tokenMap(float))
number = (sci_real | real | integer).streamline()
string = pp.QuotedString("\"", "\\")
# NOTE(review): a pyparsing parse action that returns None leaves the
# original token in place — confirm these three actually yield
# None/True/False rather than the literal strings.
null = pp.Literal("null").setParseAction(lambda toks: None)
true = pp.Literal("true").setParseAction(lambda toks: True)
false = pp.Literal("false").setParseAction(lambda toks: False)
value = string | number | null | true | false

# name = value  (annotation parameter)
param = pp.Group(ident + pp.Literal("=").suppress() + value)
# Type reference with optional "?" nullability marker.
type_name = pp.Combine(dot_ident + pp.Optional(pp.Literal("?")))("type_name")
params = pp.Group(
    pp.Optional(param + pp.ZeroOrMore(pp.Literal(',').suppress() +
                                      param))).setParseAction(
                                          parse_params)("params")
params_in_parens = pp.Literal('(').suppress() + pp.Optional(params) + \
    pp.Literal(')').suppress()
# "[Name (k = v, ...)]" attribute annotation.
anotation = pp.Group(
    pp.Literal('[').suppress() + ident("name") +
    pp.Optional(params_in_parens) +
    pp.Literal(']').suppress()).setParseAction(parse_anotation)
anotations = pp.ZeroOrMore(anotation).setParseAction(
    parse_anotations)("anotations")
# Optional access modifier keyword.
access = pp.Optional(
    pp.Keyword("protected") | pp.Keyword("public") | pp.Keyword("private")
    | pp.Keyword("internal"))("access")
class BashHistoryParser(text_parser.PyparsingMultiLineTextParser):
  """Parses events from Bash history files."""

  NAME = 'bash'
  DESCRIPTION = 'Parser for Bash history files'

  _ENCODING = 'utf-8'

  # "#<posix timestamp>" line; the int cast stores it as 'timestamp'.
  _TIMESTAMP = pyparsing.Suppress('#') + pyparsing.Word(
      pyparsing.nums, min=9, max=10).setParseAction(
          text_parser.PyParseIntCast).setResultsName('timestamp')

  # Everything up to end-of-input or the next "#<timestamp>" line.
  # NOTE(review): the lookahead requires exactly 10 digits while _TIMESTAMP
  # accepts 9-10 — confirm 9-digit timestamps terminate commands correctly.
  _COMMAND = pyparsing.Regex(
      r'.*?(?=($|\n#\d{10}))', re.DOTALL).setResultsName('command')

  _LINE_GRAMMAR = _TIMESTAMP + _COMMAND + pyparsing.lineEnd()

  # A non-comment line followed by a timestamp line marks a valid file.
  _VERIFICATION_GRAMMAR = (
      pyparsing.Regex(r'^\s?[^#].*?$', re.MULTILINE) + _TIMESTAMP +
      pyparsing.NotAny(pyparsing.pythonStyleComment))

  LINE_STRUCTURES = [('log_entry', _LINE_GRAMMAR)]

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a record and produces a Bash history event.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): elements parsed from the file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key != 'log_entry':
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    event_data = BashHistoryEventData()
    event_data.command = structure.command

    date_time = dfdatetime_posix_time.PosixTime(
        timestamp=structure.timestamp)
    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_MODIFICATION)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  # pylint: disable=unused-argument
  def VerifyStructure(self, parser_mediator, lines):
    """Verifies that this is a bash history file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      lines (str): one or more lines from the text file.

    Returns:
      bool: True if this is the correct parser, False otherwise.
    """
    # One successful scan anywhere in the sample is sufficient.
    match_generator = self._VERIFICATION_GRAMMAR.scanString(
        lines, maxMatches=1)
    return bool(list(match_generator))
class ZshExtendedHistoryParser(text_parser.PyparsingMultiLineTextParser):
  """Parser for ZSH extended history files"""

  NAME = 'zsh_extended_history'
  DATA_FORMAT = 'ZSH extended history file'

  _ENCODING = 'utf-8'

  # Every record starts ": <timestamp>:<elapsed>;".
  _VERIFICATION_REGEX = re.compile(r'^:\s\d+:\d+;')

  _PYPARSING_COMPONENTS = {
      'timestamp':
          text_parser.PyparsingConstants.INTEGER.setResultsName('timestamp'),
      'elapsed_seconds':
          text_parser.PyparsingConstants.INTEGER.setResultsName(
              'elapsed_seconds'),
      # Command text runs to end-of-input or the next record header.
      'command':
          pyparsing.Regex(
              r'.+?(?=($|\n:\s\d+:\d+;))', re.DOTALL).setResultsName(
                  'command'),
  }

  _LINE_GRAMMAR = (
      pyparsing.Literal(':') + _PYPARSING_COMPONENTS['timestamp'] +
      pyparsing.Literal(':') + _PYPARSING_COMPONENTS['elapsed_seconds'] +
      pyparsing.Literal(';') + _PYPARSING_COMPONENTS['command'] +
      pyparsing.LineEnd())

  LINE_STRUCTURES = [('command', _LINE_GRAMMAR)]

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a record and produces a ZSH history event.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): structure parsed from the log file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key != 'command':
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    event_data = ZshHistoryEventData()
    event_data.command = self._GetValueFromStructure(structure, 'command')
    event_data.elapsed_seconds = self._GetValueFromStructure(
        structure, 'elapsed_seconds')

    timestamp = self._GetValueFromStructure(structure, 'timestamp')
    date_time = dfdatetime_posix_time.PosixTime(timestamp=timestamp)
    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_MODIFICATION)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def VerifyStructure(self, parser_mediator, lines):
    """Verifies whether content corresponds to a ZSH extended_history file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      lines (str): one or more lines from the text file.

    Returns:
      bool: True if the line was successfully parsed.
    """
    if self._VERIFICATION_REGEX.match(lines):
      return True
    return False
adjacent=False) value.setParseAction(lambda start, tokens: (start, tokens[0])) empty = pypar.Empty() empty.setParseAction(lambda start, tokens: (start, tokens)) value = pypar.Group(value + empty) row = pypar.Group( pypar.Optional(separator).suppress() + (value + pypar.Literal(separator).suppress()) * (1, None) + pypar.Optional(value) + (pypar.StringEnd() | pypar.Literal("\n")).suppress() + pypar.Optional("\n").suppress()) table_parser ^= ( (pypar.LineStart() + pypar.Optional(pypar.White())).suppress() + # Allow line breaks for table headings row + pypar.Optional( pypar.Regex(r"[\-_=]{3,}") + pypar.Literal("\n") * (1, 2)).suppress() + row * (0, None)).setResultsName("delimiter:" + separator) table_parser.parseWithTabs() key_value_separators = [":", "-", ">"] key_value_list_parser = pypar.NoMatch() for separator in key_value_separators: value = pypar.Combine(word_token_regex(separator) * (1, 10), joinString=' ', adjacent=False) value.setParseAction(lambda start, tokens: (start, tokens[0])) empty = pypar.Empty() empty.setParseAction(lambda start, tokens: (start, tokens)) value = pypar.Group(value + empty) row = pypar.Group(value + pypar.Literal(separator).suppress() + value +
def __create(self):
    """Build and return the top-level pyparsing grammar for the expression
    language (constants, variables, SNMP OID values, operators with C-like
    precedence, if/then/else and try/catch forms).

    AST nodes are produced through the factory `self.f`; this method only
    wires grammar and parse actions together.
    """
    START = pp.StringStart().suppress()
    END = pp.StringEnd().suppress()

    #----------------------------------------------------------------------#
    # LANGUAGE TOKENS
    #----------------------------------------------------------------------#
    TRUE = pp.Literal('True').setParseAction(lambda s, loc, toks: toks[0])
    FALSE = pp.Literal('False').setParseAction(
        lambda s, loc, toks: toks[0])
    AND = pp.Literal('and').setParseAction(lambda s, loc, toks: toks[0])
    OR = pp.Literal('or').setParseAction(lambda s, loc, toks: toks[0])
    NOT = pp.Literal('not').setParseAction(lambda s, loc, toks: toks[0])

    # Expression's elements
    LEFT_PAREN = pp.Literal('(')
    RIGHT_PAREN = pp.Literal(')')
    LEFT_SPAREN = pp.Literal('[')
    RIGHT_SPAREN = pp.Literal(']')
    COMMA = pp.Literal(',')
    SEMICOLON = pp.Literal(';')

    # OID's syntax elements
    COLUMN = pp.Literal(':')
    TYPE_NEW = pp.Literal('@')
    TYPE_OLD = pp.Literal('#')

    # Unescaped String prefix
    UNESCAPE_STR = pp.Literal('r')

    # Operators
    ASSIGN = pp.Literal('=')
    # OIDs concat operator
    DOT = pp.Literal('.')
    PLUS_PLUS = pp.Literal('++')
    MINUS_MINUS = pp.Literal('--')
    POWER = pp.Literal('**')
    PLUS = pp.Literal('+')
    MINUS = pp.Literal('-')
    MULTI = pp.Literal('*')
    DIV = pp.Literal('/')
    MOD = pp.Literal('%')
    EQ = pp.Literal('eq')
    EQUAL = pp.Literal('==')
    NEQUAL = pp.Literal('!=')
    REGEXPQUAL = pp.Literal('=~')
    GT = pp.Literal('>')
    LT = pp.Literal('<')
    GEQ = pp.Literal('>=')
    LEQ = pp.Literal('<=')
    LOGIC_NOT = pp.Literal('!')
    LOGIC_AND = pp.Literal('&&')
    LOGIC_OR = pp.Literal('||')
    BITAND = pp.Literal('&')
    BITOR = pp.Literal('|')
    BITXOR = pp.Literal('^')
    # One's complement operator
    BITONE = pp.Literal('~')
    IF = pp.Literal('if')
    THEN = pp.Literal('then')
    ELSE = pp.Literal('else')
    TRY = pp.Literal('try')
    CATCH = pp.Literal('catch')

    #----------------------------------------------------------------------#
    # Language Types / Literals
    #----------------------------------------------------------------------#
    QUOTED = pp.QuotedString('"', escChar='\\') | pp.QuotedString(
        "'", escChar='\\')
    STRING = pp.originalTextFor(QUOTED)
    # Raw string: 'r' prefix followed by a quoted string.
    RSTRING = pp.originalTextFor(UNESCAPE_STR + QUOTED)

    # Variable identifiers ($a, $a1, $_a, $a_a123)
    VAR_ID = pp.Word('$', pp.alphanums + '_', min=2)

    # Function identifiers
    FUNCTION_ID = pp.Word(pp.alphas, pp.alphanums + '_', min=1)

    # Numbers. NOTE(review): these regexes are non-raw strings containing
    # "[\.]" (invalid escape sequence on modern Python) — consider raw
    # string literals.
    HEX = pp.originalTextFor(pp.Regex('[0][xX][0-9a-fA-F]+'))
    DEC = pp.originalTextFor(pp.Word('0') | pp.Regex('[1-9][0-9]*'))
    OCTAL = pp.originalTextFor(pp.Regex('[0][0-7]+'))
    FLOAT1 = pp.Regex('[0-9]+[\.][0-9]+([eE][+-]?[0-9]+)*')
    FLOAT2 = pp.Regex('[0-9]+[\.]([eE][+-]?[0-9]+)*')
    FLOAT = pp.originalTextFor(FLOAT1 | FLOAT2)

    # Special identifiers { <name> (@|#) }
    DATA_ID = pp.originalTextFor(
        pp.Combine(
            pp.Word('{') + pp.Word(pp.alphas, pp.alphanums + '_-.') +
            pp.Word('@#') + pp.Word('}')))

    #----------------------------------------------------------------------#
    # GRAMMAR SYNTAX: constants
    #----------------------------------------------------------------------#
    # Three-or-more dotted numbers form an OID, so FLOAT is only taken
    # when the input is NOT an OID sequence (negative lookahead below).
    OID_SEQUENCE = pp.Regex('[0-9]+[\.][0-9]+([\.][0-9]+)+')
    constant = (
        TRUE.setParseAction(lambda s, loc, toks: self.f.createBool(True))
        | FALSE.setParseAction(
            lambda s, loc, toks: self.f.createBool(False))
        | HEX.setParseAction(
            lambda s, loc, toks: self.f.createInteger(int(toks[1], 16)))
        | (~(OID_SEQUENCE) + FLOAT).setParseAction(
            lambda s, loc, toks: self.f.createFloat(float(toks[0])))
        | OCTAL.setParseAction(
            lambda s, loc, toks: self.f.createInteger(int(toks[1], 8)))
        | DEC.setParseAction(
            lambda s, loc, toks: self.f.createInteger(int(toks[1], 10)))
        | STRING.setParseAction(
            lambda s, loc, toks: self.f.createString(toks, True))
        | RSTRING.setParseAction(
            lambda s, loc, toks: self.f.createString(toks[1:], True)))

    cond_expr = pp.Forward()

    #----------------------------------------------------------------------#
    # Primary Expr: parenthesized expr, variable, data id, or constant
    #----------------------------------------------------------------------#
    primary_expr = (
        (LEFT_PAREN.suppress() + cond_expr + RIGHT_PAREN.suppress()
         ).setParseAction(lambda s, loc, toks: toks[0])
        | VAR_ID.setParseAction(
            lambda s, loc, toks: self.f.createIdentifier(toks[0]))
        | DATA_ID.setParseAction(
            lambda s, loc, toks: self.f.createDataIdentifier(toks[1]))
        | constant)

    #----------------------------------------------------------------------#
    # POSTFIX EXPRESSION: foo(), $id(), $id[...], chained calls/ranges
    #----------------------------------------------------------------------#
    # Named argument: name = value
    named_argument_value = pp.Forward()
    name_argument = (
        FUNCTION_ID + ASSIGN.suppress() + named_argument_value
    ).setParseAction(
        lambda s, loc, toks: self.f.createNamedArgument(toks[0], toks[1]))

    # Simple argument
    simple_argument_value = pp.Forward()

    # 1, 2, 3, foo=10, bar=10234
    argument = name_argument | simple_argument_value
    argument_expr_list = (argument +
                          pp.ZeroOrMore(COMMA.suppress() + argument))

    # ( ), (a,b,c,...) — tagged tuple consumed by _func_callback below.
    def _call_expr_callback(s, loc, toks):
        args = toks.get('args')
        if args is None:
            args = []
        else:
            args = list(args)
        return ('CALL', args)

    call_expr = (
        LEFT_PAREN.suppress() + pp.Optional(argument_expr_list('args')) +
        RIGHT_PAREN.suppress()).setParseAction(_call_expr_callback)

    # [], [;], [i], [i;], [;j], [i;j] — index/range subscript.
    def _range_expr_callback(s, loc, toks):
        args = []
        start = toks.get('start')
        args.append(start)
        if 'end' in toks:
            end = toks.get('end')
            args.append(end)
        return ('RANGE', args)

    range_value = pp.Forward()
    range_expr = (
        LEFT_SPAREN.suppress() + pp.Optional(range_value)('start') +
        pp.Optional(SEMICOLON.suppress() + pp.Optional(range_value)('end'))
        + RIGHT_SPAREN.suppress()).setParseAction(_range_expr_callback)

    call_or_range = range_expr | call_expr

    # Fold the chain of CALL/RANGE suffixes left-to-right into AST nodes.
    def _func_callback(s, loc, toks):
        if len(toks) == 1:
            return toks[0]
        current_t = toks[0]
        for t in toks[1:]:
            f_type, args = t
            if f_type == 'CALL':
                current_t = self.f.createCallOp(current_t, args)
            elif f_type == 'RANGE':
                current_t = self.f.createRangeOp(current_t, args)
            else:
                raise Exception("ERROR")
        return current_t

    postfix_expr = (
        (FUNCTION_ID +
         pp.OneOrMore(call_or_range)).setParseAction(_func_callback)
        | (primary_expr +
           pp.ZeroOrMore(call_or_range)).setParseAction(_func_callback))

    #----------------------------------------------------------------------#
    # UNARY EXPRESSION: ++x, --x, +x, -x, !x, not x, ~x
    #----------------------------------------------------------------------#
    unary_expr = pp.Forward()
    calc_expr = (
        postfix_expr
        | (PLUS_PLUS.suppress() + unary_expr).setParseAction(
            lambda s, loc, toks: self.f.createAddAddOp(toks[0]))
        | (MINUS_MINUS.suppress() + unary_expr).setParseAction(
            lambda s, loc, toks: self.f.createSubSubOp(toks[0]))
        # Unary plus is a no-op.
        | (PLUS.suppress() +
           unary_expr).setParseAction(lambda s, loc, toks: toks[0])
        | (MINUS.suppress() + unary_expr).setParseAction(
            lambda s, loc, toks: self.f.createMinusOp(toks[0]))
        | ((LOGIC_NOT | NOT).suppress() + unary_expr).setParseAction(
            lambda s, loc, toks: self.f.createNotOp(toks[0]))
        | (BITONE.suppress() + unary_expr).setParseAction(
            lambda s, loc, toks: self.f.createBitOneOp(toks[0])))

    #----------------------------------------------------------------------#
    # OID Expressions:
    #   <oid expr> [':' <community>] (@|#) [<host> [':' <port>]]
    # The DOT ('.') operator concatenates stringified pieces, so
    # 1.2.3.4 . 5.6 yields 1.2.3.4.5.6.
    #----------------------------------------------------------------------#
    def _oid_compositon_callback(s, loc, toks):
        toks = list(toks)
        expr = toks.pop(0)
        while toks:
            expr = self.f.createConcatOID(expr, toks.pop(0))
        return expr

    def _oid_callback(s, loc, toks):
        return self.f.createOID(toks[1])

    oid_compositon = (
        pp.originalTextFor(OID_SEQUENCE).setParseAction(_oid_callback) +
        pp.ZeroOrMore(DOT.suppress() + (
            pp.originalTextFor(OID_SEQUENCE).setParseAction(_oid_callback)
            | postfix_expr))).setParseAction(_oid_compositon_callback)

    def _snmp_single_expr_callback(s, loc, toks):
        oid = toks['oid']
        community = toks['community'] if 'community' in toks else None
        t = toks['type']
        node = toks['node'] if 'node' in toks else None
        port = toks['port'] if 'port' in toks else None
        return self.f.createSnmpValue(oid, community, t, node, port)

    snmp_single_expr = (
        oid_compositon('oid') +
        pp.Optional(COLUMN.suppress() + postfix_expr)('community') +
        pp.originalTextFor(TYPE_OLD | TYPE_NEW)('type') + pp.Optional(
            postfix_expr('node') +
            pp.Optional(COLUMN.suppress() + postfix_expr)('port'))
    ).setParseAction(_snmp_single_expr_callback)

    # 1.3.6.1.2.1.1@[ ... ] — SNMP value with optional range subscript.
    def _func_callback_x(s, loc, toks):
        toks = list(toks)
        if len(toks) == 1:
            return toks[0]
        expr = toks[0]
        range_args = toks[1][1]
        return self.f.createRangeOp(expr, range_args)

    snmp_value_expr = (
        snmp_single_expr +
        pp.Optional(range_expr)).setParseAction(_func_callback_x)

    #----------------------------------------------------------------------#
    # IF <expr> THEN <expr> ELSE <expr>
    #----------------------------------------------------------------------#
    def _if_callback(s, loc, toks):
        e1 = toks.get('e1')
        e2 = toks.get('e2')
        e3 = toks.get('e3')
        return self.f.createIf(e1, e2, e3)

    if_expr = (IF.suppress() + cond_expr("e1") + THEN.suppress() +
               cond_expr("e2") + ELSE.suppress() +
               cond_expr("e3")).setParseAction(_if_callback)

    #----------------------------------------------------------------------#
    # try <expr> catch [<id>] ( <expr> ) [ catch <id> ( <expr> ) ... ]
    #----------------------------------------------------------------------#
    def _catch_expr_callback(s, loc, toks):
        ex_name = toks.get('exception')
        expr = toks.get('expr')
        return (ex_name, expr)

    def _try_expr_callback(s, loc, toks):
        body = toks['body']
        catch_list = list(toks['catch_list'])
        return self.f.createTry(body, catch_list)

    # catch [ <id> ] ( <expr> )
    catch_expr_body = pp.Forward()
    catch_expr = (
        pp.Optional(FUNCTION_ID)('exception') + LEFT_PAREN.suppress() +
        pp.Optional(cond_expr)('expr') +
        RIGHT_PAREN.suppress()).setParseAction(_catch_expr_callback)

    catch_list = CATCH.suppress() + pp.OneOrMore(catch_expr)
    try_expr = (
        TRY.suppress() + cond_expr('body') +
        catch_list('catch_list')).setParseAction(_try_expr_callback)

    # Tie the unary forward-declaration to all expression heads.
    unary_expr <<= (if_expr | try_expr | snmp_value_expr | calc_expr)

    #----------------------------------------------------------------------#
    # OPERATORS: map each operator's match string to its AST factory.
    #----------------------------------------------------------------------#
    OP_MAP = {
        str(POWER.match): self.f.createPowerOp,
        str(MULTI.match): self.f.createMultiOp,
        str(DIV.match): self.f.createDivOp,
        str(MOD.match): self.f.createModOp,
        str(PLUS.match): self.f.createAddOp,
        str(MINUS.match): self.f.createSubOp,
        str(LT.match): self.f.createLtOp,
        str(GT.match): self.f.createGtOp,
        str(LEQ.match): self.f.createLEqOp,
        str(GEQ.match): self.f.createGEqOp,
        str(EQUAL.match): self.f.createEqOp,
        str(EQ.match): self.f.createEqOp,
        str(NEQUAL.match): self.f.createNotEqOp,
        str(REGEXPQUAL.match): self.f.createRegExpEqOp,
        str(BITAND.match): self.f.createBitAndOp,
        str(BITXOR.match): self.f.createBitXOrOp,
        str(BITOR.match): self.f.createBitOrOp,
        str(AND.match): self.f.createAndOp,
        str(LOGIC_AND.match): self.f.createAndOp,
        str(OR.match): self.f.createOrOp,
        str(LOGIC_OR.match): self.f.createOrOp,
    }

    # Left-fold "a op b op c" into nested binary AST nodes via OP_MAP.
    def _op_callback(s, loc, toks):
        l = list(toks)
        if len(l) == 1:
            return l
        expr = l.pop(0)
        while l:
            op, expr2 = l.pop(0), l.pop(0)
            op_callback = OP_MAP[op]
            expr = op_callback(expr, expr2)
        return expr

    # Precedence ladder, tightest first (each level wraps the previous).
    expr = unary_expr
    # a ** b
    expr = (expr +
            pp.ZeroOrMore(POWER + expr)).setParseAction(_op_callback)
    # a * b, a / b, a % b
    expr = (expr + pp.ZeroOrMore((MULTI | DIV | MOD) +
                                 expr)).setParseAction(_op_callback)
    # a + b, a - b
    expr = (
        expr +
        pp.ZeroOrMore((PLUS | MINUS) + expr)).setParseAction(_op_callback)
    # a < b, a > b, a <= b, a >= b
    expr = (expr + pp.ZeroOrMore((LT | GT | LEQ | GEQ) +
                                 expr)).setParseAction(_op_callback)
    # a == b, a != b, a =~ b, a eq b
    expr = (expr + pp.ZeroOrMore((EQUAL | EQ | NEQUAL | REGEXPQUAL) +
                                 expr)).setParseAction(_op_callback)
    # a & b
    expr = (expr +
            pp.ZeroOrMore(BITAND + expr)).setParseAction(_op_callback)
    # a ^ b
    expr = (expr +
            pp.ZeroOrMore(BITXOR + expr)).setParseAction(_op_callback)
    # a | b
    expr = (expr +
            pp.ZeroOrMore(BITOR + expr)).setParseAction(_op_callback)
    # a && b
    expr = (expr + pp.ZeroOrMore((LOGIC_AND | AND) +
                                 expr)).setParseAction(_op_callback)
    # a || b
    expr = (
        expr +
        pp.ZeroOrMore((LOGIC_OR | OR) + expr)).setParseAction(_op_callback)

    #----------------------------------------------------------------------#
    # Recursive rules: close the forward declarations.
    #----------------------------------------------------------------------#
    cond_expr <<= expr
    simple_argument_value <<= cond_expr
    named_argument_value <<= cond_expr
    range_value <<= cond_expr

    #----------------------------------------------------------------------#
    # Initial rule
    #----------------------------------------------------------------------#
    lang_expr = (START + cond_expr + END)
    return lang_expr
import pyparsing as pp

from cifparser.path import path_parser
from cifparser.errors import ParserError

# Tagged records, one per recognized line shape in a CIF document.
Comment = collections.namedtuple('Comment', ['value'])
ObjectDef = collections.namedtuple('ObjectDef', ['path'])
ListItemDef = collections.namedtuple('ListItemDef', ['path'])
FieldDef = collections.namedtuple('FieldDef', ['field_name', 'field_value'])
ValueContinuation = collections.namedtuple('ValueContinuation',
                                           ['value_continuation'])
ListContinuation = collections.namedtuple('ListContinuation',
                                          ['list_continuation'])

# One parser per line shape; each maps its tokens to the record above.
comment_parser = pp.Literal('#') + pp.restOfLine
objectdef_parser = path_parser + pp.Literal(':')
listitemdef_parser = pp.Literal('-') + path_parser + pp.Literal(':')
# Field key is everything before '=', whitespace-stripped by the action.
fieldkey_parser = pp.Regex(r'[^=]+')
fieldkey_parser.setParseAction(lambda tokens: tokens[0].strip())
fielddef_parser = fieldkey_parser + pp.Literal('=') + pp.restOfLine
valuecontinuation_parser = pp.Literal('|') + pp.restOfLine
listcontinuation_parser = pp.Literal(',') + pp.restOfLine


def comment_parse_action(tokens):
    # tokens[0] is the '#' literal; tokens[1] is the comment text.
    return Comment(tokens[1])


comment_parser.setParseAction(comment_parse_action)


def objectdef_parse_action(tokens):
    # tokens[0] is the parsed path; the ':' literal follows.
    return ObjectDef(tokens[0])


objectdef_parser.setParseAction(objectdef_parse_action)


def listitemdef_parse_action(tokens):
    # tokens[0] is the '-' literal; tokens[1] is the parsed path.
    return ListItemDef(tokens[1])
#!/usr/local/bin/python # -*- coding: utf-8 -*- import pyparsing as pp import pyparsing_ext as ppx w = ppx.Wordx(lambda x: x in {'a', 'b', 'c', 'd'}) print(w.parseString('abbcccdddde')) M = ppx.delimitedMatrix(w, ch1=' ', ch2=pp.Regex('\n+').leaveWhitespace()) p = M.parseString('a b\nc d') print(p.asList()) s = '''[1]hehe [2]hehe''' print(ppx.enumeratedItems().parseString(s))
def parse(self, request):
    """Parse the request's 'filter' context value into filter conditions.

    Builds a small condition grammar (`field__subfield OP value` combined
    with NOT/AND/OR) and delegates the matched tree to
    ``self._parse_to_conditions``. Returns None when no filter is present.

    Raises:
        FilterParserError: when the filter string does not parse.
    """
    # Renamed from `input` to avoid shadowing the builtin.
    filter_value = request._rest_context.get('filter')
    if not filter_value:
        return None

    # Parse actions append each condition's location here as a side effect
    # (a parse action returning None leaves the tokens unchanged).
    condition_positions = []

    operator = pp.Regex('|'.join(self.ALLOWED_OPERATORS))
    # BUGFIX: the original used "(:?" (a capturing group matching an
    # optional ':') where the non-capturing group "(?:" was intended,
    # so strings such as "1:.5" were accepted as numbers.
    number = pp.Regex(r"[+-]?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?")

    AND = pp.Literal(LOGICAL_OPERATORS.AND)
    OR = pp.Literal(LOGICAL_OPERATORS.OR)
    NOT = pp.Literal(LOGICAL_OPERATORS.NOT)

    # Identifier chains joined by "__", e.g. user__email.
    identifier = pp.Regex(r"[a-zA-Z]+[a-zA-Z0-9]*(_[a-zA-Z0-9]+)*")
    identifiers = pp.Group(
        pp.delimitedList(identifier, delim="__", combine=False))

    comparison_term = pp.Forward()
    # Value lists in [], () or {} delimiters.
    list_term = (
        pp.Group(
            pp.Suppress('[') +
            pp.delimitedList(comparison_term, delim=",", combine=False) +
            pp.Suppress(']')) |
        pp.Group(
            pp.Suppress('(') +
            pp.delimitedList(comparison_term, delim=",", combine=False) +
            pp.Suppress(')')) |
        pp.Group(
            pp.Suppress('{') +
            pp.delimitedList(comparison_term, delim=",", combine=False) +
            pp.Suppress('}')))

    string = (pp.QuotedString("'", escChar='\\', unquoteResults=True) |
              pp.QuotedString('"', escChar='\\', unquoteResults=True))
    null = pp.Literal('null').setParseAction(lambda s, l, t: None)
    boolean = pp.Regex('|'.join(
        ('true', 'false'))).setParseAction(lambda s, l, t: t[0] == 'true')

    comparison_term << (string | number | list_term | null | boolean)
    condition = pp.Group(identifiers + operator +
                         comparison_term).setResultsName('condition')
    condition.setParseAction(
        lambda s, loc, tocs: condition_positions.append(loc))

    expr = pp.operatorPrecedence(condition, [
        (
            NOT,
            1,
            pp.opAssoc.RIGHT,
        ),
        (
            AND,
            2,
            pp.opAssoc.LEFT,
        ),
        (
            OR,
            2,
            pp.opAssoc.LEFT,
        ),
    ])

    try:
        return self._parse_to_conditions(
            expr.parseString(filter_value, parseAll=True).asList()[0],
            list(condition_positions), condition, filter_value)
    except pp.ParseException as ex:
        # Chain the pyparsing error for easier debugging.
        raise FilterParserError(
            mark_safe(ugettext('Invalid filter value "{}"').format(
                filter_value))) from ex
class SCCMParser(text_parser.PyparsingMultiLineTextParser):
  """Parser for Windows System Center Configuration Manager (SCCM) logs."""

  NAME = 'sccm'
  DESCRIPTION = 'Parser for SCCM logs files.'

  # SCCM logs are written with a UTF-8 byte order mark.
  _ENCODING = 'utf-8-sig'

  # Increasing the buffer size as SCCM messages are commonly well larger
  # than the default value.
  BUFFER_SIZE = 16384

  LINE_STRUCTURES = []

  # Used to convert a UTC offset expressed in minutes into microseconds.
  _MICRO_SECONDS_PER_MINUTE = 60 * 1000000

  _FOUR_DIGITS = text_parser.PyparsingConstants.FOUR_DIGITS
  _ONE_OR_TWO_DIGITS = text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS

  # PyParsing Components used to construct grammars for parsing lines.
  _PARSING_COMPONENTS = {
      'msg_left_delimiter': pyparsing.Literal('<![LOG['),
      'msg_right_delimiter': pyparsing.Literal(']LOG]!><time="'),
      'year': _FOUR_DIGITS.setResultsName('year'),
      'month': _ONE_OR_TWO_DIGITS.setResultsName('month'),
      'day': _ONE_OR_TWO_DIGITS.setResultsName('day'),
      # Sub-second precision varies from 3 digits (milliseconds) up to
      # 7 digits (100 nanosecond intervals).
      'microsecond': pyparsing.Regex(r'\d{3,7}').
                     setResultsName('microsecond'),
      # Offset from UTC in minutes, e.g. '-480' or '+060'.
      'utc_offset_minutes': pyparsing.Regex(r'[-+]\d{3}').setResultsName(
          'utc_offset_minutes'),
      'date_prefix': pyparsing.Literal('" date="').
                     setResultsName('date_prefix'),
      'component_prefix': pyparsing.Literal('" component="').setResultsName(
          'component_prefix'),
      'component': pyparsing.Word(pyparsing.alphanums).setResultsName(
          'component'),
      # Message text: everything up to (not including) the closing delimiter.
      'text': pyparsing.Regex(
          r'.*?(?=(]LOG]!><time="))', re.DOTALL).setResultsName('text'),
      # Everything up to the next opening delimiter (next log message).
      'line_remainder': pyparsing.Regex(
          r'.*?(?=(\<!\[LOG\[))', re.DOTALL).setResultsName('line_remainder'),
      'lastline_remainder': pyparsing.restOfLine.setResultsName(
          'lastline_remainder'),
      'hour': _ONE_OR_TWO_DIGITS.setResultsName('hour'),
      'minute': text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
          'minute'),
      'second': text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
          'second')}

  # Base grammar for individual log event lines.
  LINE_GRAMMAR_BASE = (
      _PARSING_COMPONENTS['msg_left_delimiter'] +
      _PARSING_COMPONENTS['text'] +
      _PARSING_COMPONENTS['msg_right_delimiter'] +
      _PARSING_COMPONENTS['hour'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['minute'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['second'] +
      pyparsing.Suppress('.') + _PARSING_COMPONENTS['microsecond'] +
      _PARSING_COMPONENTS['date_prefix'] + _PARSING_COMPONENTS['month'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['day'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['year'] +
      _PARSING_COMPONENTS['component_prefix'] +
      _PARSING_COMPONENTS['component'])

  # Grammar for individual log event lines with a minutes offset from UTC.
  LINE_GRAMMAR_OFFSET = (
      _PARSING_COMPONENTS['msg_left_delimiter'] +
      _PARSING_COMPONENTS['text'] +
      _PARSING_COMPONENTS['msg_right_delimiter'] +
      _PARSING_COMPONENTS['hour'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['minute'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['second'] +
      pyparsing.Suppress('.') + _PARSING_COMPONENTS['microsecond'] +
      _PARSING_COMPONENTS['utc_offset_minutes'] +
      _PARSING_COMPONENTS['date_prefix'] + _PARSING_COMPONENTS['month'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['day'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['year'] +
      _PARSING_COMPONENTS['component_prefix'] +
      _PARSING_COMPONENTS['component'])

  # The *_at_end variants handle the final message in the file, which has no
  # following '<![LOG[' delimiter to anchor 'line_remainder' against.
  LINE_STRUCTURES = [
      ('log_entry',
       LINE_GRAMMAR_BASE + _PARSING_COMPONENTS['line_remainder']),
      ('log_entry_at_end',
       LINE_GRAMMAR_BASE + _PARSING_COMPONENTS['lastline_remainder'] +
       pyparsing.lineEnd),
      ('log_entry_offset',
       LINE_GRAMMAR_OFFSET + _PARSING_COMPONENTS['line_remainder']),
      ('log_entry_offset_at_end',
       LINE_GRAMMAR_OFFSET + _PARSING_COMPONENTS['lastline_remainder'] +
       pyparsing.lineEnd)]

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses the record and produces an SCCM log event.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      key (str): name of the matched grammar (one of the keys of
          LINE_STRUCTURES).
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Raises:
      ParseError: when the structure type is unknown.
      TimestampError: when a non-int value for microseconds is encountered.
    """
    if key not in (
        'log_entry', 'log_entry_at_end', 'log_entry_offset',
        'log_entry_offset_at_end'):
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    # Sometimes, SCCM logs will exhibit a seven-digit sub-second precision
    # (100 nanosecond intervals). Using six-digit precision because
    # timestamps are in microseconds.
    if len(structure.microsecond) > 6:
      structure.microsecond = structure.microsecond[0:6]

    try:
      microseconds = int(structure.microsecond, 10)
    except ValueError as exception:
      parser_mediator.ProduceExtractionError(
          'unable to determine microseconds with error: {0!s}'.format(
              exception))
      return

    # 3-digit precision is milliseconds,
    # so multiply by 1000 to convert to microseconds
    if len(structure.microsecond) == 3:
      microseconds *= 1000

    try:
      timestamp = timelib.Timestamp.FromTimeParts(
          structure.year, structure.month, structure.day, structure.hour,
          structure.minute, structure.second, microseconds)
    except errors.TimestampError as exception:
      # Produce an event with a placeholder timestamp rather than dropping
      # the message entirely.
      timestamp = timelib.Timestamp.NONE_TIMESTAMP
      parser_mediator.ProduceExtractionError(
          'unable to determine timestamp with error: {0!s}'.format(
              exception))

    # If an offset is given for the event, apply the offset to convert to UTC.
    if timestamp and 'offset' in key:
      try:
        # Skip the leading sign character; it is handled separately below.
        delta_microseconds = int(structure.utc_offset_minutes[1:], 10)
      except (IndexError, ValueError) as exception:
        raise errors.TimestampError(
            'Unable to parse minute offset from UTC with error: {0!s}.'.format(
                exception))

      delta_microseconds *= self._MICRO_SECONDS_PER_MINUTE
      if structure.utc_offset_minutes[0] == '-':
        delta_microseconds = -delta_microseconds
      timestamp += delta_microseconds

    event_data = SCCMLogEventData()
    event_data.component = structure.component
    # TODO: pass line number to offset or remove.
    event_data.offset = 0
    event_data.text = structure.text

    event = time_events.TimestampEvent(
        timestamp, definitions.TIME_DESCRIPTION_WRITTEN)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def VerifyStructure(self, parser_mediator, lines):
    """Verifies whether content corresponds to an SCCM log file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      lines (str): one or more lines from the text file.

    Returns:
      bool: True if this is the correct parser, False otherwise.
    """
    # Identify the token to which we attempt a match.
    # Note: .match on a pyparsing.Literal is the literal's match string,
    # i.e. '<![LOG['.
    match = self._PARSING_COMPONENTS['msg_left_delimiter'].match

    # Because logs files can lead with a partial event,
    # we can't assume that the first character (post-BOM)
    # in the file is the beginning of our match - so we
    # look for match anywhere in lines.
    return match in lines
class SyslogParser(text_parser.PyparsingMultiLineTextParser):
  """Parses syslog formatted log files."""

  NAME = 'syslog'
  DESCRIPTION = 'Syslog Parser'

  _ENCODING = 'utf-8'

  _plugin_classes = {}

  # The reporter and facility fields can contain any printable character, but
  # to allow for processing of syslog formats that delimit the reporter and
  # facility with printable characters, we remove certain common delimiters
  # from the set of printable characters.
  _REPORTER_CHARACTERS = ''.join(
      [c for c in pyparsing.printables if c not in [':', '[', '<']])
  _FACILITY_CHARACTERS = ''.join(
      [c for c in pyparsing.printables if c not in [':', '>']])

  _SYSLOG_SEVERITY = [
      'EMERG', 'ALERT', 'CRIT', 'ERR', 'WARNING', 'NOTICE', 'INFO', 'DEBUG']

  _OFFSET_PREFIX = ['-', '+']

  # Body text runs until either end of input or the start of the next
  # traditional or ChromeOS (ISO 8601) syslog timestamp.
  _BODY_CONTENT = (
      r'.*?(?=($|\n\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2})|'
      r'($|\n\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}'
      r'[\+|-]\d{2}:\d{2}\s))')

  _VERIFICATION_REGEX = re.compile(
      r'^\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2}\s' + _BODY_CONTENT)

  # The Chrome OS syslog messages are of a format beginning with an
  # ISO 8601 combined date and time expression with timezone designator:
  #   2016-10-25T12:37:23.297265-07:00
  #
  # This will then be followed by the SYSLOG Severity which will be one of:
  #   EMERG,ALERT,CRIT,ERR,WARNING,NOTICE,INFO,DEBUG
  #
  #   2016-10-25T12:37:23.297265-07:00 INFO
  _CHROMEOS_VERIFICATION_REGEX = re.compile(
      r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.'
      r'\d{6}[\+|-]\d{2}:\d{2}\s'
      r'(EMERG|ALERT|CRIT|ERR|WARNING|NOTICE|INFO|DEBUG)' + _BODY_CONTENT)

  # PyParsing components used to construct the line grammars.
  _PYPARSING_COMPONENTS = {
      'year': text_parser.PyparsingConstants.FOUR_DIGITS.setResultsName(
          'year'),
      'two_digit_month': (
          text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
              'two_digit_month')),
      'month': text_parser.PyparsingConstants.MONTH.setResultsName('month'),
      'day': text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS.setResultsName(
          'day'),
      'hour': text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
          'hour'),
      'minute': text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
          'minute'),
      'second': text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
          'second'),
      'fractional_seconds': pyparsing.Word(pyparsing.nums).setResultsName(
          'fractional_seconds'),
      'hostname': pyparsing.Word(pyparsing.printables).setResultsName(
          'hostname'),
      'reporter': pyparsing.Word(_REPORTER_CHARACTERS).setResultsName(
          'reporter'),
      'pid': text_parser.PyparsingConstants.PID.setResultsName('pid'),
      'facility': pyparsing.Word(_FACILITY_CHARACTERS).setResultsName(
          'facility'),
      'severity': pyparsing.oneOf(_SYSLOG_SEVERITY).setResultsName(
          'severity'),
      'body': pyparsing.Regex(_BODY_CONTENT, re.DOTALL).setResultsName(
          'body'),
      'comment_body': pyparsing.SkipTo(' ---').setResultsName('body'),
      'iso_8601_offset': (
          pyparsing.oneOf(_OFFSET_PREFIX) +
          text_parser.PyparsingConstants.TWO_DIGITS +
          pyparsing.Optional(
              pyparsing.Literal(':') +
              text_parser.PyparsingConstants.TWO_DIGITS))}

  # Traditional syslog date: "Jan  1 12:34:56[.123456]".
  _PYPARSING_COMPONENTS['date'] = (
      _PYPARSING_COMPONENTS['month'] +
      _PYPARSING_COMPONENTS['day'] +
      _PYPARSING_COMPONENTS['hour'] + pyparsing.Suppress(':') +
      _PYPARSING_COMPONENTS['minute'] + pyparsing.Suppress(':') +
      _PYPARSING_COMPONENTS['second'] + pyparsing.Optional(
          pyparsing.Suppress('.') +
          _PYPARSING_COMPONENTS['fractional_seconds']))

  # ISO 8601 date with timezone offset, combined into a single token.
  _PYPARSING_COMPONENTS['iso_8601_date'] = pyparsing.Combine(
      _PYPARSING_COMPONENTS['year'] + pyparsing.Literal('-') +
      _PYPARSING_COMPONENTS['two_digit_month'] + pyparsing.Literal('-') +
      _PYPARSING_COMPONENTS['day'] + pyparsing.Literal('T') +
      _PYPARSING_COMPONENTS['hour'] + pyparsing.Literal(':') +
      _PYPARSING_COMPONENTS['minute'] + pyparsing.Literal(':') +
      _PYPARSING_COMPONENTS['second'] + pyparsing.Literal('.') +
      _PYPARSING_COMPONENTS['fractional_seconds'] +
      _PYPARSING_COMPONENTS['iso_8601_offset'],
      joinString='', adjacent=True).setResultsName('iso_8601_date')

  _CHROMEOS_SYSLOG_LINE = (
      _PYPARSING_COMPONENTS['iso_8601_date'] +
      _PYPARSING_COMPONENTS['severity'] +
      _PYPARSING_COMPONENTS['reporter'] +
      pyparsing.Optional(pyparsing.Suppress(':')) +
      pyparsing.Optional(
          pyparsing.Suppress('[') + _PYPARSING_COMPONENTS['pid'] +
          pyparsing.Suppress(']')) +
      pyparsing.Optional(pyparsing.Suppress(':')) +
      _PYPARSING_COMPONENTS['body'] + pyparsing.lineEnd())

  _SYSLOG_LINE = (
      _PYPARSING_COMPONENTS['date'] +
      _PYPARSING_COMPONENTS['hostname'] +
      _PYPARSING_COMPONENTS['reporter'] +
      pyparsing.Optional(
          pyparsing.Suppress('[') + _PYPARSING_COMPONENTS['pid'] +
          pyparsing.Suppress(']')) +
      pyparsing.Optional(
          pyparsing.Suppress('<') + _PYPARSING_COMPONENTS['facility'] +
          pyparsing.Suppress('>')) +
      pyparsing.Optional(pyparsing.Suppress(':')) +
      _PYPARSING_COMPONENTS['body'] + pyparsing.lineEnd())

  # Comment lines of the form "<date>: --- <text> ---".
  _SYSLOG_COMMENT = (
      _PYPARSING_COMPONENTS['date'] + pyparsing.Suppress(':') +
      pyparsing.Suppress('---') + _PYPARSING_COMPONENTS['comment_body'] +
      pyparsing.Suppress('---') + pyparsing.LineEnd())

  # Kernel messages carry no hostname; the reporter is literally 'kernel'.
  _KERNEL_SYSLOG_LINE = (
      _PYPARSING_COMPONENTS['date'] +
      pyparsing.Literal('kernel').setResultsName('reporter') +
      pyparsing.Suppress(':') +
      _PYPARSING_COMPONENTS['body'] + pyparsing.lineEnd())

  LINE_STRUCTURES = [
      ('syslog_line', _SYSLOG_LINE),
      ('syslog_line', _KERNEL_SYSLOG_LINE),
      ('syslog_comment', _SYSLOG_COMMENT),
      ('chromeos_syslog_line', _CHROMEOS_SYSLOG_LINE)]

  _SUPPORTED_KEYS = frozenset([key for key, _ in LINE_STRUCTURES])

  def __init__(self):
    """Initializes a parser."""
    super(SyslogParser, self).__init__()
    self._last_month = 0
    self._maximum_year = 0
    self._plugin_by_reporter = {}
    self._year_use = 0

  def _UpdateYear(self, mediator, month):
    """Updates the year to use for events, based on last observed month.

    Traditional syslog timestamps carry no year, so the year is estimated
    and incremented whenever the month rolls over backwards.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      month (int): month observed by the parser, where January is 1.
    """
    if not self._year_use:
      self._year_use = mediator.GetEstimatedYear()
    if not self._maximum_year:
      self._maximum_year = mediator.GetLatestYear()

    if not self._last_month:
      self._last_month = month
      return

    # Some syslog daemons allow out-of-order sequences, so allow some leeway
    # to not cause Apr->May->Apr to cause the year to increment.
    # See http://bugzilla.adiscon.com/show_bug.cgi?id=527
    if self._last_month > (month + 1):
      if self._year_use != self._maximum_year:
        self._year_use += 1
    self._last_month = month

  def EnablePlugins(self, plugin_includes):
    """Enables parser plugins.

    Args:
      plugin_includes (list[str]): names of the plugins to enable, where
          None or an empty list represents all plugins. Note that the
          default plugin is handled separately.
    """
    super(SyslogParser, self).EnablePlugins(plugin_includes)

    # Index the enabled plugins by the reporter value they handle.
    self._plugin_by_reporter = {}
    for plugin in self._plugins:
      self._plugin_by_reporter[plugin.REPORTER] = plugin

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a matching entry.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): elements parsed from the file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in self._SUPPORTED_KEYS:
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    if key == 'chromeos_syslog_line':
      timestamp = timelib.Timestamp.FromTimeString(
          structure.iso_8601_date[0])
    else:
      month = timelib.MONTH_DICT.get(structure.month.lower(), None)
      if not month:
        # BUG FIX: this previously formatted the (None) lookup result with
        # '{0:s}', which raises TypeError; report the unrecognized input
        # value instead.
        parser_mediator.ProduceParserError(
            'Invalid month value: {0!s}'.format(structure.month))
        return

      self._UpdateYear(parser_mediator, month)
      timestamp = timelib.Timestamp.FromTimeParts(
          year=self._year_use, month=month, day=structure.day,
          hour=structure.hour, minutes=structure.minute,
          seconds=structure.second, timezone=parser_mediator.timezone)

    plugin = None
    if key == 'syslog_comment':
      event_data = SyslogCommentEventData()
      event_data.body = structure.body
      # TODO: pass line number to offset or remove.
      event_data.offset = 0

    else:
      event_data = SyslogLineEventData()
      event_data.body = structure.body
      event_data.hostname = structure.hostname or None
      # TODO: pass line number to offset or remove.
      event_data.offset = 0
      event_data.pid = structure.pid
      event_data.reporter = structure.reporter
      event_data.severity = structure.severity

      plugin = self._plugin_by_reporter.get(structure.reporter, None)
      if plugin:
        attributes = {
            'hostname': structure.hostname,
            'severity': structure.severity,
            'reporter': structure.reporter,
            'pid': structure.pid,
            'body': structure.body}

        try:
          # TODO: pass event_data instead of attributes.
          plugin.Process(parser_mediator, timestamp, attributes)

        except errors.WrongPlugin:
          # Fall back to producing a plain syslog event below.
          plugin = None

    if not plugin:
      event = time_events.TimestampEvent(
          timestamp, definitions.TIME_DESCRIPTION_WRITTEN)
      parser_mediator.ProduceEventWithEventData(event, event_data)

  def VerifyStructure(self, unused_parser_mediator, lines):
    """Verifies that this is a syslog-formatted file.

    Args:
      unused_parser_mediator (ParserMediator): mediates interactions
          between parsers and other components, such as storage and dfvfs.
      lines (str): one or more lines from the text file.

    Returns:
      bool: True if this is the correct parser, False otherwise.
    """
    return (re.match(self._VERIFICATION_REGEX, lines) or
            re.match(self._CHROMEOS_VERIFICATION_REGEX, lines)) is not None
def _build_tgrep_parser(set_parse_actions=True):
    '''
    Builds a pyparsing-based parser object for tokenizing and
    interpreting tgrep search strings.

    When ``set_parse_actions`` is True the grammar's parse actions are
    attached, so parsing produces predicate functions; when False the
    grammar only tokenizes.
    '''
    # A relation operator, optionally negated with '!'.
    tgrep_op = (pyparsing.Optional('!') +
                pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*'))
    # Node match by quoted literal string or /regex/.
    tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\',
                                           unquoteResults=False)
    tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\',
                                              unquoteResults=False)
    # Case-insensitive variants, prefixed with 'i@'.
    tgrep_qstring_icase = pyparsing.Regex(
        'i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
    tgrep_node_regex_icase = pyparsing.Regex(
        'i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
    # Bare node name: any run of characters excluding tgrep punctuation.
    tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
    tgrep_expr = pyparsing.Forward()
    tgrep_relations = pyparsing.Forward()
    tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
    # NLTK tree position notation: N(i,j,...).
    tgrep_nltk_tree_pos = (
        pyparsing.Literal('N(') +
        pyparsing.Optional(
            pyparsing.Word(pyparsing.nums) + ',' +
            pyparsing.Optional(
                pyparsing.delimitedList(
                    pyparsing.Word(pyparsing.nums), delim=',') +
                pyparsing.Optional(','))) + ')')
    tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
    # Use of a previously bound node label: '=label'.
    tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
    # see _tgrep_segmented_pattern_action
    tgrep_node_label_use_pred = tgrep_node_label_use.copy()
    macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
    # No implicit whitespace skipping: macro names must hug the '@'.
    macro_name.setWhitespaceChars('')
    macro_use = pyparsing.Combine('@' + macro_name)
    # Ordered alternation: more specific node forms are tried first.
    tgrep_node_expr = (tgrep_node_label_use_pred |
                       macro_use |
                       tgrep_nltk_tree_pos |
                       tgrep_qstring_icase |
                       tgrep_node_regex_icase |
                       tgrep_qstring |
                       tgrep_node_regex |
                       '*' |
                       tgrep_node_literal)
    # Node expression optionally binding a label: expr=label (no spaces).
    tgrep_node_expr2 = (
        (tgrep_node_expr +
         pyparsing.Literal('=').setWhitespaceChars('') +
         tgrep_node_label.copy().setWhitespaceChars('')) | tgrep_node_expr)
    tgrep_node = (tgrep_parens |
                  (pyparsing.Optional("'") + tgrep_node_expr2 +
                   pyparsing.ZeroOrMore("|" + tgrep_node_expr)))
    # Bracketed (optionally negated) group of relations.
    tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
    tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
    tgrep_rel_conjunction = pyparsing.Forward()
    # Conjunction: relations joined by an optional '&'.
    tgrep_rel_conjunction << (
        tgrep_relation +
        pyparsing.ZeroOrMore(pyparsing.Optional('&') +
                             tgrep_rel_conjunction))
    # Disjunction of conjunctions, joined by '|'.
    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
        "|" + tgrep_relations)
    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
    # Segmented pattern piece: '=label relations'.
    tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(
        tgrep_relations)
    tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
    # Macro definition: '@ name expression'.
    macro_defn = (pyparsing.Literal('@') + pyparsing.White().suppress() +
                  macro_name + tgrep_expr2)
    # Whole input: optional leading macro definitions, then one or more
    # ';'-separated expressions or further macro definitions.
    tgrep_exprs = (
        pyparsing.Optional(macro_defn +
                           pyparsing.ZeroOrMore(';' + macro_defn) + ';') +
        tgrep_expr2 +
        pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2)) +
        pyparsing.ZeroOrMore(';').suppress())
    if set_parse_actions:
        tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
        tgrep_node_label_use_pred.setParseAction(
            _tgrep_node_label_pred_use_action)
        macro_use.setParseAction(_tgrep_macro_use_action)
        tgrep_node.setParseAction(_tgrep_node_action)
        tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action)
        tgrep_parens.setParseAction(_tgrep_parens_action)
        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
        tgrep_relation.setParseAction(_tgrep_relation_action)
        tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action)
        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
        macro_defn.setParseAction(_macro_defn_action)
        # the whole expression is also the conjunction of two
        # predicates: the first node predicate, and the remaining
        # relation predicates
        tgrep_expr.setParseAction(_tgrep_conjunction_action)
        tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
        tgrep_expr2.setParseAction(
            functools.partial(_tgrep_conjunction_action, join_char=':'))
        tgrep_exprs.setParseAction(_tgrep_exprs_action)
    # '#' starts a comment that runs to the end of the line.
    return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
# Paragraph marker at any known depth; the parse action flattens the match.
any_depth_p = unified.any_depth_p.copy().setParseAction(_any_depth_parse)


def initial_markers(text):
    """Pull out a list of the first paragraph markers, i.e. markers
    before any text"""
    try:
        return list(any_depth_p.parseString(text))
    except pyparsing.ParseException:
        return []


_collapsed_grammar = QuickSearchable(
    # A guard to reduce false positives
    pyparsing.Suppress(pyparsing.Regex(u',|\.|-|—|>|means ')) + any_depth_p)


def collapsed_markers(text):
    """Not all paragraph markers are at the beginning of the text. This
    grabs inner markers like (1) and (i) here:
    (c) cContent —(1) 1Content (i) iContent"""
    potential = [triplet for triplet in _collapsed_grammar.scanString(text)]
    # remove any that overlap with citations
    potential = [trip for trip in remove_citation_overlaps(text, potential)]
    # flatten the results
    potential = [pm for pms, _, _ in potential for pm in pms]
    # remove any matches that aren't (a), (1), (i), etc. -- All other
    # markers can't be collapsed
    first_markers = [level[0] for level in p_levels]
    potential = [pm for pm in potential if pm in first_markers]
    # NOTE(review): this function appears truncated in this chunk —
    # presumably it ends with ``return potential``; confirm against the
    # full file.
# References: # http://stackoverflow.com/questions/11133339/parsing-a-complex-logical-expression-in-pyparsing-in-a-binary-tree-fashion # http://stackoverflow.com/questions/33532451/pyparsing-python-binary-boolean-expression-to-xml-nesting-issue-2-7-10 # http://pyparsing.wikispaces.com/file/view/simpleArith.py/30268305/simpleArith.py # http://qiita.com/knoguchi/items/ee949989d0a9f04bee6f # http://qiita.com/knoguchi/items/6f9b7383b7252a9ebdad """ import logging import pyparsing as pp import networkx as nx from patternmatching.query.Condition import * LPAR, RPAR = map(pp.Suppress, "()") numvalue = pp.Regex(r"\d+(\.\d*)?([eE][+-]?\d+)?") term = pp.Forward() factor = pp.Forward() addsub = pp.oneOf('+ -') muldiv = pp.oneOf('* /') compare = pp.Regex(">=|<=|!=|>|<|==").setName("compare") NOT_ = pp.Keyword("NOT").setName("NOT") AND_ = pp.Keyword("AND").setName("AND") OR_ = pp.Keyword("OR").setName("OR") symbol = pp.Word(pp.alphas).setName("symbol") propsymbol = pp.Group(symbol + "." + symbol).setName("propsymbol") formula = pp.Optional(addsub) + term + pp.ZeroOrMore(addsub + term) term << (factor + pp.ZeroOrMore(muldiv + factor)) factor << (numvalue | propsymbol | LPAR + formula + RPAR)
def word_token_regex(disallowed_delimiter):
    """Build a pyparsing Regex that matches a run of characters containing
    neither whitespace nor the given delimiter character(s)."""
    escaped = re.escape(disallowed_delimiter)
    pattern = r"[^\s\n" + escaped + r"]+"
    return pypar.Regex(pattern)
class SELinuxParser(text_parser.PyparsingSingleLineTextParser):
  """Parser for SELinux audit.log files."""

  NAME = 'selinux'
  DESCRIPTION = 'Parser for SELinux audit.log files.'

  _ENCODING = 'utf-8'

  # A key=value pair; the value is either a quoted string or a bare
  # printable token.
  _SELINUX_KEY_VALUE_GROUP = pyparsing.Group(
      pyparsing.Word(pyparsing.alphanums).setResultsName('key') +
      pyparsing.Suppress('=') + (
          pyparsing.QuotedString('"') ^
          pyparsing.Word(pyparsing.printables)).setResultsName('value'))

  # The message body re-parsed as a dictionary of key=value pairs.
  _SELINUX_KEY_VALUE_DICT = pyparsing.Dict(
      pyparsing.ZeroOrMore(_SELINUX_KEY_VALUE_GROUP))

  # Remainder of the line, captured under an empty key.
  _SELINUX_BODY_GROUP = pyparsing.Group(
      pyparsing.Empty().setResultsName('key') +
      pyparsing.restOfLine.setResultsName('value'))

  # msg=audit(<seconds>.<milliseconds>:<serial>):
  _SELINUX_MSG_GROUP = pyparsing.Group(
      pyparsing.Literal('msg').setResultsName('key') +
      pyparsing.Suppress('=audit(') +
      pyparsing.Word(pyparsing.nums).setResultsName('seconds') +
      pyparsing.Suppress('.') +
      pyparsing.Word(pyparsing.nums).setResultsName('milliseconds') +
      pyparsing.Suppress(':') +
      pyparsing.Word(pyparsing.nums).setResultsName('serial') +
      pyparsing.Suppress('):'))

  # type=<UPPER_CASE_TYPE> or type=UNKNOWN[<number>]
  _SELINUX_TYPE_GROUP = pyparsing.Group(
      pyparsing.Literal('type').setResultsName('key') +
      pyparsing.Suppress('=') + (
          pyparsing.Word(pyparsing.srange('[A-Z_]')) ^
          pyparsing.Regex(r'UNKNOWN\[[0-9]+\]')).setResultsName('value'))

  # NOTE(review): pyparsing.Word('AVC') matches any run of the characters
  # 'A', 'V' and 'C' (e.g. 'CAV'), not the literal 'AVC'; Keyword was
  # probably intended. This group is currently unused by LINE_STRUCTURES.
  _SELINUX_TYPE_AVC_GROUP = pyparsing.Group(
      pyparsing.Literal('type').setResultsName('key') +
      pyparsing.Suppress('=') + (
          pyparsing.Word('AVC') ^
          pyparsing.Word('USER_AVC')).setResultsName('value'))

  # A log line is formatted as: type=TYPE msg=audit([0-9]+\.[0-9]+:[0-9]+): .*
  _SELINUX_LOG_LINE = pyparsing.Dict(
      _SELINUX_TYPE_GROUP + _SELINUX_MSG_GROUP + _SELINUX_BODY_GROUP)

  LINE_STRUCTURES = [('line', _SELINUX_LOG_LINE)]

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a structure of tokens derived from a line of a text file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key != 'line':
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    msg_value = self._GetValueFromStructure(structure, 'msg')
    if not msg_value:
      parser_mediator.ProduceExtractionWarning(
          'missing msg value: {0!s}'.format(structure))
      return

    try:
      seconds = int(msg_value[0], 10)
    except ValueError:
      parser_mediator.ProduceExtractionWarning(
          'unsupported number of seconds in msg value: {0!s}'.format(
              structure))
      return

    try:
      milliseconds = int(msg_value[1], 10)
    except ValueError:
      parser_mediator.ProduceExtractionWarning(
          'unsupported number of milliseconds in msg value: {0!s}'.format(
              structure))
      return

    # Timestamp in microseconds since epoch.
    timestamp = ((seconds * 1000) + milliseconds) * 1000
    # The third group of the line structure is the (empty-key) body group.
    body_text = structure[2][0]

    try:
      # Try to parse the body text as key value pairs. Note that not
      # all log lines will be properly formatted key value pairs.
      body_structure = self._SELINUX_KEY_VALUE_DICT.parseString(body_text)
    except pyparsing.ParseException:
      body_structure = pyparsing.ParseResults()

    event_data = SELinuxLogEventData()
    event_data.audit_type = self._GetValueFromStructure(structure, 'type')
    event_data.body = body_text
    event_data.pid = self._GetValueFromStructure(body_structure, 'pid')
    # TODO: pass line number to offset or remove.
    event_data.offset = 0

    event = time_events.TimestampEvent(
        timestamp, definitions.TIME_DESCRIPTION_WRITTEN)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def VerifyStructure(self, parser_mediator, line):
    """Verifies if a line from a text file is in the expected format.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      line (str): line from a text file.

    Returns:
      bool: True if the line is in the expected format, False if not.
    """
    try:
      structure = self._SELINUX_LOG_LINE.parseString(line)
    except pyparsing.ParseException as exception:
      logger.debug(
          'Unable to parse SELinux audit.log file with error: {0!s}'.format(
              exception))
      return False

    return 'type' in structure and 'msg' in structure
class Grammar:
    # Language keywords: reserved words that may not be used as variables.
    keywords = [
        'and', 'or', 'not', 'if', 'then', 'else', 'include', 'inherit',
        'null', 'true', 'false', 'for', 'in']

    # This is a hack: this condition helps uselessly recursing into the
    # grammar for juxtapositions.
    early_abort_scan = ~p.oneOf([';', ',', ']', '}', 'for'])

    # Forward-declared so sub-grammars below can refer to it.
    expression = pattern('expression', p.Forward())

    # '#' starts a comment unless followed by '.', which is a doc comment.
    comment = p.Regex('#') + ~p.FollowedBy(sym('.')) + p.restOfLine
    doc_comment = pattern('doc_comment', (sym('#.') - p.restOfLine))

    quotedIdentifier = pattern(
        'quotedIdentifier', p.QuotedString('`', multiline=False))

    # - Must start with an alphascore
    # - May contain alphanumericscores and special characters such as : and -
    # - Must not end in a special character
    identifier = pattern(
        'identifier',
        parseWithLocation(
            quotedIdentifier |
            p.Regex(r'[a-zA-Z_]([a-zA-Z0-9_:-]*[a-zA-Z0-9_])?'),
            Identifier))

    # Variable identifier (can't be any of the keywords, which may have
    # lower matching priority)
    variable = pattern(
        'variable',
        ~p.MatchFirst(p.oneOf(keywords)) +
        pattern('identifier', parseWithLocation(identifier.copy(), Var)))

    # Constants
    integer = pattern(
        'integer',
        parseWithLocation(p.Word(p.nums), convertAndMake(int, Literal)))
    floating = pattern(
        'floating',
        parseWithLocation(p.Regex(r'\d*\.\d+'),
                          convertAndMake(float, Literal)))
    dq_string = pattern(
        'dq_string',
        parseWithLocation(
            p.QuotedString('"', escChar='\\', unquoteResults=False,
                           multiline=True),
            convertAndMake(unquote, Literal)))
    sq_string = pattern(
        'sq_string',
        parseWithLocation(
            p.QuotedString("'", escChar='\\', unquoteResults=False,
                           multiline=True),
            convertAndMake(unquote, Literal)))
    boolean = pattern(
        'boolean',
        parseWithLocation(
            p.Keyword('true') | p.Keyword('false'),
            convertAndMake(mkBool, Literal)))
    null = pattern('null', parseWithLocation(p.Keyword('null'), Null))

    # List
    list_ = pattern(
        'list',
        parseWithLocation(bracketedList('[', ']', ',', expression), List))

    # Tuple
    inherit = pattern(
        'inherit',
        (kw('inherit') - p.ZeroOrMore(variable)).setParseAction(inheritNodes))
    # Member schema: optional 'private'/'required' flags then a schema
    # expression; each flag defaults to False when absent.
    schema_spec = pattern(
        'schema_spec',
        parseWithLocation(
            p.Optional(p.Keyword('private').setParseAction(lambda: True),
                       default=False) -
            p.Optional(p.Keyword('required').setParseAction(lambda: True),
                       default=False) -
            p.Optional(expression, default=any_schema_expr),
            MemberSchemaNode))
    optional_schema = pattern(
        'optional_schema',
        p.Optional(p.Suppress(':') - schema_spec, default=no_schema))

    expression_value = pattern('expression_value',
                               sym('=') - swallow_errors(expression))
    # A member with no '= value' part, terminated by ';' or '}'.
    void_value = pattern(
        'void_value',
        parseWithLocation(p.FollowedBy(sym(';') | sym('}')),
                          lambda loc: Void(loc, 'nonameyet')))
    member_value = pattern('member_value',
                           swallow_errors(expression_value | void_value))
    named_member = pattern(
        'named_member',
        parseWithLocation(
            identifier - optional_schema - member_value - swallow_remainder(),
            TupleMemberNode))
    # A named member preceded by zero or more doc comments.
    documented_member = pattern(
        'documented_member',
        parseWithLocation(
            parseWithLocation(p.ZeroOrMore(doc_comment), DocComment) +
            named_member,
            attach_doc_comment))
    tuple_member = early_abort_scan + pattern(
        'tuple_member',
        swallow_errors(inherit | documented_member) - swallow_remainder())

    # Tuple node constructor with error tolerance pre-bound.
    ErrorAwareTupleNode = functools.partial(TupleNode, allow_errors)
    tuple_members = pattern(
        'tuple_members',
        parseWithLocation(listMembers(';', tuple_member),
                          ErrorAwareTupleNode))
    # NOTE(review): 'tuple' is a Forward that appears to be declared outside
    # this chunk — confirm against the full file.
    tuple << pattern(
        'tuple',
        parseWithLocation(
            bracketedList('{', '}', ';', tuple_member,
                          allow_missing_close=allow_errors),
            ErrorAwareTupleNode))

    # Argument list will live by itself as an atom. Actually, it's a tuple,
    # but we don't call it that because we use that term for something else
    # already :)
    arg_list = pattern(
        'arg_list',
        bracketedList('(', ')', ',', expression).setParseAction(ArgList))

    parenthesized_expr = pattern(
        'parenthesized_expr',
        (sym('(') - expression - ')').setParseAction(head))

    unary_op = pattern(
        'unary_op',
        (p.oneOf(' '.join(functions.unary_operators.keys())) -
         expression).setParseAction(mkUnOp))

    if_then_else = pattern(
        'if_then_else',
        parseWithLocation(
            kw('if') + expression + kw('then') + expression +
            kw('else') + expression,
            Condition))

    # [expr for var in expr (if expr)]
    list_comprehension = pattern(
        'list_comprehension',
        parseWithLocation(
            sym('[') + expression + kw('for') + variable + kw('in') +
            expression + p.Optional(kw('if') + expression) + sym(']'),
            ListComprehension))

    # We don't allow space-application here
    # Now our grammar is becoming very dirty and hackish
    deref = pattern('deref', p.Forward())
    include = pattern('include',
                      parseWithLocation(kw('include') - deref, Include))

    atom = pattern(
        'atom',
        (tuple | sq_string | dq_string | variable | floating | integer |
         boolean | list_ | null | unary_op | parenthesized_expr |
         if_then_else | include | list_comprehension))

    # We have two different forms of function application, so they can have 2
    # different precedences. This one: fn(args), which binds stronger than
    # dereferencing (fn(args).attr == (fn(args)).attr)
    applic1 = pattern(
        'applic1',
        parseWithLocation(atom - p.ZeroOrMore(arg_list), mkApplications))

    # Dereferencing of an expression (obj.bar)
    deref << parseWithLocation(
        applic1 - p.ZeroOrMore(p.Suppress('.') - swallow_errors(identifier)),
        mkDerefs)

    # All binary operators at various precedence levels go here:
    # This piece of code does the moral equivalent of:
    #
    #     T = F*F | F/F | F
    #     E = T+T | T-T | T
    #
    # etc.
    term = deref
    for op_level in functions.binary_operators_before_juxtaposition:
        operator_syms = list(op_level.keys())
        term = (term - p.ZeroOrMore(p.oneOf(operator_syms) -
                                    term)).setParseAction(mkBinOps)

    # Juxtaposition function application (fn arg), must be 1-arg every time
    applic2 = pattern(
        'applic2',
        parseWithLocation(term - p.ZeroOrMore(early_abort_scan + term),
                          mkApplications))

    term = applic2
    for op_level in functions.binary_operators_after_juxtaposition:
        operator_syms = list(op_level.keys())
        term = (term - p.ZeroOrMore(p.oneOf(operator_syms) -
                                    term)).setParseAction(mkBinOps)

    expression << term

    # Two entry points: start at an arbitrary expression, or expect the
    # top-level scope to be a tuple.
    start = pattern('start', expression.copy().ignore(comment))
    start_tuple = tuple_members.ignore(comment)
class SCCMParser(text_parser.PyparsingMultiLineTextParser):
  """Parser for Windows System Center Configuration Manager (SCCM) logs.

  SCCM log entries have the form:
    <![LOG[message]LOG]!><time="19:33:19.766-330" date="11-28-2014" ...
  where the time value carries 3, 6 or 7 fractional second digits and an
  optional UTC offset in minutes.
  """

  NAME = 'sccm'
  DESCRIPTION = 'Parser for SCCM logs files.'

  _ENCODING = 'utf-8-sig'

  # Increasing the buffer size as SCCM messages are commonly well larger
  # than the default value.
  BUFFER_SIZE = 16384

  LINE_STRUCTURES = []

  _FOUR_DIGITS = text_parser.PyparsingConstants.FOUR_DIGITS
  _ONE_OR_TWO_DIGITS = text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS

  # PyParsing Components used to construct grammars for parsing lines.
  _PARSING_COMPONENTS = {
      'msg_left_delimiter': pyparsing.Literal('<![LOG['),
      'msg_right_delimiter': pyparsing.Literal(']LOG]!><time="'),
      'year': _FOUR_DIGITS.setResultsName('year'),
      'month': _ONE_OR_TWO_DIGITS.setResultsName('month'),
      'day': _ONE_OR_TWO_DIGITS.setResultsName('day'),
      # 3 (milliseconds), 6 (microseconds) or 7 (100ns) fractional digits.
      'fraction_of_second':
          pyparsing.Regex(r'\d{3,7}').setResultsName('fraction_of_second'),
      'utc_offset_minutes':
          pyparsing.Regex(r'[-+]\d{2,3}').setResultsName('utc_offset_minutes'),
      'date_prefix':
          pyparsing.Literal('" date="').setResultsName('date_prefix'),
      'component_prefix':
          pyparsing.Literal('" component="').setResultsName('component_prefix'),
      'component':
          pyparsing.Word(pyparsing.alphanums).setResultsName('component'),
      # Message text runs up to (but not including) the closing delimiter.
      'text':
          pyparsing.Regex(
              r'.*?(?=(]LOG]!><time="))', re.DOTALL).setResultsName('text'),
      # Everything up to the start of the next log entry.
      'line_remainder':
          pyparsing.Regex(
              r'.*?(?=(\<!\[LOG\[))', re.DOTALL).setResultsName(
                  'line_remainder'),
      'lastline_remainder':
          pyparsing.restOfLine.setResultsName('lastline_remainder'),
      'hour': _ONE_OR_TWO_DIGITS.setResultsName('hour'),
      'minute':
          text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('minute'),
      'second':
          text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('second')}

  # Base grammar for individual log event lines.
  LINE_GRAMMAR_BASE = (
      _PARSING_COMPONENTS['msg_left_delimiter'] +
      _PARSING_COMPONENTS['text'] +
      _PARSING_COMPONENTS['msg_right_delimiter'] +
      _PARSING_COMPONENTS['hour'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['minute'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['second'] +
      pyparsing.Suppress('.') + _PARSING_COMPONENTS['fraction_of_second'] +
      _PARSING_COMPONENTS['date_prefix'] + _PARSING_COMPONENTS['month'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['day'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['year'] +
      _PARSING_COMPONENTS['component_prefix'] +
      _PARSING_COMPONENTS['component'])

  # Grammar for individual log event lines with a minutes offset from UTC.
  LINE_GRAMMAR_OFFSET = (
      _PARSING_COMPONENTS['msg_left_delimiter'] +
      _PARSING_COMPONENTS['text'] +
      _PARSING_COMPONENTS['msg_right_delimiter'] +
      _PARSING_COMPONENTS['hour'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['minute'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['second'] +
      pyparsing.Suppress('.') + _PARSING_COMPONENTS['fraction_of_second'] +
      _PARSING_COMPONENTS['utc_offset_minutes'] +
      _PARSING_COMPONENTS['date_prefix'] + _PARSING_COMPONENTS['month'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['day'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['year'] +
      _PARSING_COMPONENTS['component_prefix'] +
      _PARSING_COMPONENTS['component'])

  LINE_STRUCTURES = [
      ('log_entry',
       LINE_GRAMMAR_BASE + _PARSING_COMPONENTS['line_remainder']),
      ('log_entry_at_end',
       LINE_GRAMMAR_BASE + _PARSING_COMPONENTS['lastline_remainder'] +
       pyparsing.lineEnd),
      ('log_entry_offset',
       LINE_GRAMMAR_OFFSET + _PARSING_COMPONENTS['line_remainder']),
      ('log_entry_offset_at_end',
       LINE_GRAMMAR_OFFSET + _PARSING_COMPONENTS['lastline_remainder'] +
       pyparsing.lineEnd)]

  def _GetISO8601String(self, structure):
    """Retrieves an ISO8601 date time string from the structure.

    The date and time values in the SCCM log are formatted as:
    time="19:33:19.766-330" date="11-28-2014"

    Args:
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Returns:
      str: ISO 8601 date time string.

    Raises:
      ValueError: if the structure cannot be converted into a date time
          string.
    """
    fraction_of_second_length = len(structure.fraction_of_second)
    if fraction_of_second_length not in (3, 6, 7):
      raise ValueError(
          'unsupported time fraction of second length: {0:d}'.format(
              fraction_of_second_length))

    try:
      fraction_of_second = int(structure.fraction_of_second, 10)
    except (TypeError, ValueError) as exception:
      raise ValueError(
          'unable to determine fraction of second with error: {0!s}'.format(
              exception))

    # TODO: improve precision support, but for now ignore the 100ns precision.
    if fraction_of_second_length == 7:
      fraction_of_second, _ = divmod(fraction_of_second, 10)

    date_time_string = '{0:04d}-{1:02d}-{2:02d}T{3:02d}:{4:02d}:{5:02d}'.format(
        structure.year, structure.month, structure.day, structure.hour,
        structure.minute, structure.second)

    if fraction_of_second_length > 0:
      date_time_string = '{0:s}.{1:d}'.format(
          date_time_string, fraction_of_second)

    utc_offset_minutes = structure.get('utc_offset_minutes', None)
    if utc_offset_minutes is not None:
      try:
        # Strip the leading sign character before conversion.
        time_zone_offset = int(utc_offset_minutes[1:], 10)
      except (IndexError, ValueError) as exception:
        raise ValueError(
            'Unable to parse time zone offset with error: {0!s}.'.format(
                exception))

      time_zone_hours, time_zone_minutes = divmod(time_zone_offset, 60)
      date_time_string = '{0:s}{1:s}{2:02d}:{3:02d}'.format(
          date_time_string, utc_offset_minutes[0], time_zone_hours,
          time_zone_minutes)

    return date_time_string

  def ParseRecord(self, parser_mediator, key, structure):
    """Parse the record and return an SCCM log event object.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in ('log_entry', 'log_entry_at_end', 'log_entry_offset',
                   'log_entry_offset_at_end'):
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    try:
      date_time_string = self._GetISO8601String(structure)
    except ValueError as exception:
      parser_mediator.ProduceExtractionError(
          'unable to determine date time string with error: {0!s}'.format(
              exception))
      # BUGFIX: without this return, date_time_string (and, for fraction
      # lengths other than 3, 6 or 7, date_time below) would be undefined
      # and raise NameError/UnboundLocalError further down.
      return

    # _GetISO8601String succeeded, so the fraction length is 3, 6 or 7 and
    # exactly one of the branches below is taken.
    fraction_of_second_length = len(structure.fraction_of_second)
    if fraction_of_second_length == 3:
      date_time = dfdatetime_time_elements.TimeElementsInMilliseconds()
    elif fraction_of_second_length in (6, 7):
      date_time = dfdatetime_time_elements.TimeElementsInMicroseconds()

    try:
      date_time.CopyFromStringISO8601(date_time_string)
    except ValueError as exception:
      parser_mediator.ProduceExtractionError(
          'unable to parse date time value: {0:s} with error: {1!s}'.format(
              date_time_string, exception))
      return

    event_data = SCCMLogEventData()
    event_data.component = structure.component
    # TODO: pass line number to offset or remove.
    event_data.offset = 0
    event_data.text = structure.text

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_WRITTEN)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def VerifyStructure(self, parser_mediator, lines):
    """Verifies whether content corresponds to an SCCM log file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      lines (str): one or more lines from the text file.

    Returns:
      bool: True if this is the correct parser, False otherwise.
    """
    # Identify the token to which we attempt a match.
    match = self._PARSING_COMPONENTS['msg_left_delimiter'].match

    # Because logs files can lead with a partial event,
    # we can't assume that the first character (post-BOM)
    # in the file is the beginning of our match - so we
    # look for match anywhere in lines.
    return match in lines
class SyslogParser(text_parser.PyparsingMultiLineTextParser):
  """Parses syslog formatted log files"""

  NAME = 'syslog'
  DATA_FORMAT = 'System log (syslog) file'

  _ENCODING = 'utf-8'

  # Registry of syslog plugin classes, keyed by plugin name.
  _plugin_classes = {}

  # The reporter and facility fields can contain any printable character, but
  # to allow for processing of syslog formats that delimit the reporter and
  # facility with printable characters, we remove certain common delimiters
  # from the set of printable characters.
  _REPORTER_CHARACTERS = ''.join(
      [c for c in pyparsing.printables if c not in [':', '[', '<']])
  _FACILITY_CHARACTERS = ''.join(
      [c for c in pyparsing.printables if c not in [':', '>']])

  _SYSLOG_SEVERITY = [
      'EMERG', 'ALERT', 'CRIT', 'ERR', 'WARNING', 'NOTICE', 'INFO', 'DEBUG']

  # Non-greedy match up to end-of-input or the start of the next entry
  # (either a yearless ctime or an RFC 3339 timestamp).
  # TODO: change pattern to allow only spaces as a field separator.
  _BODY_PATTERN = (
      r'.*?(?=($|\n\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2})|' \
      r'($|\n\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}' \
      r'[\+|-]\d{2}:\d{2}\s))')

  # The rsyslog file format (RSYSLOG_FileFormat) consists of:
  # %TIMESTAMP% %HOSTNAME% %syslogtag%%msg%
  #
  # Where %TIMESTAMP% is in RFC-3339 date time format e.g.
  # 2020-05-31T00:00:45.698463+00:00
  _RSYSLOG_VERIFICATION_PATTERN = (
      r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.'
      r'\d{6}[\+|-]\d{2}:\d{2} ' + _BODY_PATTERN)

  # The rsyslog traditional file format (RSYSLOG_TraditionalFileFormat)
  # consists of:
  # %TIMESTAMP% %HOSTNAME% %syslogtag%%msg%
  #
  # Where %TIMESTAMP% is in yearless ctime date time format e.g.
  # Jan 22 07:54:32
  # TODO: change pattern to allow only spaces as a field separator.
  _RSYSLOG_TRADITIONAL_VERIFICATION_PATTERN = (
      r'^\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2}\s' + _BODY_PATTERN)

  # The Chrome OS syslog messages are of a format beginning with an
  # ISO 8601 combined date and time expression with timezone designator:
  # 2016-10-25T12:37:23.297265-07:00
  #
  # This will then be followed by the SYSLOG Severity which will be one of:
  # EMERG,ALERT,CRIT,ERR,WARNING,NOTICE,INFO,DEBUG
  #
  # 2016-10-25T12:37:23.297265-07:00 INFO
  _CHROMEOS_VERIFICATION_PATTERN = (
      r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.'
      r'\d{6}[\+|-]\d{2}:\d{2}\s'
      r'(EMERG|ALERT|CRIT|ERR|WARNING|NOTICE|INFO|DEBUG)' + _BODY_PATTERN)

  # Bundle all verification patterns into a single regular expression.
  _VERIFICATION_REGEX = re.compile('({0:s})'.format('|'.join([
      _CHROMEOS_VERIFICATION_PATTERN,
      _RSYSLOG_VERIFICATION_PATTERN,
      _RSYSLOG_TRADITIONAL_VERIFICATION_PATTERN])))

  # Building blocks shared by the line grammars below.
  _PYPARSING_COMPONENTS = {
      'year':
          text_parser.PyparsingConstants.FOUR_DIGITS.setResultsName('year'),
      'two_digit_month': (
          text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
              'two_digit_month')),
      'month':
          text_parser.PyparsingConstants.MONTH.setResultsName('month'),
      'day':
          text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS.setResultsName(
              'day'),
      'hour':
          text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('hour'),
      'minute':
          text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('minute'),
      'second':
          text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('second'),
      'fractional_seconds':
          pyparsing.Word(pyparsing.nums).setResultsName('fractional_seconds'),
      'hostname':
          pyparsing.Word(pyparsing.printables).setResultsName('hostname'),
      'reporter':
          pyparsing.Word(_REPORTER_CHARACTERS).setResultsName('reporter'),
      'pid':
          text_parser.PyparsingConstants.PID.setResultsName('pid'),
      'facility':
          pyparsing.Word(_FACILITY_CHARACTERS).setResultsName('facility'),
      'severity':
          pyparsing.oneOf(_SYSLOG_SEVERITY).setResultsName('severity'),
      'body':
          pyparsing.Regex(_BODY_PATTERN, re.DOTALL).setResultsName('body'),
      # Comment bodies are delimited by ' ---' markers; see _SYSLOG_COMMENT.
      'comment_body':
          pyparsing.SkipTo(' ---').setResultsName('body')}

  # Yearless ctime date and time, e.g. "Jan 22 07:54:32[.123]".
  _PYPARSING_COMPONENTS['date'] = (
      _PYPARSING_COMPONENTS['month'] +
      _PYPARSING_COMPONENTS['day'] +
      _PYPARSING_COMPONENTS['hour'] + pyparsing.Suppress(':') +
      _PYPARSING_COMPONENTS['minute'] + pyparsing.Suppress(':') +
      _PYPARSING_COMPONENTS['second'] + pyparsing.Optional(
          pyparsing.Suppress('.') +
          _PYPARSING_COMPONENTS['fractional_seconds']))

  # RFC 3339 date and time with microseconds and a UTC offset, combined
  # into a single adjacent token, e.g. "2020-05-31T00:00:45.698463+00:00".
  _PYPARSING_COMPONENTS['rfc3339_datetime'] = pyparsing.Combine(
      pyparsing.Word(pyparsing.nums, exact=4) + pyparsing.Literal('-') +
      pyparsing.Word(pyparsing.nums, exact=2) + pyparsing.Literal('-') +
      pyparsing.Word(pyparsing.nums, exact=2) + pyparsing.Literal('T') +
      pyparsing.Word(pyparsing.nums, exact=2) + pyparsing.Literal(':') +
      pyparsing.Word(pyparsing.nums, exact=2) + pyparsing.Literal(':') +
      pyparsing.Word(pyparsing.nums, exact=2) + pyparsing.Literal('.') +
      pyparsing.Word(pyparsing.nums, exact=6) + pyparsing.oneOf(['-', '+']) +
      pyparsing.Word(pyparsing.nums, exact=2) + pyparsing.Optional(
          pyparsing.Literal(':') + pyparsing.Word(pyparsing.nums, exact=2)),
      joinString='', adjacent=True)

  # Chrome OS: RFC 3339 timestamp, severity, reporter, optional [pid], body.
  _CHROMEOS_SYSLOG_LINE = (
      _PYPARSING_COMPONENTS['rfc3339_datetime'].setResultsName('datetime') +
      _PYPARSING_COMPONENTS['severity'] +
      _PYPARSING_COMPONENTS['reporter'] +
      pyparsing.Optional(pyparsing.Suppress(':')) +
      pyparsing.Optional(
          pyparsing.Suppress('[') + _PYPARSING_COMPONENTS['pid'] +
          pyparsing.Suppress(']')) +
      pyparsing.Optional(pyparsing.Suppress(':')) +
      _PYPARSING_COMPONENTS['body'] + pyparsing.lineEnd())

  # rsyslog: RFC 3339 timestamp, hostname, reporter, optional [pid],
  # optional <facility>, body.
  _RSYSLOG_LINE = (
      _PYPARSING_COMPONENTS['rfc3339_datetime'].setResultsName('datetime') +
      _PYPARSING_COMPONENTS['hostname'] +
      _PYPARSING_COMPONENTS['reporter'] +
      pyparsing.Optional(
          pyparsing.Suppress('[') + _PYPARSING_COMPONENTS['pid'] +
          pyparsing.Suppress(']')) +
      pyparsing.Optional(
          pyparsing.Suppress('<') + _PYPARSING_COMPONENTS['facility'] +
          pyparsing.Suppress('>')) +
      pyparsing.Optional(pyparsing.Suppress(':')) +
      _PYPARSING_COMPONENTS['body'] + pyparsing.lineEnd())

  # Traditional rsyslog: yearless ctime timestamp instead of RFC 3339.
  _RSYSLOG_TRADITIONAL_LINE = (
      _PYPARSING_COMPONENTS['date'] +
      _PYPARSING_COMPONENTS['hostname'] +
      _PYPARSING_COMPONENTS['reporter'] +
      pyparsing.Optional(
          pyparsing.Suppress('[') + _PYPARSING_COMPONENTS['pid'] +
          pyparsing.Suppress(']')) +
      pyparsing.Optional(
          pyparsing.Suppress('<') + _PYPARSING_COMPONENTS['facility'] +
          pyparsing.Suppress('>')) +
      pyparsing.Optional(pyparsing.Suppress(':')) +
      _PYPARSING_COMPONENTS['body'] + pyparsing.lineEnd())

  # Comment lines: "date: --- comment ---".
  _SYSLOG_COMMENT = (
      _PYPARSING_COMPONENTS['date'] + pyparsing.Suppress(':') +
      pyparsing.Suppress('---') + _PYPARSING_COMPONENTS['comment_body'] +
      pyparsing.Suppress('---') + pyparsing.LineEnd())

  # Kernel lines have a fixed "kernel" reporter and no hostname.
  _KERNEL_SYSLOG_LINE = (
      _PYPARSING_COMPONENTS['date'] +
      pyparsing.Literal('kernel').setResultsName('reporter') +
      pyparsing.Suppress(':') +
      _PYPARSING_COMPONENTS['body'] + pyparsing.lineEnd())

  LINE_STRUCTURES = [
      ('chromeos_syslog_line', _CHROMEOS_SYSLOG_LINE),
      ('kernel_syslog_line', _KERNEL_SYSLOG_LINE),
      ('rsyslog_line', _RSYSLOG_LINE),
      ('rsyslog_traditional_line', _RSYSLOG_TRADITIONAL_LINE),
      ('syslog_comment', _SYSLOG_COMMENT)]

  _SUPPORTED_KEYS = frozenset([key for key, _ in LINE_STRUCTURES])

  def __init__(self):
    """Initializes a parser."""
    super(SyslogParser, self).__init__()
    # State used to infer the (absent) year in traditional syslog timestamps.
    self._last_month = 0
    self._maximum_year = 0
    self._plugin_by_reporter = {}
    self._year_use = 0

  def _UpdateYear(self, mediator, month):
    """Updates the year to use for events, based on last observed month.

    Args:
      mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      month (int): month observed by the parser, where January is 1.
    """
    if not self._year_use:
      self._year_use = mediator.GetEstimatedYear()
    if not self._maximum_year:
      self._maximum_year = mediator.GetLatestYear()

    if not self._last_month:
      self._last_month = month
      return

    # Some syslog daemons allow out-of-order sequences, so allow some leeway
    # to not cause Apr->May->Apr to cause the year to increment.
    # See http://bugzilla.adiscon.com/show_bug.cgi?id=527
    if self._last_month > (month + 1):
      if self._year_use != self._maximum_year:
        self._year_use += 1
    self._last_month = month

  def EnablePlugins(self, plugin_includes):
    """Enables parser plugins.

    Args:
      plugin_includes (list[str]): names of the plugins to enable, where
          None or an empty list represents all plugins. Note that the default
          plugin is handled separately.
    """
    super(SyslogParser, self).EnablePlugins(plugin_includes)

    # Index enabled plugins by the reporter value they handle.
    self._plugin_by_reporter = {}
    for plugin in self._plugins:
      self._plugin_by_reporter[plugin.REPORTER] = plugin

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a matching entry.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): elements parsed from the file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in self._SUPPORTED_KEYS:
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    if key in ('chromeos_syslog_line', 'rsyslog_line'):
      # These grammars capture a full ISO 8601 timestamp.
      date_time = dfdatetime_time_elements.TimeElementsInMicroseconds()
      iso8601_string = self._GetValueFromStructure(structure, 'datetime')

      try:
        date_time.CopyFromStringISO8601(iso8601_string)
      except ValueError:
        parser_mediator.ProduceExtractionWarning(
            'invalid date time value: {0:s}'.format(iso8601_string))
        return

    else:
      # Yearless timestamp: reconstruct the year from parser state.
      # TODO: add support for fractional seconds.
      month = self._GetValueFromStructure(structure, 'month')
      try:
        month = timelib.MONTH_DICT.get(month.lower(), 0)
      except AttributeError:
        parser_mediator.ProduceExtractionWarning(
            'invalid month value: {0!s}'.format(month))
        return

      if month != 0:
        self._UpdateYear(parser_mediator, month)

      day = self._GetValueFromStructure(structure, 'day')
      hours = self._GetValueFromStructure(structure, 'hour')
      minutes = self._GetValueFromStructure(structure, 'minute')
      seconds = self._GetValueFromStructure(structure, 'second')

      time_elements_tuple = (
          self._year_use, month, day, hours, minutes, seconds)

      try:
        date_time = dfdatetime_time_elements.TimeElements(
            time_elements_tuple=time_elements_tuple)
        # Traditional syslog timestamps have no time zone designator.
        date_time.is_local_time = True
      except ValueError:
        parser_mediator.ProduceExtractionWarning(
            'invalid date time value: {0!s}'.format(time_elements_tuple))
        return

    plugin = None
    if key == 'syslog_comment':
      event_data = SyslogCommentEventData()
      event_data.body = self._GetValueFromStructure(structure, 'body')
      # TODO: pass line number to offset or remove.
      event_data.offset = 0

    else:
      event_data = SyslogLineEventData()
      event_data.body = self._GetValueFromStructure(structure, 'body')
      event_data.hostname = self._GetValueFromStructure(
          structure, 'hostname')
      # TODO: pass line number to offset or remove.
      event_data.offset = 0
      event_data.pid = self._GetValueFromStructure(structure, 'pid')
      event_data.reporter = self._GetValueFromStructure(
          structure, 'reporter')
      event_data.severity = self._GetValueFromStructure(
          structure, 'severity')

      # Offer the line to a reporter-specific plugin first; fall back to a
      # generic event if no plugin claims it.
      plugin = self._plugin_by_reporter.get(event_data.reporter, None)
      if plugin:
        attributes = {
            'body': event_data.body,
            'hostname': event_data.hostname,
            'pid': event_data.pid,
            'reporter': event_data.reporter,
            'severity': event_data.severity}

        try:
          # TODO: pass event_data instead of attributes.
          plugin.Process(parser_mediator, date_time, attributes)
        except errors.WrongPlugin:
          plugin = None

    if not plugin:
      event = time_events.DateTimeValuesEvent(
          date_time, definitions.TIME_DESCRIPTION_WRITTEN,
          time_zone=parser_mediator.timezone)
      parser_mediator.ProduceEventWithEventData(event, event_data)

  def VerifyStructure(self, parser_mediator, lines):
    """Verifies that this is a syslog-formatted file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between
          parsers and other components, such as storage and dfvfs.
      lines (str): one or more lines from the text file.

    Returns:
      bool: True if this is the correct parser, False otherwise.
    """
    return bool(self._VERIFICATION_REGEX.match(lines))
return instruction.String (string, locn=_location (src, locn)) #@pp.traceParseAction def _make_number (src, locn, toks): assert len (toks) == 1 return instruction.Number (float (toks [0]), locn=_location (src, locn)) #@pp.traceParseAction def _make_procedure (src, locn, toks): assert len (toks) == 1 return instruction.Procedure (toks [0], locn=_location (src, locn)) # Matches a number which may be negative, include a decimal point or exponential notation. # e.g. 1, -1, 3.14, 314e-2 _number = pp.Regex (r'-?\d+(\.\d*)?([eE]-?\d+)?').setParseAction (_make_number).setName ('number') _ident = pp.Word (pp.alphas, pp.alphanums + '_') _comment = pp.Regex (r'#.*').suppress () _operator = _ident.copy ().setParseAction (_make_operator).setName ('operator') _string = pp.quotedString.setParseAction (_make_string).setName ('string') _open_brace = pp.Keyword ('{').suppress () _close_brace = pp.Keyword ('}').suppress () _boolean = (pp.Keyword ('true').setParseAction (_make_true).setName ('true') ^ pp.Keyword ('false').setParseAction (_make_false).setName ('false')) _procedure = pp.Forward () operation = pp.ZeroOrMore (_comment ^ _boolean ^ _number ^ _procedure ^ _string ^ _operator) _procedure << pp.Group (_open_brace + operation + _close_brace).setName ('procedure').setParseAction (_make_procedure)
def __init__(self):
    """Build the filter-language grammar and store it on ``self.parser``."""
    # Comparison / matching operators; semantic validation happens in a
    # parse action rather than in the grammar itself.
    op_token = pp.Regex(
        r"<=|>=|<>|\!=|==|<|>|not|in|regex_partial|regex_exact|geo_box|geo_radius|geo_polygon|contains_any|contains_all|substr|contains_near|any|contains_substr|near|contains|wildcard"
    ).setName("operator").addParseAction(self.validateOperator)

    # Literal values: a number, a bracketed comma-separated number list,
    # or a double-quoted string. Order matters for alternation.
    numeric = pp.Regex(r"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?").setName(
        "number")
    numeric_list = pp.Group(
        pp.Literal('[') + numeric + pp.ZeroOrMore("," + numeric) +
        pp.Literal(']')).setName("numberList")
    quoted = pp.dblQuotedString
    literal_value = numeric | numeric_list | quoted

    # Dotted lowercase identifiers, validated via a parse action.
    field_ident = pp.Regex(
        r"[a-z][a-z_]+(?:\.[a-z][a-z_]+)*").addParseAction(
            self.validateIdentifier).setName("identifier")

    # Forward declaration for parenthesized sub-expressions.
    nested = pp.Forward()

    # Predicates.
    stream_pred = pp.Group(pp.Literal("stream") + quoted).setName("stream")
    exists_pred = pp.Group(field_ident + pp.Literal("exists")).setName(
        "exists")

    # Boolean comparison, in either operand order.
    cmp_pred = pp.Group(
        field_ident + op_token + literal_value |
        literal_value + op_token + field_ident).setName("comparison")

    predicate = cmp_pred | stream_pred | exists_pred | nested
    nested << pp.nestedExpr(content=predicate)

    # Standard boolean operator precedence: NOT > AND > OR.
    bool_expr = pp.operatorPrecedence(predicate, [
        (pp.CaselessLiteral("not"), 1, pp.opAssoc.RIGHT,),
        (pp.CaselessLiteral("AND"), 2, pp.opAssoc.LEFT,),
        (pp.CaselessLiteral("OR"), 2, pp.opAssoc.LEFT,),
    ])

    # tag "thing" { expr }
    tag_rule = pp.Group(
        pp.Literal("tag") + pp.quotedString +
        pp.nestedExpr("{", "}", bool_expr)).setName("tag")

    # return { expr }
    return_rule = pp.Group(
        pp.Literal("return") +
        pp.nestedExpr("{", "}", bool_expr)).setName("return")

    # Either a single expression, or tag [, tag, ...] followed by a return.
    grammar = bool_expr | (pp.OneOrMore(tag_rule) + return_rule)

    # Treat newlines as ordinary whitespace so rules may span lines.
    grammar.setDefaultWhitespaceChars(" \t\n\r")
    # Skip // line comments anywhere in the input.
    grammar.ignore("//" + pp.restOfLine)

    self.parser = grammar
return p.parseFile(file_name).asList() except pp.ParseException: msg = "Error Trying to parse: {} in file: {}".format(p, file_name) print(msg) raise def skip_supress(z): """Suppress stream until `z`""" return pp.Suppress(pp.SkipTo(z)) # parse utils natural = pp.Word(pp.nums) float_number = pp.Regex(r'(\-)?(\d+)?(\.)(\d*)?([eE][\-\+]\d+)?') skipLine = pp.Suppress(skip_supress('\n')) comment = pp.Suppress(pp.Literal(';')) + skipLine optional_comment = pp.ZeroOrMore(comment) word = pp.Word(pp.alphanums + "*") line = pp.Group( pp.OneOrMore(float_number | word) + pp.Optional(comment)) lines = pp.Group(pp.OneOrMore(line)) brackets = pp.Suppress("[") + word + pp.Suppress("]")
class sparc_syntax:
    """SPARC assembly syntax: pyparsing grammar plus parse actions that
    build expression/instruction objects via the `env` module."""

    divide = False
    noprefix = False

    comment = pp.Regex(r'\#.*')
    # Symbols become external references with a 32-bit size.
    symbol = pp.Regex(r'[A-Za-z_.$][A-Za-z0-9_.$]*').setParseAction(
        lambda r: env.ext(r[0], size=32))
    # Mnemonic at line start, with optional ',a' annul suffix folded in.
    mnemo = pp.LineStart() + symbol + pp.Optional(pp.Literal(',a'))
    mnemo.setParseAction(lambda r: r[0].ref.lower() + ''.join(r[1:]))

    # Integer literals in decimal, hex, octal, binary and character form.
    integer = pp.Regex(r'[1-9][0-9]*').setParseAction(lambda r: int(r[0], 10))
    hexa = pp.Regex(r'0[xX][0-9a-fA-F]+').setParseAction(
        lambda r: int(r[0], 16))
    octa = pp.Regex(r'0[0-7]*').setParseAction(lambda r: int(r[0], 8))
    bina = pp.Regex(r'0[bB][01]+').setParseAction(lambda r: int(r[0], 2))
    char = pp.Regex(r"('.)|('\\\\)").setParseAction(lambda r: ord(r[0]))
    number = integer | hexa | octa | bina | char
    # All numbers become 32-bit constants.
    number.setParseAction(lambda r: env.cst(r[0], 32))

    term = symbol | number
    exp = pp.Forward()

    # Operator sets, weakest-binding last in the precedence table below.
    op_one = pp.oneOf("- ~")
    op_sig = pp.oneOf("+ -")
    op_mul = pp.oneOf("* /")
    op_cmp = pp.oneOf("== != <= >= < > <>")
    op_bit = pp.oneOf("^ && || & |")

    operators = [
        (op_one, 1, pp.opAssoc.RIGHT),
        (op_sig, 2, pp.opAssoc.LEFT),
        (op_mul, 2, pp.opAssoc.LEFT),
        (op_cmp, 2, pp.opAssoc.LEFT),
        (op_bit, 2, pp.opAssoc.LEFT),
    ]

    # Registers are '%name' but not the %hi/%lo operators.
    reg = pp.Suppress('%') + pp.NotAny(pp.oneOf('hi lo')) + symbol
    # %hi(expr) / %lo(expr) extraction operators.
    hilo = pp.oneOf('%hi %lo') + pp.Suppress('(') + exp + pp.Suppress(')')
    exp << pp.operatorPrecedence(term | reg | hilo, operators)

    # Memory operand: [expr].
    adr = pp.Suppress('[') + exp + pp.Suppress(']')
    mem = adr  # +pp.Optional(symbol|imm)
    mem.setParseAction(lambda r: env.mem(r[0]))

    opd = exp | mem | reg
    opds = pp.Group(pp.delimitedList(opd))

    instr = mnemo + pp.Optional(opds) + pp.Optional(comment)

    def action_reg(toks):
        # Map a parsed symbol to the corresponding register object in env;
        # 'asr*' registers are constructed dynamically.
        rname = toks[0]
        if rname.ref.startswith('asr'):
            return env.reg(rname.ref)
        return env.__dict__[rname.ref]

    def action_hilo(toks):
        # %hi keeps the high part; %lo is zero-extended back to 32 bits.
        v = toks[1]
        return env.hi(v) if toks[0] == '%hi' else env.lo(v).zeroextend(32)

    def action_exp(toks):
        # Recursively fold the operatorPrecedence nested-list output into
        # env expression objects (unary: [op, r]; binary: [l, op, r]).
        tok = toks[0]
        if isinstance(tok, env.exp):
            return tok
        if len(tok) == 2:
            op = tok[0]
            r = tok[1]
            if isinstance(r, list):
                r = action_exp(r)
            return env.oper(op, r)
        elif len(tok) == 3:
            op = tok[1]
            l = tok[0]
            r = tok[2]
            if isinstance(l, list):
                l = action_exp(l)
            if isinstance(r, list):
                r = action_exp(r)
            return env.oper(op, l, r)
        else:
            return tok

    def action_instr(toks):
        # Assemble a (mnemonic, operands) instruction and hand it to the
        # architecture helper.
        i = instruction('')
        i.mnemonic = toks[0]
        if len(toks) > 1:
            i.operands = toks[1][0:]
        return asmhelper(i)

    # actions:
    reg.setParseAction(action_reg)
    hilo.setParseAction(action_hilo)
    exp.setParseAction(action_exp)
    instr.setParseAction(action_instr)
""" import pyparsing as pp pp.ParserElement.enablePackrat() LBRACE, RBRACE, LPAR, RPAR, SEMI = map(pp.Suppress, "{}();") EQ = pp.Literal('=') keywords = (WHILE, IF, PRINT, PUTC, ELSE) = map(pp.Keyword, "while if print putc else".split()) any_keyword = pp.MatchFirst(keywords) identifier = ~any_keyword + pp.pyparsing_common.identifier integer = pp.pyparsing_common.integer string = pp.QuotedString( '"', convertWhitespaceEscapes=False).setName("quoted string") char = pp.Regex(r"'\\?.'") expr = pp.infixNotation(identifier | integer | char, [ ( pp.oneOf("+ - !"), 1, pp.opAssoc.RIGHT, ), ( pp.oneOf("* / %"), 2, pp.opAssoc.LEFT, ), ( pp.oneOf("+ -"), 2,
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. import re from six.moves.urllib import parse as urllib_parse import pyparsing as pp uninary_operators = ("not", ) binary_operator = (u">=", u"<=", u"!=", u">", u"<", u"=", u"==", u"eq", u"ne", u"lt", u"gt", u"ge", u"le") multiple_operators = (u"and", u"or") operator = pp.Regex(u"|".join(binary_operator)) null = pp.Regex("None|none|null").setParseAction(pp.replaceWith(None)) boolean = "False|True|false|true" boolean = pp.Regex(boolean).setParseAction(lambda t: t[0].lower() == "true") hex_string = lambda n: pp.Word(pp.hexnums, exact=n) uuid = pp.Combine( hex_string(8) + ("-" + hex_string(4)) * 3 + "-" + hex_string(12)) number = r"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?" number = pp.Regex(number).setParseAction(lambda t: float(t[0])) identifier = pp.Word(pp.alphas, pp.alphanums + "_") quoted_string = pp.QuotedString('"') | pp.QuotedString("'") comparison_term = pp.Forward() in_list = pp.Group( pp.Suppress('[') + pp.Optional(pp.delimitedList(comparison_term)) + pp.Suppress(']'))("list") comparison_term << (null | boolean | uuid | identifier | number
# import pyparsing as pp from pyparsing import pyparsing_common as ppc pp.ParserElement.enablePackrat() COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = map( pp.Literal, ":[]{}~^") LPAR, RPAR = map(pp.Suppress, "()") and_, or_, not_, to_ = map(pp.CaselessKeyword, "AND OR NOT TO".split()) keyword = and_ | or_ | not_ | to_ expression = pp.Forward() valid_word = pp.Regex( r'([a-zA-Z0-9*_+.-]|\\\\|\\([+\-!(){}\[\]^"~*?:]|\|\||&&))+').setName( "word") valid_word.setParseAction(lambda t: t[0].replace('\\\\', chr(127)).replace( '\\', '').replace(chr(127), '\\')) string = pp.QuotedString('"') required_modifier = pp.Literal("+")("required") prohibit_modifier = pp.Literal("-")("prohibit") integer = ppc.integer() proximity_modifier = pp.Group(TILDE + integer("proximity")) number = ppc.fnumber() fuzzy_modifier = TILDE + pp.Optional(number, default=0.5)("fuzzy") term = pp.Forward() field_name = valid_word().setName("fieldname")