#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Definition of the "SmallPython" toy language built from pyparsing_ext parts."""

import math

import pyparsing as pp
from pyparsing_ext import *
from pyparsing_ext.pylang import *

# grammar
# Copy the imported operator table before extending it, so that appending
# OPUNC here does not alter the table seen by other users of pyparsing_ext.
opTable = list(arithOpTable)
OPUNC = pp.Word('$~', '0123456789+-*/^&%<>=@!~:')
opTable.append({'token': OPUNC})
smallGrammar = ProgrammingGrammarParser(
    keywords=commonKeywords,
    constants=[
        {'token': NUMBER, 'action': NumberAction},
        {'token': STRING, 'action': StringAction},
        {'token': pp.oneOf('True False'), 'action': BooleAction},
    ],
    variables=[{'token': IDEN, 'action': VariableAction}],
    operators=opTable,
    functions=[
        {'token': IDEN('function'), 'action': FunctionAction},
        {'token': (PUNC('left'), PUNC('right')), 'action': BifixAction},
    ],
)

# semantics
# Bifix (bracketing) operators, keyed by their (left, right) delimiters.
bifixDict = {('|', '|'): abs, ('[', '_]'): math.floor, ('[', '^]'): math.ceil}
# Python builtins exposed to the language under their own names.
pydict = {
    'len': len,
    'abs': abs,
    'min': min,
    'max': max,
    'str': str,
    'sum': sum,
    'tuple': tuple,
    'any': any,
    'all': all,
    'list': list,
    'dict': dict,
    'int': int,
}
# Start from a copy of the imported arithmetic dictionary so the additions
# below stay local to this language instead of mutating the shared object.
smallDict = dict(arithDict)
smallDict.update(bifixDict)
smallDict.update(pydict)

# language
smallpyLanguage = ProgrammingLanguage(
    name="SmallPython",
    grammar=smallGrammar,
    calculator=Calculator(dict_=smallDict),
)
smallpyLanguage.info = {
    'version': '0.0',
    'paths': [],
    'suffix': '.spy',
}
class TestRepetition(PyparsingExpressionTestCase):
    """Test specs for repetition expressions: ``[...]``, ``OneOrMore`` and
    ``delimited_list``.

    Each ``PpTestSpec`` pairs an expression and an input text with the
    expected token list (and, where given, the expected named results).
    """

    tests = [
        PpTestSpec(
            desc="Match several words",
            expr=(pp.Word("x") | pp.Word("y"))[...],
            text="xxyxxyyxxyxyxxxy",
            expected_list=[
                "xx", "y", "xx", "yy", "xx", "y", "x", "y", "xxx", "y"
            ],
        ),
        PpTestSpec(
            desc="Match several words, skipping whitespace",
            expr=(pp.Word("x") | pp.Word("y"))[...],
            text="x x  y xxy yxx y xyx  xxy",
            expected_list=[
                "x",
                "x",
                "y",
                "xx",
                "y",
                "y",
                "xx",
                "y",
                "x",
                "y",
                "x",
                "xx",
                "y",
            ],
        ),
        PpTestSpec(
            desc="Match several words, skipping whitespace (old style)",
            expr=pp.OneOrMore(pp.Word("x") | pp.Word("y")),
            text="x x  y xxy yxx y xyx  xxy",
            expected_list=[
                "x",
                "x",
                "y",
                "xx",
                "y",
                "y",
                "xx",
                "y",
                "x",
                "y",
                "x",
                "xx",
                "y",
            ],
        ),
        PpTestSpec(
            desc=
            "Match words and numbers - show use of results names to collect types of tokens",
            expr=(pp.Word(pp.alphas)("alpha*")
                  | pp.pyparsing_common.integer("int*"))[...],
            text="sdlfj23084ksdfs08234kjsdlfkjd0934",
            expected_list=["sdlfj", 23084, "ksdfs", 8234, "kjsdlfkjd", 934],
            expected_dict={
                "alpha": ["sdlfj", "ksdfs", "kjsdlfkjd"],
                "int": [23084, 8234, 934],
            },
        ),
        PpTestSpec(
            desc="Using delimited_list (comma is the default delimiter)",
            expr=pp.delimited_list(pp.Word(pp.alphas)),
            text="xxyx,xy,y,xxyx,yxx, xy",
            expected_list=["xxyx", "xy", "y", "xxyx", "yxx", "xy"],
        ),
        PpTestSpec(
            desc=
            "Using delimited_list (comma is the default delimiter) with trailing delimiter",
            expr=pp.delimited_list(pp.Word(pp.alphas),
                                   allow_trailing_delim=True),
            text="xxyx,xy,y,xxyx,yxx, xy,",
            expected_list=["xxyx", "xy", "y", "xxyx", "yxx", "xy"],
        ),
        PpTestSpec(
            desc="Using delimited_list, with ':' delimiter",
            expr=pp.delimited_list(pp.Word(pp.hexnums, exact=2),
                                   delim=":",
                                   combine=True),
            text="0A:4B:73:21:FE:76",
            expected_list=["0A:4B:73:21:FE:76"],
        ),
        PpTestSpec(
            # Distinct description so a failure here is not confused with the
            # preceding spec, which does not allow a trailing delimiter.
            desc="Using delimited_list, with ':' delimiter and trailing delimiter",
            expr=pp.delimited_list(
                pp.Word(pp.hexnums, exact=2),
                delim=":",
                combine=True,
                allow_trailing_delim=True,
            ),
            text="0A:4B:73:21:FE:76:",
            expected_list=["0A:4B:73:21:FE:76:"],
        ),
    ]
# Single-character (and '||') punctuation literals used by the grammar.
C_BRACKET = pp.Literal(']')
O_BRACE = pp.Literal('{')
C_BRACE = pp.Literal('}')
O_PAREN = pp.Literal('(')
C_PAREN = pp.Literal(')')
DOLLAR = pp.Literal('$')
AT = pp.Literal('@')
SLASH = pp.Literal('/')
DBL_VLINE = pp.Literal('||')

# Subtree application and testing.
# identChars lists the characters that may not sit next to the keyword, so
# '::' is not matched as a prefix of '::!' or '::?', nor '::!' of '::!?'.
S_APP = pp.Keyword('::', identChars='?!')
S_APP_EX = pp.Keyword('::!', identChars='?')
S_TEST = pp.Keyword('::?')

# A single arithmetic operator character.
ARITH = pp.Word('-+*/^%', exact=1)
NAME = pp.Word(pp.alphas)
# A name introduced by '_' (first char '_', remaining chars alphabetic).
IG_NAME = pp.Word('_', pp.alphas)
NUM = pp.Word(pp.nums + '-_d/')  # negation, formatting, decimal, and fraction
STRING = pp.dblQuotedString

# Forward declarations; PATH_VAR is defined just below.
NON_PATH_VAR = pp.Forward()
PATH_VAR = pp.Forward()

# The variable scope marker: '$' or '@'.
VAR_HEADER = pp.Group(DOLLAR | AT)
PATH_VAR << VAR_HEADER.setResultsName('VAR_SCOPE') + \
    pp.Group(pp.Keyword('..', ' .')).setResultsName('PATH_ACCESS') + \
    pp.Word(pp.alphas + pp.nums).setResultsName('VARNAME') - \
pp.Suppress('.') + _identifier('table_name') + _ignore('ADD CONSTRAINT') + _identifier('constraint_name') + _ignore('FOREIGN KEY') + pp.Suppress('(') + _identifier('column_name') + pp.Suppress(')') + _ignore('REFERENCES') + _identifier('foreign_schema') + pp.Suppress('.') + _identifier('foreign_table_name') + pp.Suppress('(') + _identifier('foreign_column_name') + pp.Suppress(')')) create_table_expr: pp.ParseExpression = ( _ignore('CREATE', 'GLOBAL', 'LOCAL', 'TEMPORARY', 'TEMP', 'UNLOGGED', 'TABLE', 'IF NOT EXISTS') + _identifier('schema') + pp.Suppress('.') + _identifier('table_name') + pp.Suppress('(') + pp.delimitedList( (pp.Suppress('CONSTRAINT' + pp.Regex(r'[^,]+'))) | pp.Group( _identifier('name') + _identifier('data_type') + pp.Suppress(pp.Optional(pp.Word(r'\[\]'))) + _ignore('WITHOUT TIME ZONE', 'WITH TIME ZONE', 'PRECISION', 'VARYING') + pp.Optional( pp.Suppress('(') + pp.Regex(r'\d+\s*,*\s*\d*') + _ignore('CHAR', 'BYTEST') + pp.Suppress(')')) + pp.Suppress( pp.Optional(pp.CaselessKeyword('DEFAULT') + 'false' | 'true')) + pp.Suppress( pp.Optional( pp.Regex( r"(?!--)(\b(COMMENT|DEFAULT)\b\s+[^,]+|([A-Za-z0-9_'\": -]|[^\x01-\x7E])*)", re.IGNORECASE, ), ), ) + pp.Suppress(pp.Optional(pp.Word(r'\[\]'))) + pp.Optional(pp.CaselessKeyword('NOT NULL')) ('not_null').setParseAction(bool), ), )('columns') + pp.Suppress(')'))
def _build_tgrep_parser(set_parse_actions=True):
    """
    Builds a pyparsing-based parser object for tokenizing and
    interpreting tgrep search strings.

    :param set_parse_actions: when true, the ``_tgrep_*`` / ``_macro_defn``
        parse actions are attached to the grammar elements; when false the
        grammar is built without any of them.
    :return: the top-level expression for one or more ``;``-separated tgrep
        expressions and macro definitions, with ``#``-to-end-of-line text
        ignored.
    """
    # Relation operator: optional negation, then an operator character
    # followed by any number of modifier characters.
    tgrep_op = pyparsing.Optional("!") + pyparsing.Regex(
        "[$%,.<>][%,.<>0-9-':]*")
    # Quoted node strings and /regex/ node patterns keep their delimiters
    # (unquoteResults=False).
    tgrep_qstring = pyparsing.QuotedString(quoteChar='"',
                                           escChar="\\",
                                           unquoteResults=False)
    tgrep_node_regex = pyparsing.QuotedString(quoteChar="/",
                                              escChar="\\",
                                              unquoteResults=False)
    # The same two forms with an "i@" prefix.
    tgrep_qstring_icase = pyparsing.Regex(
        'i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
    tgrep_node_regex_icase = pyparsing.Regex(
        "i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/")
    # A bare node literal: any run of characters that are not whitespace or
    # one of the listed punctuation characters.
    tgrep_node_literal = pyparsing.Regex("[^][ \r\t\n;:.,&|<>()$!@%'^=]+")
    # Forward declarations for the mutually recursive parts of the grammar.
    tgrep_expr = pyparsing.Forward()
    tgrep_relations = pyparsing.Forward()
    tgrep_parens = pyparsing.Literal("(") + tgrep_expr + ")"
    # "N(" followed by an optional comma separated list of integers and ")".
    tgrep_nltk_tree_pos = (pyparsing.Literal("N(") + pyparsing.Optional(
        pyparsing.Word(pyparsing.nums) + "," + pyparsing.Optional(
            pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=",")
            + pyparsing.Optional(","))) + ")")
    tgrep_node_label = pyparsing.Regex("[A-Za-z0-9]+")
    tgrep_node_label_use = pyparsing.Combine("=" + tgrep_node_label)
    # see _tgrep_segmented_pattern_action
    # The copy is taken before any parse action is set, so the two label-use
    # expressions can be given different parse actions below.
    tgrep_node_label_use_pred = tgrep_node_label_use.copy()
    macro_name = pyparsing.Regex("[^];:.,&|<>()[$!@%'^=\r\t\n ]+")
    # No whitespace is skipped before a macro name.
    macro_name.setWhitespaceChars("")
    macro_use = pyparsing.Combine("@" + macro_name)
    # Alternatives are tried in this order (first match wins).
    tgrep_node_expr = (tgrep_node_label_use_pred
                       | macro_use
                       | tgrep_nltk_tree_pos
                       | tgrep_qstring_icase
                       | tgrep_node_regex_icase
                       | tgrep_qstring
                       | tgrep_node_regex
                       | "*"
                       | tgrep_node_literal)
    # A node expression optionally followed directly (no intervening
    # whitespace) by "=" and a label.
    tgrep_node_expr2 = (
        tgrep_node_expr
        + pyparsing.Literal("=").setWhitespaceChars("")
        + tgrep_node_label.copy().setWhitespaceChars("")) | tgrep_node_expr
    tgrep_node = tgrep_parens | (pyparsing.Optional("'")
                                 + tgrep_node_expr2
                                 + pyparsing.ZeroOrMore("|" + tgrep_node_expr))
    tgrep_brackets = pyparsing.Optional("!") + "[" + tgrep_relations + "]"
    tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
    tgrep_rel_conjunction = pyparsing.Forward()
    # Relations joined by an optional "&".
    tgrep_rel_conjunction << (
        tgrep_relation
        + pyparsing.ZeroOrMore(pyparsing.Optional("&") + tgrep_rel_conjunction))
    # Conjunctions joined by "|".  ('+' binds tighter than '<<', so the whole
    # right-hand side is assigned to the Forward.)
    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
        "|" + tgrep_relations)
    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
    tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(
        tgrep_relations)
    tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(":" + tgrep_expr_labeled)
    # Macro definition: "@", required whitespace, the macro name, its body.
    macro_defn = (pyparsing.Literal("@")
                  + pyparsing.White().suppress()
                  + macro_name
                  + tgrep_expr2)
    tgrep_exprs = (
        pyparsing.Optional(macro_defn
                           + pyparsing.ZeroOrMore(";" + macro_defn)
                           + ";")
        + tgrep_expr2
        + pyparsing.ZeroOrMore(";" + (macro_defn | tgrep_expr2))
        + pyparsing.ZeroOrMore(";").suppress())
    if set_parse_actions:
        tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
        tgrep_node_label_use_pred.setParseAction(
            _tgrep_node_label_pred_use_action)
        macro_use.setParseAction(_tgrep_macro_use_action)
        tgrep_node.setParseAction(_tgrep_node_action)
        tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action)
        tgrep_parens.setParseAction(_tgrep_parens_action)
        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
        tgrep_relation.setParseAction(_tgrep_relation_action)
        tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action)
        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
        macro_defn.setParseAction(_macro_defn_action)
        # the whole expression is also the conjunction of two
        # predicates: the first node predicate, and the remaining
        # relation predicates
        tgrep_expr.setParseAction(_tgrep_conjunction_action)
        tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
        tgrep_expr2.setParseAction(
            functools.partial(_tgrep_conjunction_action, join_char=":"))
        tgrep_exprs.setParseAction(_tgrep_exprs_action)
    return tgrep_exprs.ignore("#" + pyparsing.restOfLine)
class MalformedExpression(Exception):
    """Raised to signal a malformed expression."""
    pass


class DuplicatePeriodCapture(Exception):
    """Raised when a period capture name is used more than once."""

    def __init__(self, name):
        # Forward the name to the base class so that ``args`` is populated;
        # without it the exception cannot be re-created from ``args`` (as
        # pickling and copying do) and its repr carries no information.
        super(DuplicatePeriodCapture, self).__init__(name)
        self._name = name

    def __str__(self):
        return 'Duplicate period capture name: "{}"'.format(self._name)


# common grammar elements
_e = pp.CaselessLiteral('e')
# Number: an optional sign followed by digits, an optional fractional part,
# and an optional exponent; combined into a single token.
_number = (pp.Combine(
    pp.Word('+-' + pp.nums, pp.nums) +
    pp.Optional('.' + pp.Optional(pp.Word(pp.nums))) +
    pp.Optional(_e + pp.Word('+-' + pp.nums, pp.nums))).setResultsName('number'))
# Double-quoted string with backslash as the escape character.
_quoted_string = pp.QuotedString('"', '\\').setResultsName('quoted-string')
_identifier = pp.Word(pp.alphas + '_',
                      pp.alphanums + '_').setResultsName('id')
# One literal per dynamic scope prefix defined by period.DynScope.
_tph_scope_prefix = (pp.Literal(
    period.DynScope.TPH.value).setResultsName('tph-scope-prefix'))
_spc_scope_prefix = (pp.Literal(
    period.DynScope.SPC.value).setResultsName('spc-scope-prefix'))
_seh_scope_prefix = (pp.Literal(
    period.DynScope.SEH.value).setResultsName('seh-scope-prefix'))
_sec_scope_prefix = (pp.Literal(
    period.DynScope.SEC.value).setResultsName('sec-scope-prefix'))
_ec_scope_prefix = (pp.Literal(
    period.DynScope.EC.value).setResultsName('ec-scope-prefix'))
def __parse_tc_filter_port(text):
    """Return the run of digits that follows the first ``port=`` in *text*.

    The result is the last token of the parse, i.e. the digits as a string.
    pyparsing raises its parse exception when ``port=`` or the digits are
    missing.
    """
    leading_text = pp.SkipTo("port=", include=True)
    port_digits = pp.Word(pp.nums)
    tokens = (leading_text + port_digits).parseString(text)
    return tokens[-1]
# import pyparsing as pp atomicWeight = { "O": 15.9994, "H": 1.00794, "Na": 22.9897, "Cl": 35.4527, "C": 12.0107, } digits = "0123456789" # Version 1 element = pp.Word(pp.alphas.upper(), pp.alphas.lower(), max=2) # for stricter matching, use this Regex instead # element = Regex("A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|" # "E[rsu]|F[emr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|" # "M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|" # "S[bcegimnr]?|T[abcehilm]|U(u[bhopqst])?|V|W|Xe|Yb?|Z[nr]") elementRef = pp.Group(element + pp.Optional(pp.Word(digits), default="1")) formula = elementRef[...] def sum_atomic_weights(element_list): return sum(atomicWeight[elem] * int(qty) for elem, qty in element_list) formula.runTests( """\
| min | 7 | 43 | 7 | 15 | 82 | 98 | 1 | 37 | | max | 11 | 52 | 10 | 17 | 85 | 112 | 4 | 39 | | ave | 9 | 47 | 8 | 16 | 84 | 106 | 3 | 38 | | sdev | 1 | 3 | 1 | 1 | 1 | 3 | 1 | 1 | +-------+------+------+------+------+------+------+------+------+ """

# define grammar for datatable
# The three heading lines are matched literally and dropped from the results.
heading = (
    pp.Literal(
        "+-------+------+------+------+------+------+------+------+------+")
    + "| | A1 | B1 | C1 | D1 | A2 | B2 | C2 | D2 |"
    + "+=======+======+======+======+======+======+======+======+======+"
).suppress()
vert = pp.Literal("|").suppress()
number = pp.Word(pp.nums)
# One table row: a label followed by '|'-separated numbers, all between bars.
rowData = pp.Group(vert + pp.Word(pp.alphas) + vert +
                   pp.delimitedList(number, "|") + vert)
trailing = pp.Literal(
    "+-------+------+------+------+------+------+------+------+------+"
).suppress()

# pp.Dict keys each grouped row by its first token (the row label).
datatable = heading + pp.Dict(pp.ZeroOrMore(rowData)) + trailing

# now parse data and print results
data = datatable.parseString(testData)
print(data)

# shortcut for import pprint; pprint.pprint(data.asList())
data.pprint()
def eisen_grammar():
    """Build and return the pyparsing grammar for an EisenScript file.

    The returned expression matches an optional global max-depth setting,
    then the entry calls (``entry_calls``), then the rule definitions
    (``rule_defs``).  C++ style comments and the listed ``set <word> ...``
    lines are ignored.
    """
    # Numeric fields are matched loosely as any run of digits and arithmetic
    # characters; no structure is imposed on int/float expressions, which is
    # good enough as long as the script itself is well formed.
    number = pp.Word(".+-*/()" + pp.nums)
    values = pp.OneOrMore(number).setResultsName('tvalues')

    # Geometric transforms: these are the ones that get collected.
    geom_key = pp.oneOf('x y z s rx ry rz',
                        caseless=True).setResultsName('tid')
    geom_trans = pp.Group(geom_key + values).setResultsName(
        'trans', listAllMatches=True)

    # Colour transforms are recognised so that parsing succeeds, but they
    # are not grouped under a results name.
    color_key = pp.oneOf('h hue sat b brightness a alpha m', caseless=True)
    color_trans = color_key + values
    named_color = pp.CaselessKeyword('color') + pp.Word(pp.alphanums + '#')
    blend_color = (pp.CaselessKeyword('blend')
                   + pp.Word(pp.alphanums + '#')
                   + number)

    transform = geom_trans | color_trans | named_color | blend_color

    # A rule name is an identifier that is not the keyword "rule".
    identifier = (pp.NotAny(pp.CaselessKeyword('rule'))
                  + pp.Word(pp.alphas, pp.alphanums + '_'))

    # Transformation loop: "[count *] { transforms }".
    repeat_count = number.setResultsName('count') + pp.Suppress('*')
    loop_body = (pp.Optional(repeat_count)
                 + pp.Suppress('{')
                 + pp.ZeroOrMore(transform)
                 + pp.Suppress('}'))
    loop = pp.Group(loop_body).setResultsName('loop', listAllMatches=True)

    # Rule modifiers: max depth (with optional successor rule) and weight.
    depth_key = pp.oneOf('md maxdepth', caseless=True)
    successor = pp.Optional(
        '>' + identifier.setResultsName('successor_rule'))
    depth_modifier = depth_key + number.setResultsName('md') + successor
    weight_key = pp.oneOf('w weight', caseless=True)
    weight_modifier = weight_key + number.setResultsName('wm')

    # Built-in shapes, optionally followed by a suffix of letters and ':'.
    shape_keyword = pp.oneOf(['box', 'grid', 'sphere', 'line'], caseless=True)
    shape_suffix = pp.Optional(pp.Word(pp.alphas + ':'))
    shape = pp.Combine(shape_keyword + shape_suffix)

    global_depth = (pp.CaselessKeyword('set')
                    + depth_key
                    + number.setResultsName('global_md'))

    # A call is either a built-in shape or a user rule, preceded by loops.
    builtin_call = (pp.Optional(loop)
                    + shape.setResultsName('shape')).setResultsName(
                        'bcall', listAllMatches=True)
    user_call = (pp.ZeroOrMore(loop)
                 + identifier.setResultsName('rule_name')).setResultsName(
                     'rcall', listAllMatches=True)
    call = builtin_call | user_call

    # Rule definition: "rule name [modifiers in any order] { calls }".
    rule_header = (pp.Suppress(pp.CaselessKeyword('rule'))
                   + identifier.setResultsName('name'))
    rule_modifiers = pp.Optional(depth_modifier) & pp.Optional(weight_modifier)
    rule = pp.Group(rule_header
                    + rule_modifiers
                    + pp.Suppress('{')
                    + pp.OneOrMore(call)
                    + pp.Suppress('}'))

    entry_calls = pp.Group(pp.OneOrMore(call)).setResultsName(
        'entry_calls', listAllMatches=True)
    rule_defs = pp.Group(pp.OneOrMore(rule)).setResultsName(
        'rule_defs', listAllMatches=True)

    grammar = pp.Optional(global_depth) + entry_calls + rule_defs
    grammar.ignore(pp.cppStyleComment)

    # Other "set ..." directives are skipped up to the end of their line.
    ignored_settings = pp.oneOf(
        'seed maxobjects maxsize minsize background '
        'colorpool translation rotation pivot scale '
        'raytracer syncrandom',
        caseless=True)
    grammar.ignore(pp.CaselessKeyword('set')
                   + ignored_settings
                   + pp.restOfLine)
    return grammar
import pyparsing

from miasm.expression.expression import (
    ExprInt, ExprId, ExprLoc, ExprSlice, ExprMem, ExprCond, ExprCompose,
    ExprOp, ExprAssign, LocKey)

# Decimal integer, converted to int.
integer = pyparsing.Word(pyparsing.nums)
integer.setParseAction(lambda toks: int(toks[0]))

# Hexadecimal integer: "0x" followed by hex digits, combined into one token
# and converted to int.
hex_word = pyparsing.Literal('0x') + pyparsing.Word(pyparsing.hexnums)
hex_int = pyparsing.Combine(hex_word)
hex_int.setParseAction(lambda toks: int(toks[0], 16))

# Signed integers; the hexadecimal form is tried before the decimal one.
str_int_pos = (hex_int | integer)
str_int_neg = pyparsing.Suppress('-') + (hex_int | integer)
str_int_neg.setParseAction(lambda toks: -toks[0])

str_int = str_int_pos | str_int_neg

# Expression constructor names, matched and dropped from the results.
STR_EXPRINT = pyparsing.Suppress("ExprInt")
STR_EXPRID = pyparsing.Suppress("ExprId")
STR_EXPRLOC = pyparsing.Suppress("ExprLoc")
STR_EXPRSLICE = pyparsing.Suppress("ExprSlice")
STR_EXPRMEM = pyparsing.Suppress("ExprMem")
STR_EXPRCOND = pyparsing.Suppress("ExprCond")
STR_EXPRCOMPOSE = pyparsing.Suppress("ExprCompose")
STR_EXPROP = pyparsing.Suppress("ExprOp")
STR_EXPRASSIGN = pyparsing.Suppress("ExprAssign")

# Remaining punctuation and the LocKey marker, likewise suppressed.
LOCKEY = pyparsing.Suppress("LocKey")
STR_COMMA = pyparsing.Suppress(",")
LPARENTHESIS = pyparsing.Suppress("(")
RPARENTHESIS = pyparsing.Suppress(")")
class SCCMParser(text_parser.PyparsingMultiLineTextParser):
  """Parser for Windows System Center Configuration Manager (SCCM) logs."""

  NAME = 'sccm'
  DESCRIPTION = 'Parser for SCCM logs files.'

  _ENCODING = 'utf-8-sig'

  # Increasing the buffer size as SCCM messages are commonly well larger
  # than the default value.
  BUFFER_SIZE = 16384

  LINE_STRUCTURES = []

  _FOUR_DIGITS = text_parser.PyparsingConstants.FOUR_DIGITS
  _ONE_OR_TWO_DIGITS = text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS

  # PyParsing Components used to construct grammars for parsing lines.
  _PARSING_COMPONENTS = {
      'msg_left_delimiter': pyparsing.Literal('<![LOG['),
      'msg_right_delimiter': pyparsing.Literal(']LOG]!><time="'),
      'year': _FOUR_DIGITS.setResultsName('year'),
      'month': _ONE_OR_TWO_DIGITS.setResultsName('month'),
      'day': _ONE_OR_TWO_DIGITS.setResultsName('day'),
      'fraction_of_second': pyparsing.Regex(r'\d{3,7}').setResultsName(
          'fraction_of_second'),
      'utc_offset_minutes': pyparsing.Regex(r'[-+]\d{2,3}').setResultsName(
          'utc_offset_minutes'),
      'date_prefix': pyparsing.Literal('" date="').setResultsName(
          'date_prefix'),
      'component_prefix': pyparsing.Literal('" component="').setResultsName(
          'component_prefix'),
      'component': pyparsing.Word(pyparsing.alphanums).setResultsName(
          'component'),
      # Message text: everything up to, but excluding, the right delimiter.
      'text': pyparsing.Regex(
          r'.*?(?=(]LOG]!><time="))', re.DOTALL).setResultsName('text'),
      # Remainder: everything up to the start of the next log message.
      'line_remainder': pyparsing.Regex(
          r'.*?(?=(\<!\[LOG\[))', re.DOTALL).setResultsName('line_remainder'),
      'lastline_remainder': pyparsing.restOfLine.setResultsName(
          'lastline_remainder'),
      'hour': _ONE_OR_TWO_DIGITS.setResultsName('hour'),
      'minute': text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
          'minute'),
      'second': text_parser.PyparsingConstants.TWO_DIGITS.setResultsName(
          'second')
  }

  # Base grammar for individual log event lines.
  LINE_GRAMMAR_BASE = (
      _PARSING_COMPONENTS['msg_left_delimiter'] +
      _PARSING_COMPONENTS['text'] +
      _PARSING_COMPONENTS['msg_right_delimiter'] +
      _PARSING_COMPONENTS['hour'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['minute'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['second'] +
      pyparsing.Suppress('.') + _PARSING_COMPONENTS['fraction_of_second'] +
      _PARSING_COMPONENTS['date_prefix'] + _PARSING_COMPONENTS['month'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['day'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['year'] +
      _PARSING_COMPONENTS['component_prefix'] +
      _PARSING_COMPONENTS['component'])

  # Grammar for individual log event lines with a minutes offset from UTC.
  LINE_GRAMMAR_OFFSET = (
      _PARSING_COMPONENTS['msg_left_delimiter'] +
      _PARSING_COMPONENTS['text'] +
      _PARSING_COMPONENTS['msg_right_delimiter'] +
      _PARSING_COMPONENTS['hour'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['minute'] +
      pyparsing.Suppress(':') + _PARSING_COMPONENTS['second'] +
      pyparsing.Suppress('.') + _PARSING_COMPONENTS['fraction_of_second'] +
      _PARSING_COMPONENTS['utc_offset_minutes'] +
      _PARSING_COMPONENTS['date_prefix'] + _PARSING_COMPONENTS['month'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['day'] +
      pyparsing.Suppress('-') + _PARSING_COMPONENTS['year'] +
      _PARSING_COMPONENTS['component_prefix'] +
      _PARSING_COMPONENTS['component'])

  LINE_STRUCTURES = [
      ('log_entry',
       LINE_GRAMMAR_BASE + _PARSING_COMPONENTS['line_remainder']),
      ('log_entry_at_end',
       LINE_GRAMMAR_BASE + _PARSING_COMPONENTS['lastline_remainder'] +
       pyparsing.lineEnd),
      ('log_entry_offset',
       LINE_GRAMMAR_OFFSET + _PARSING_COMPONENTS['line_remainder']),
      ('log_entry_offset_at_end',
       LINE_GRAMMAR_OFFSET + _PARSING_COMPONENTS['lastline_remainder'] +
       pyparsing.lineEnd)
  ]

  def _GetISO8601String(self, structure):
    """Retrieves an ISO8601 date time string from the structure.

    The date and time values in the SCCM log are formatted as:
    time="19:33:19.766-330" date="11-28-2014"

    The fraction of second is written with 3 digits when the log value has
    3 digits and with 6 digits when it has 6 or 7 digits, preserving leading
    zeros.

    Args:
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Returns:
      str: ISO 8601 date time string.

    Raises:
      ValueError: if the structure cannot be converted into a date time string.
    """
    fraction_of_second = self._GetValueFromStructure(
        structure, 'fraction_of_second')
    fraction_of_second_length = len(fraction_of_second)
    if fraction_of_second_length not in (3, 6, 7):
      raise ValueError(
          'unsupported time fraction of second length: {0:d}'.format(
              fraction_of_second_length))

    try:
      fraction_of_second = int(fraction_of_second, 10)
    except (TypeError, ValueError) as exception:
      raise ValueError(
          'unable to determine fraction of second with error: {0!s}'.format(
              exception))

    # TODO: improve precision support, but for now ignore the 100ns precision.
    if fraction_of_second_length == 7:
      fraction_of_second, _ = divmod(fraction_of_second, 10)

    # Number of digits to emit: the 7th (100ns) digit was dropped above.
    fraction_of_second_width = min(fraction_of_second_length, 6)

    year = self._GetValueFromStructure(structure, 'year')
    month = self._GetValueFromStructure(structure, 'month')
    day_of_month = self._GetValueFromStructure(structure, 'day')
    hours = self._GetValueFromStructure(structure, 'hour')
    minutes = self._GetValueFromStructure(structure, 'minute')
    seconds = self._GetValueFromStructure(structure, 'second')

    date_time_string = '{0:04d}-{1:02d}-{2:02d}T{3:02d}:{4:02d}:{5:02d}'.format(
        year, month, day_of_month, hours, minutes, seconds)

    # Zero-pad the fraction to its original width: formatting the integer
    # without padding would turn ".066" into ".66" and change the value.
    date_time_string = '{0:s}.{1:0{2:d}d}'.format(
        date_time_string, fraction_of_second, fraction_of_second_width)

    utc_offset_minutes = self._GetValueFromStructure(
        structure, 'utc_offset_minutes')
    if utc_offset_minutes is not None:
      try:
        # The first character is the sign, the rest the number of minutes.
        time_zone_offset = int(utc_offset_minutes[1:], 10)
      except (IndexError, ValueError) as exception:
        raise ValueError(
            'Unable to parse time zone offset with error: {0!s}.'.format(
                exception))

      time_zone_hours, time_zone_minutes = divmod(time_zone_offset, 60)
      date_time_string = '{0:s}{1:s}{2:02d}:{3:02d}'.format(
          date_time_string, utc_offset_minutes[0], time_zone_hours,
          time_zone_minutes)

    return date_time_string

  def ParseRecord(self, parser_mediator, key, structure):
    """Parse the record and return an SCCM log event object.

    An extraction warning is produced, and no event, when the date and time
    cannot be determined from the structure.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in (
        'log_entry', 'log_entry_at_end', 'log_entry_offset',
        'log_entry_offset_at_end'):
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    try:
      date_time_string = self._GetISO8601String(structure)
    except ValueError as exception:
      parser_mediator.ProduceExtractionWarning(
          'unable to determine date time string with error: {0!s}'.format(
              exception))
      # Without a date time string there is nothing to build an event from;
      # continuing would fail on the unbound date_time_string below.
      return

    fraction_of_second = self._GetValueFromStructure(
        structure, 'fraction_of_second')
    fraction_of_second_length = len(fraction_of_second)
    if fraction_of_second_length == 3:
      date_time = dfdatetime_time_elements.TimeElementsInMilliseconds()
    elif fraction_of_second_length in (6, 7):
      date_time = dfdatetime_time_elements.TimeElementsInMicroseconds()

    try:
      date_time.CopyFromStringISO8601(date_time_string)
    except ValueError as exception:
      parser_mediator.ProduceExtractionWarning(
          'unable to parse date time value: {0:s} with error: {1!s}'.format(
              date_time_string, exception))
      return

    event_data = SCCMLogEventData()
    event_data.component = self._GetValueFromStructure(
        structure, 'component')
    # TODO: pass line number to offset or remove.
    event_data.offset = 0
    event_data.text = self._GetValueFromStructure(structure, 'text')

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_WRITTEN)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def VerifyStructure(self, parser_mediator, lines):
    """Verifies whether content corresponds to an SCCM log file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      lines (str): one or more lines from the text file.

    Returns:
      bool: True if this is the correct parser, False otherwise.
    """
    # Identify the token to which we attempt a match. The match attribute
    # of a pyparsing Literal holds the literal's text.
    match = self._PARSING_COMPONENTS['msg_left_delimiter'].match

    # Because logs files can lead with a partial event,
    # we can't assume that the first character (post-BOM)
    # in the file is the beginning of our match - so we
    # look for match anywhere in lines.
    return match in lines
def _gen_parser():
    r"""
    Generates PyParsing grammar for parsing common powershell operations.

    The grammar recognises string literals, ``[char]NN`` casts, known
    ``$variables``, ``+`` concatenation, indexing, ``-replace``/``.replace()``,
    ``-f`` formatting, ``-split``/``.split()`` and ``-join``, and its parse
    actions evaluate them to the resulting string(s).

    Tests:
        >>> parser = _gen_parser()
        >>> parser.parseString("'{1} {0}'-f 'world','hello'")
        (['hello world'], {})
        >>> parser.parseString('''
        ... 'fGshellolNRfGs'-rEplaCE  ((([cHaR]108+[cHaR]78+[cHaR]82))),'!' .rePLace('fGs',[cHaR]96)''')
        (['`hello!`'], {})
        >>> parser.parseString("'ATBZCFD'-spLIT 'Z'-SPLIT'T' -spLiT 'F'")
        (['A', 'B', 'C', 'D'], {})
        >>> parser.parseString("$ENv:PuBlIc[13]")
        (['i'], {})
        >>> parser.parseString("('h', 'e', 'l', 'lo')-JOIn ''")
        (['hello'], {})
        >>> parser.parseString("'he`llo'")
        (['hello'], {})
        >>> parser.parseString("'FOtestingFO'.RePLaCE('FO','`')")
        (['`testing`'], {})
    """
    # "[char]NN" is replaced by the character with code point NN.
    char = ("[" + pp.CaselessKeyword("char") + "]" +
            pp.Word(pp.nums)("num")).setParseAction(lambda t: chr(int(t.num)))
    # A quoted lone backtick is tried first; otherwise a quoted string that
    # uses the backtick as its escape character.
    string = ((pp.Suppress("'") + "`" + pp.Suppress("'"))
              | (pp.Suppress('"') + "`" + pp.Suppress('"'))
              | pp.QuotedString("'",
                                escChar="`",
                                escQuote="''",
                                multiline=True,
                                convertWhitespaceEscapes=False)
              | pp.QuotedString('"',
                                escChar="`",
                                escQuote='""',
                                multiline=True,
                                convertWhitespaceEscapes=False))
    # "$name" for a name in _VARIABLE_LOOKUP (case-insensitive) is replaced by
    # the value stored under the lowercased name.
    variable = ("$" + pp.oneOf(_VARIABLE_LOOKUP.keys(), caseless=True)("var")
                ).setParseAction(lambda t: _VARIABLE_LOOKUP[t.var.lower()])
    # NOTE: '+' binds tighter than '|', so the inner alternation reads as
    # (Suppress(optional "[string]") + string) | char | variable.
    _string = pp.Suppress(
        pp.Optional("[" + pp.CaselessKeyword("string") + "]")) + OptionalParen(
            pp.Suppress(pp.Optional("[" + pp.CaselessKeyword("string") + "]"))
            + string | char | variable)
    # One or more string pieces joined with "+", concatenated into one token.
    concat_string = OptionalParen(
        pp.delimitedList(OptionalParen(_string),
                         delim="+").setParseAction(lambda t: "".join(t)))

    # TODO: Support ranges and other fancy indexing.
    indexing = (concat_string("data") + "[" +
                pp.delimitedList(pp.Word(pp.nums))("indices") +
                "]").setParseAction(_indexing)

    # Combine used to enforce there is no space between "c" and "replace"
    _replace_command = pp.Combine(
        pp.Optional(pp.CaselessLiteral("c")) +
        pp.CaselessKeyword("replace"))("command")
    # Data followed by one or more replace operations, each written either as
    # the "-replace old,new" operator or as the ".replace(old,new)" method
    # (whose name may itself be quoted).
    string_replace = (concat_string("data") + pp.OneOrMore(
        pp.Group((pp.Combine("-" + _replace_command) + concat_string("old") +
                  "," + concat_string("new"))
                 | ("." + ("'" + _replace_command + "'"
                           | '"' + _replace_command + '"'
                           | _replace_command) +
                    ("(" + concat_string("old") + "," + concat_string("new") +
                     ")"))))("replace")).setParseAction(_string_replace)

    # Format string followed by one or more "-f param,param,..." operations.
    string_format = (concat_string("format_string") + pp.OneOrMore(
        pp.Group(
            pp.CaselessKeyword("-f") + pp.delimitedList(concat_string)
            ("params")))("format")).setParseAction(_string_format)

    # Data followed by one or more "-split delims" / ".split(delims)" steps.
    split = (concat_string("data") + pp.OneOrMore(
        pp.Group((pp.CaselessKeyword("-split") + concat_string("delimiters"))
                 | ("." + pp.CaselessKeyword("split") + "(" +
                    concat_string("delimiters") + ")")))("split")
             ).setParseAction(_split)

    # Binary join: "list -join separator".
    join = (OptionalParen(pp.delimitedList(concat_string)("string_list")) +
            pp.CaselessKeyword("-join") + concat_string("join_string")
            ).setParseAction(lambda t: t.join_string.join(t.string_list))

    # Unary join: the listed strings are concatenated with no separator.
    join_unary = (
        (pp.CaselessKeyword("-join") | pp.CaselessKeyword("[string]::join")) +
        "(" + OptionalParen(pp.delimitedList(concat_string)("string_list")) +
        ")").setParseAction(lambda t: "".join(t.string_list))

    # Alternatives are tried in order; the plain concat_string comes last
    # because every operation above starts with one.
    # fmt: off
    poss_elements = OptionalParen(string_format | string_replace | split | join_unary | join | indexing | concat_string)
    # fmt: on

    return poss_elements
class OrNode(Node): pass # Or operator between comparison group class MultipleNode(Node): pass # GRAMMAR and_operator = pyparsing.oneOf(['and', '&'], caseless=True) or_operator = pyparsing.oneOf(['or', '|'], caseless=True) ident = pyparsing.Word(pyparsing.alphanums + '.' + '/' + ':' + '_' + '-' + '*' + '^').setParseAction(lambda t: t[0].replace('_', ' ')) # OPERATORS equal_exact = pyparsing.Keyword('==', caseless=True) # exact match equal_contains = pyparsing.Keyword('=', caseless=True).setParseAction( lambda t: '$regex') # contains match regex = pyparsing.Keyword('~', caseless=True).setParseAction( lambda t: '$regex') # regex match greater_than_equal = pyparsing.Keyword('>=', caseless=True).setParseAction( lambda t: '$gte') # greater than or equal greater_than = pyparsing.Keyword('>', caseless=True).setParseAction( lambda t: '$gt') # greater than lower_than_equal = pyparsing.Keyword('<=', caseless=True).setParseAction( lambda t: '$lte') # lower than or equal lower_than = pyparsing.Keyword('<', caseless=True).setParseAction(
class MacAppFirewallParser(text_parser.PyparsingSingleLineTextParser):
  """Parse text based on appfirewall.log file."""

  NAME = u'mac_appfirewall_log'
  DESCRIPTION = u'Parser for appfirewall.log files.'

  ENCODING = u'utf-8'

  # Define how a log line should look like.
  # Example: 'Nov 2 04:07:35 DarkTemplar-2.local socketfilterfw[112] '
  #          '<Info>: Dropbox: Allow (in:0 out:2)'
  # INFO: process_name is going to have a white space at the beginning.

  DATE_TIME = pyparsing.Group(
      text_parser.PyparsingConstants.THREE_LETTERS.setResultsName(u'month') +
      text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS.setResultsName(u'day') +
      text_parser.PyparsingConstants.TIME_ELEMENTS)

  FIREWALL_LINE = (
      DATE_TIME.setResultsName(u'date_time') +
      pyparsing.Word(pyparsing.printables).setResultsName(u'computer_name') +
      pyparsing.Word(pyparsing.printables).setResultsName(u'agent') +
      pyparsing.Literal(u'<').suppress() +
      pyparsing.CharsNotIn(u'>').setResultsName(u'status') +
      pyparsing.Literal(u'>:').suppress() +
      pyparsing.CharsNotIn(u':').setResultsName(u'process_name') +
      pyparsing.Literal(u':') +
      pyparsing.SkipTo(pyparsing.lineEnd).setResultsName(u'action'))

  # Repeated line.
  # Example: Nov 29 22:18:29 --- last message repeated 1 time ---

  REPEATED_LINE = (
      DATE_TIME.setResultsName(u'date_time') +
      pyparsing.Literal(u'---').suppress() +
      pyparsing.CharsNotIn(u'---').setResultsName(u'process_name') +
      pyparsing.Literal(u'---').suppress())

  LINE_STRUCTURES = [
      (u'logline', FIREWALL_LINE),
      (u'repeated', REPEATED_LINE)]

  def __init__(self):
    """Initializes a parser object."""
    super(MacAppFirewallParser, self).__init__()
    self._last_month = 0
    self._previous_structure = None
    self._year_use = 0

  def _GetTimeElementsTuple(self, structure):
    """Retrieves a time elements tuple from the structure.

    The log lines carry no year, so the year is tracked in self._year_use and
    incremented when the month is lower than the last month seen.

    Args:
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Returns:
      tuple: contains:
        year (int): year.
        month (int): month, where 1 represents January.
        day_of_month (int): day of month, where 1 is the first day of the month.
        hours (int): hours.
        minutes (int): minutes.
        seconds (int): seconds.
    """
    month, day, hours, minutes, seconds = structure.date_time

    # Note that dfdatetime_time_elements.TimeElements will raise ValueError
    # for an invalid month.
    month = timelib.MONTH_DICT.get(month.lower(), 0)

    if month != 0 and month < self._last_month:
      # Gap detected between years.
      self._year_use += 1

    return (self._year_use, month, day, hours, minutes, seconds)

  def _ParseLogLine(self, parser_mediator, structure, key):
    """Parse a single log line and produce an event object.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.
      key (str): identifier of the structure of tokens.
    """
    time_elements_tuple = self._GetTimeElementsTuple(structure)

    try:
      date_time = dfdatetime_time_elements.TimeElements(
          time_elements_tuple=time_elements_tuple)
    except ValueError:
      parser_mediator.ProduceExtractionError(
          u'invalid date time value: {0!s}'.format(structure.date_time))
      return

    self._last_month = time_elements_tuple[1]

    # If the actual entry is a repeated entry, we take the basic information
    # from the previous entry, but using the timestamp from the actual entry.
    if key == u'logline':
      self._previous_structure = structure
    elif self._previous_structure is None:
      # A "repeated" line with nothing before it has no entry to repeat;
      # without this guard the attribute reads below would fail on None.
      parser_mediator.ProduceExtractionError(
          u'repeated line without a previous log line')
      return
    else:
      structure = self._previous_structure

    # Pyparsing reads in RAW, but the text is in UTF8.
    try:
      action = structure.action.decode(u'utf-8')
    except UnicodeDecodeError:
      logging.warning(
          u'Decode UTF8 failed, the message string may be cut short.')
      action = structure.action.decode(u'utf-8', u'ignore')

    event_data = MacAppFirewallLogEventData()
    event_data.action = action
    event_data.agent = structure.agent
    event_data.computer_name = structure.computer_name
    # Due to the use of CharsNotIn pyparsing structure contains whitespaces
    # that need to be removed.
    event_data.process_name = structure.process_name.strip()
    event_data.status = structure.status

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_ADDED)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a log record structure and produces events.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): identifier of the structure of tokens.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in (u'logline', u'repeated'):
      raise errors.ParseError(
          u'Unable to parse record, unknown structure: {0:s}'.format(key))

    self._ParseLogLine(parser_mediator, structure, key)

  def VerifyStructure(self, parser_mediator, line):
    """Verify that this file is a Mac AppFirewall log file.

    The first line must match FIREWALL_LINE with status "Error" and the
    action "creating /var/log/appfirewall.log", and carry a valid date.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      line (bytes): line from a text file.

    Returns:
      bool: True if the line is in the expected format, False if not.
    """
    self._last_month = 0
    self._year_use = parser_mediator.GetEstimatedYear()

    try:
      structure = self.FIREWALL_LINE.parseString(line)
    except pyparsing.ParseException as exception:
      logging.debug((
          u'Unable to parse file as a Mac AppFirewall log file with error: '
          u'{0:s}').format(exception))
      return False

    if structure.action != u'creating /var/log/appfirewall.log':
      logging.debug(
          u'Not a Mac AppFirewall log file, invalid action: {0!s}'.format(
              structure.action))
      return False

    if structure.status != u'Error':
      logging.debug(
          u'Not a Mac AppFirewall log file, invalid status: {0!s}'.format(
              structure.status))
      return False

    time_elements_tuple = self._GetTimeElementsTuple(structure)

    try:
      dfdatetime_time_elements.TimeElements(
          time_elements_tuple=time_elements_tuple)
    except ValueError:
      logging.debug((
          u'Not a Mac AppFirewall log file, invalid date and time: '
          u'{0!s}').format(structure.date_time))
      return False

    self._last_month = time_elements_tuple[1]

    return True
def symbol(init_chars: str) -> pp.ParserElement:
    """Build a parser for a symbol that is not a reserved symbol.

    The symbol is an optional leading underscore followed by a word whose
    first character comes from ``init_chars`` and whose remaining characters
    are alphanumerics or underscores; both parts are combined into one token.
    """
    leading_underscore = pp.Optional('_')
    word = pp.Word(init_chars, pp.alphanums + '_')
    combined = pp.Combine(leading_underscore + word)
    # Negative lookahead: refuse to match where a reserved symbol starts.
    return ~ReservedSymbols + combined
import pyparsing
import re
import functools

# Code generation specific libraries
import common

#-----------------------------------
# Definitions
#-----------------------------------

# Default value used when a named token has not been found, and thus not set.
TokenNotSet = ''

# Identifier: a letter followed by any number of letters, digits or
# underscores.
Identifier = pyparsing.Word(pyparsing.alphas, pyparsing.alphanums + '_')
# Type identifier: an identifier optionally followed by ".<identifier>",
# combined into a single token.
TypeIdentifier = pyparsing.Combine(
    Identifier + pyparsing.Optional(pyparsing.Literal('.') + Identifier))

# Literals. All except Pointer are suppressed, i.e. matched but left out of
# the parse results.
Semicolon = pyparsing.Literal(';').suppress()
Pointer = pyparsing.Literal('*')
OpenBracket = pyparsing.Literal('[').suppress()
CloseBracket = pyparsing.Literal(']').suppress()
DotDot = pyparsing.Literal('..').suppress()
Equal = pyparsing.Literal('=').suppress()

# Direction keywords; the keyword text comes from the common module.
KeywordDirIn = pyparsing.Keyword(common.DIR_IN)
KeywordDirOut = pyparsing.Keyword(common.DIR_OUT)
Direction = KeywordDirIn | KeywordDirOut
import sys, re # standard string format (used if no format is explicitely given) STANDARD_STRING_FORMAT = "[$__REALTIME_TIMESTAMP] $MESSAGE" """ GRAMMAR """ # types of patterns, annotated PAT_NR, PAT_NE, PAT_LT, PAT_ST, PAT_LE, PAT_SE, PAT_STR, PAT_STAR, PAT_AVAIL, PAT_NEG, PAT_REG = [ 'PAT_%s' % s for s in 'NR NR_NE NR_LT NR_ST NR_LE NR_SE STR STAR AVAIL NEG REG'.split() ] # Basic types hexadecimal = (pp.Suppress('0x') + pp.Word(pp.hexnums)).setParseAction(lambda x: int(x[0], 16)) octogonal = ('0' + pp.Optional( pp.Word("01234567"))).setParseAction(lambda x: int("".join(x), 8)) decimal = pp.Word(pp.nums).setParseAction(lambda x: int(x[0], 10)) string = pp.QuotedString('"', escChar='\\', unquoteResults=True) | pp.QuotedString( "'", escChar='\\', unquoteResults=True) # Pattern types pat_number = (hexadecimal | octogonal | decimal).setParseAction(lambda x: (PAT_NR, x[0])) pat_ne = (pp.Suppress('!=') + (hexadecimal | octogonal | decimal) ).setParseAction(lambda x: (PAT_NE, x[0])) pat_lt = (pp.Suppress('>') + (hexadecimal | octogonal | decimal) ).setParseAction(lambda x: (PAT_LT, x[0])) pat_st = (pp.Suppress('<') + (hexadecimal | octogonal | decimal)
def __parse_tc_filter_network(text):
    """Return the value that follows the first ``network=`` in ``text``.

    The value is a run of alphanumerics, dots and slashes. The underlying
    ``parseString`` call raises if ``network=`` or a value is missing.
    """
    skip_to_key = pp.SkipTo("network=", include=True)
    network_value = pp.Word(pp.alphanums + "./")
    tokens = (skip_to_key + network_value).parseString(text)
    # The value is the last token; everything before it is skipped text.
    return tokens[-1]
boolean = pyparsing.Regex(boolean).setParseAction( lambda t: t[0].lower()[0] == "t") quoted_string = pyparsing.QuotedString('"', escChar="\\") unquoted_string = pyparsing.OneOrMore( pyparsing.CharsNotIn(" ,=\\") + pyparsing.Optional( pyparsing.OneOrMore(( pyparsing.Literal("\\ ") | pyparsing.Literal("\\,") | pyparsing.Literal("\\=") | pyparsing.Literal("\\") ).setParseAction(lambda s, loc, tok: tok[0][-1])))).setParseAction( lambda s, loc, tok: "".join(list(tok))) measurement = tag_key = tag_value = field_key = quoted_string | unquoted_string number = r"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?" number = pyparsing.Regex(number).setParseAction( lambda s, loc, tok: float(tok[0])) integer = (pyparsing.Word( pyparsing.nums).setParseAction(lambda s, loc, tok: int(tok[0])) + pyparsing.Suppress("i")) field_value = integer | number | quoted_string timestamp = pyparsing.Word(pyparsing.nums).setParseAction( lambda s, loc, tok: numpy.datetime64(int(tok[0]), 'ns')) line_protocol = ( measurement + # Tags pyparsing.Optional(pyparsing.Suppress(",") + pyparsing.delimitedList( pyparsing.OneOrMore( pyparsing.Group(tag_key + pyparsing.Suppress("=") + tag_value), ",")).setParseAction(lambda s, loc, tok: dict(list(tok))), default={}) + pyparsing.Suppress(" ") + # Fields pyparsing.delimitedList(
def __init__(self, obj, config=str()):
    """Parse the Enum plugin config string into ``input`` and ``mapping``.

    The config is expected to look like::

        input = '@text1', mapping = 'low = 1, medium = 2, high = 3'

    Args:
        obj: passed through unchanged to the parent initializer.
        config (str): the plugin config string described above.

    Raises:
        ValueError: if the config is not a non-empty string, or lacks the
            ``input`` or ``mapping`` keyword, or a mapping item is not of
            the form ``key = value``.
        IndexError: if the config does not contain exactly one
            input/mapping pair.
    """
    super(Enum, self).__init__(obj, config=config)

    # Check the type before the length so that a non-string config raises
    # the intended ValueError rather than a TypeError from len().
    if not isinstance(self.config, str) or len(self.config) < 1:
        raise ValueError('Enum plugin function requires a config string')

    inputkeyword = 'input'
    mappingkeyword = 'mapping'

    if inputkeyword not in self.config:
        raise ValueError(
            'A input keyword argument must be specified for the Enum plugin function'
        )

    if mappingkeyword not in self.config:
        raise ValueError(
            'A mapping keyword argument must be specified for the Enum plugin function'
        )

    # '@name' or '!name' in single quotes; quotes and prefix are dropped.
    attrexpr = p.Combine(
        p.Literal("'").suppress() +
        (p.Literal('@') | p.Literal('!')).suppress() + p.Word(p.alphanums) +
        p.Literal("'").suppress())
    inputexpr = p.CaselessKeyword(inputkeyword).suppress() + p.Literal(
        '=').suppress() + attrexpr
    mappingexpr = p.CaselessKeyword(mappingkeyword).suppress() + p.Literal(
        '=').suppress() + p.sglQuotedString()
    expr = inputexpr + p.Literal(',').suppress() + mappingexpr

    self.input = None
    self.mapping = None

    _matches = list(expr.scanString(self.config))

    if len(_matches) > 1:
        raise IndexError(
            'There should only be one input and mapping keyword set in the Enum plugin function\'s config but %s was received'
            % _matches)

    if not _matches:
        # Previously surfaced as a bare "list index out of range".
        raise IndexError(
            'No input and mapping keyword set could be parsed from the Enum plugin function\'s config'
        )

    # scanString yields (tokens, start, end); tokens are [input, mapping].
    tokens = _matches[0][0]

    mappingdict = {}
    # tokens[1] still carries its surrounding single quotes; strip them.
    for mapitem in tokens[1][1:-1].split(','):
        k, v = mapitem.split('=')
        mappingdict[k.strip()] = v.strip()

    self.input = tokens[0]
    self.mapping = pt.trie()

    # Keys are stored lower-cased in the trie.
    for k, v in mappingdict.items():
        self.mapping[k.lower()] = v
import pyparsing as pp

from bling import ast

# Forward declaration; the definition is not part of this section.
expression = pp.Forward()

# The keyword "null", converted by ast.Null.
null = pp.Keyword('null').setParseAction(ast.Null)

# The keywords "true" / "false", converted by ast.Boolean.
boolean = pp.Keyword('true') | pp.Keyword('false')
boolean.setParseAction(ast.Boolean)

# Number: optional minus sign, an integer part that is either "0" or starts
# with a non-zero digit, an optional fraction and an optional exponent, all
# combined into one token before ast.Number is applied.
number = pp.Combine(
    pp.Optional('-') + ('0' | pp.Word('123456789', pp.nums)) +
    pp.Optional('.' + pp.Word(pp.nums)) +
    pp.Optional(pp.Word('eE', exact=1) + pp.Word(pp.nums + '+-', pp.nums)))
number.setParseAction(ast.Number)

# A single hex digit, doubled before conversion (e.g. "f" -> 0xff).
nibble = pp.Word(
    pp.hexnums,
    exact=1).setParseAction(lambda tokens: int(tokens[0] + tokens[0], 16))
# Two hex digits converted to an int.
byte = pp.Word(pp.hexnums, exact=2).setParseAction(
    lambda tokens: int(tokens[0], 16))
# "#" followed by three bytes, or failing that three nibbles.
hex_rgb = pp.Suppress('#') + (byte * 3 | nibble * 3)
# NOTE(review): setParseAction presumably returns the expression itself, so
# color and hex_rgb would be the same object -- confirm against pyparsing.
color = hex_rgb.setParseAction(ast.Color)

# Any literal value; alternatives are tried in the order listed.
literal = null | boolean | number | color


def identifier():
    """Return a new parser matching a run of letters and underscores."""
    return pp.Word(pp.alphas + "_")


# An identifier converted by ast.Reference.
reference = identifier().setParseAction(ast.Reference)
def _identifier(name: str) -> pp.ParserElement:
    """Build a parser for an optionally double-quoted identifier.

    The identifier itself (alphanumerics and underscores) is stored under
    the results name ``name``; the quote characters are suppressed.

    The return annotation is ``pp.ParserElement`` because the result is a
    composite expression built with ``+``, not a single terminal token.
    """
    # NOTE(review): the opening and closing quotes are optional
    # independently, so an unbalanced quote (e.g. ``"foo``) also matches.
    return (pp.Optional(pp.Suppress('"')) +
            pp.Word(pp.alphanums + '_')(name) +
            pp.Optional(pp.Suppress('"')))
def identifier():
    """Return a new parser matching a run of letters and underscores."""
    allowed_chars = pp.alphas + "_"
    return pp.Word(allowed_chars)
import pyparsing as pp

from pydbml.definitions.generic import name
from pydbml.definitions.common import _, _c, end, note, note_object
from pydbml.definitions.column import table_column
from pydbml.definitions.index import indexes
from pydbml.classes import Table

# Newlines are not treated as skippable whitespace by expressions created
# after this call.
pp.ParserElement.setDefaultWhitespaceChars(' \t\r')

# "as <name>", with "as" required to be a whole word.
alias = pp.WordStart() + pp.Literal('as').suppress() - pp.WordEnd() - name

# "#" followed by exactly three or six hex digits, with no whitespace.
hex_char = pp.Word(pp.srange('[0-9a-fA-F]'), exact=1)
hex_color = ("#" - (hex_char * 3 ^ hex_char * 6)).leaveWhitespace()
header_color = (
    pp.CaselessLiteral('headercolor:').suppress() + _
    - pp.Combine(hex_color)('header_color')
)
table_setting = _ + (note('note') | header_color) + _
table_settings = '[' + table_setting + (',' + table_setting)[...] + ']'


def parse_table_settings(s, l, t):
    """Collect the named table settings from parse results into a dict.

    Example input: ``[headercolor: #cccccc, note: 'note']``

    Only the ``note`` and ``header_color`` results names are copied, and
    only when they are present in ``t``.
    """
    return {key: t[key] for key in ('note', 'header_color') if key in t}
def _parse_ios_interfaces(data,
                          acls_as_list=True,
                          auto_cleanup=True,
                          skip_disabled=True):
    """
    Walks through a IOS interface config and returns a dict of parts.

    Intended for use by `~trigger.cmds.NetACLInfo.ios_parse()` but was written
    to be portable.

    :param data:
        The interface configuration text to parse.

    :param acls_as_list:
        Whether each ACL name is wrapped in a list (``['ABC123']``) rather
        than returned as a bare string (``'ABC123'``). (Default: ``True``)

    :param auto_cleanup:
        Whether you want to pass results through cleanup_results().
        (Default: ``True``)

    :param skip_disabled:
        Whether to skip disabled interfaces. (Default: ``True``)

    :returns:
        The cleaned-up results when ``auto_cleanup`` is true, otherwise the
        raw parse results (an empty dict if parsing failed).
    """
    import pyparsing as pp

    # Setup
    bang = pp.Literal("!").suppress()
    anychar = pp.Word(pp.printables)

    # Foundry devices emit a lot of non-interface output, for example:
    #   !
    #   Startup-config data location is flash memory
    #   !
    #   ver 07.5.05hT53
    #   !
    #   module 1 bi-0-port-m4-management-module
    # Rather than enumerate every such line, skip ahead to the next
    # "interface" keyword.
    interface_keyword = pp.Keyword("interface")
    unwanted = pp.SkipTo(interface_keyword, include=False).suppress()

    octet = pp.Word(pp.nums, max=3)
    ipaddr = pp.Combine(octet + "." + octet + "." + octet + "." + octet)
    address = ipaddr
    netmask = ipaddr
    cidr = pp.Literal("/").suppress() + pp.Word(pp.nums, max=2)

    # Description
    desc_keyword = pp.Keyword("description")
    description = pp.Dict(pp.Group(desc_keyword + pp.Group(pp.restOfLine)))

    # Addresses
    # cisco example:
    #   ip address 172.29.188.27 255.255.255.224 secondary
    #
    # foundry example:
    #   ip address 10.62.161.187/26
    ipaddr_keyword = pp.Keyword("ip address").suppress()
    secondary = pp.Literal("secondary").suppress()

    # foundry matches on cidr and cisco matches on netmask
    # netmask converted to cidr in cleanup
    ip_tuple = pp.Group(address + (cidr ^ netmask)).setResultsName(
        'addr', listAllMatches=True)
    negotiated = pp.Literal('negotiated')  # Seen on Cisco 886
    ip_address = ipaddr_keyword + (negotiated
                                   ^ ip_tuple) + pp.Optional(secondary)

    addrs = pp.ZeroOrMore(ip_address)

    # ACLs
    acl_keyword = pp.Keyword("ip access-group").suppress()

    # acl_name to be [''] or '' depending on acls_as_list
    acl_name = pp.Group(anychar) if acls_as_list else anychar
    direction = pp.oneOf('in out').suppress()
    # The lookahead picks the result key ('acl_in' / 'acl_out') from the
    # direction that follows the ACL name, without consuming the name.
    acl_in = acl_keyword + pp.FollowedBy(acl_name + pp.Literal('in'))
    acl_in.setParseAction(pp.replaceWith('acl_in'))
    acl_out = acl_keyword + pp.FollowedBy(acl_name + pp.Literal('out'))
    acl_out.setParseAction(pp.replaceWith('acl_out'))

    acl = pp.Dict(pp.Group((acl_in ^ acl_out) + acl_name)) + direction
    acls = pp.ZeroOrMore(acl)

    # Interfaces
    iface_keyword = pp.Keyword("interface").suppress()
    foundry_awesome = pp.Literal(" ").suppress() + anychar
    # foundry example:
    #   !
    #   interface ethernet 6/6
    #    ip access-group 126 in
    #    ip address 172.18.48.187 255.255.255.255
    #
    # cisco example:
    #   !
    #   interface Port-channel1
    #    description gear1-mtc : AE1 : iwslbfa1-mtc-sw0 :  : 1x1000 : 172.20.166.0/24 :  :  :
    #    ip address 172.20.166.251 255.255.255.0
    interface = pp.Combine(anychar + pp.Optional(foundry_awesome))

    # foundry's body is acl then ip and cisco's is ip then acl
    iface_body = (pp.Optional(description) + pp.Optional(acls) +
                  pp.Optional(addrs) + pp.Optional(acls))

    iface_info = (pp.Optional(unwanted) + iface_keyword +
                  pp.Dict(pp.Group(interface + iface_body)) +
                  pp.Optional(pp.SkipTo(bang)))

    interfaces = pp.Dict(pp.ZeroOrMore(iface_info))

    # This is where the parsing is actually happening
    try:
        results = interfaces.parseString(data)
    except Exception:
        # Parsing is best-effort: any failure yields empty results. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        results = {}

    if auto_cleanup:
        return _cleanup_interface_results(results, skip_disabled=skip_disabled)
    return results
class TestCommonHelperExpressions(PyparsingExpressionTestCase):
    """Test specs for pyparsing's common helper expressions.

    Each spec pairs an expression and input text with the expected token
    list and, where results names are used, the expected named results.
    """

    tests = [
        PpTestSpec(
            desc="A comma-delimited list of words",
            expr=pp.delimited_list(pp.Word(pp.alphas)),
            text="this, that, blah,foo, bar",
            expected_list=["this", "that", "blah", "foo", "bar"],
        ),
        PpTestSpec(
            desc="A counted array of words",
            expr=pp.Group(pp.counted_array(pp.Word("ab")))[...],
            text="2 aaa bbb 0 3 abab bbaa abbab",
            expected_list=[["aaa", "bbb"], [], ["abab", "bbaa", "abbab"]],
        ),
        PpTestSpec(
            desc="skipping comments with ignore",
            expr=(pp.pyparsing_common.identifier("lhs") + "=" +
                  pp.pyparsing_common.fnumber("rhs")).ignore(
                      pp.cpp_style_comment),
            text="abc_100 = /* value to be tested */ 3.1416",
            expected_list=["abc_100", "=", 3.1416],
            expected_dict={
                "lhs": "abc_100",
                "rhs": 3.1416
            },
        ),
        PpTestSpec(
            desc=
            "some pre-defined expressions in pyparsing_common, and building a dotted identifier with delimted_list",
            # Uses the snake_case helper name, like the first spec.
            expr=(pp.pyparsing_common.number("id_num") + pp.delimited_list(
                pp.pyparsing_common.identifier, ".", combine=True)("name") +
                  pp.pyparsing_common.ipv4_address("ip_address")),
            text="1001 www.google.com 192.168.10.199",
            expected_list=[1001, "www.google.com", "192.168.10.199"],
            expected_dict={
                "id_num": 1001,
                "name": "www.google.com",
                "ip_address": "192.168.10.199",
            },
        ),
        PpTestSpec(
            desc=
            "using one_of (shortcut for Literal('a') | Literal('b') | Literal('c'))",
            expr=pp.one_of("a b c")[...],
            text="a b a b b a c c a b b",
            expected_list=[
                "a", "b", "a", "b", "b", "a", "c", "c", "a", "b", "b"
            ],
        ),
        PpTestSpec(
            desc="parsing nested parentheses",
            expr=pp.nested_expr(),
            text="(a b (c) d (e f g ()))",
            expected_list=[["a", "b", ["c"], "d", ["e", "f", "g", []]]],
        ),
        PpTestSpec(
            desc="parsing nested braces",
            expr=(pp.Keyword("if") + pp.nested_expr()("condition") +
                  pp.nested_expr("{", "}")("body")),
            text='if ((x == y) || !z) {printf("{}");}',
            expected_list=[
                "if",
                [["x", "==", "y"], "||", "!z"],
                ["printf(", '"{}"', ");"],
            ],
            expected_dict={
                "condition": [[["x", "==", "y"], "||", "!z"]],
                "body": [["printf(", '"{}"', ");"]],
            },
        ),
    ]
class XChatLogParser(text_parser.PyparsingSingleLineTextParser):
  """Parse XChat log files.

  Body lines carry no year, so the year is taken from the most recent
  "BEGIN" header (kept in self._xchat_year) and incremented when the month
  of a body line is lower than the last month seen. While no year is set,
  body lines are ignored.
  """

  NAME = 'xchatlog'
  DESCRIPTION = 'Parser for XChat log files.'

  _ENCODING = 'utf-8'

  # Common (header/footer/body) pyparsing structures.
  # TODO: Only English ASCII timestamp supported ATM, add support for others.
  _WEEKDAY = pyparsing.Group(
      pyparsing.Keyword('Sun') |
      pyparsing.Keyword('Mon') |
      pyparsing.Keyword('Tue') |
      pyparsing.Keyword('Wed') |
      pyparsing.Keyword('Thu') |
      pyparsing.Keyword('Fri') |
      pyparsing.Keyword('Sat'))

  # Header/footer pyparsing structures.
  # Sample: "**** BEGIN LOGGING AT Mon Dec 31 21:11:55 2011".
  # Note that "BEGIN LOGGING" text is localized (default, English) and can be
  # different if XChat locale is different.

  _HEADER_SIGNATURE = pyparsing.Keyword('****')
  _HEADER_DATE_TIME = pyparsing.Group(
      _WEEKDAY.setResultsName('weekday') +
      text_parser.PyparsingConstants.THREE_LETTERS.setResultsName('month') +
      text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS.setResultsName(
          'day') +
      text_parser.PyparsingConstants.TIME_ELEMENTS +
      text_parser.PyparsingConstants.FOUR_DIGITS.setResultsName('year'))
  # The three words after the signature, e.g. "BEGIN LOGGING AT".
  _LOG_ACTION = pyparsing.Group(
      pyparsing.Word(pyparsing.printables) +
      pyparsing.Word(pyparsing.printables) +
      pyparsing.Word(pyparsing.printables))
  _HEADER = (
      _HEADER_SIGNATURE.suppress() + _LOG_ACTION.setResultsName('log_action') +
      _HEADER_DATE_TIME.setResultsName('date_time'))

  # Body (nickname, text and/or service messages) pyparsing structures.
  # Sample: "dec 31 21:11:58 <fpi> ola plas-ing guys!".

  _DATE_TIME = pyparsing.Group(
      text_parser.PyparsingConstants.THREE_LETTERS.setResultsName('month') +
      text_parser.PyparsingConstants.ONE_OR_TWO_DIGITS.setResultsName(
          'day') +
      text_parser.PyparsingConstants.TIME_ELEMENTS)
  # Nickname enclosed in angle brackets, e.g. "<fpi>".
  _NICKNAME = pyparsing.QuotedString(
      '<', endQuoteChar='>').setResultsName('nickname')
  _LOG_LINE = (
      _DATE_TIME.setResultsName('date_time') +
      pyparsing.Optional(_NICKNAME) +
      pyparsing.SkipTo(pyparsing.lineEnd).setResultsName('text'))

  # Note that ParseRecord assumes these are tried in exactly this order.
  LINE_STRUCTURES = [
      ('logline', _LOG_LINE),
      ('header', _HEADER),
      ('header_signature', _HEADER_SIGNATURE),
  ]

  def __init__(self):
    """Initializes a parser object."""
    super(XChatLogParser, self).__init__()
    self._last_month = 0
    # Year of the current logging session; None until a BEGIN header is seen.
    self._xchat_year = None
    self.offset = 0

  def _GetTimeElementsTuple(self, structure):
    """Retrieves a time elements tuple from the structure.

    Increments self._xchat_year when the month is lower than the last month
    seen.

    Args:
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Returns:
      tuple: containing:
        year (int): year.
        month (int): month, where 1 represents January.
        day_of_month (int): day of month, where 1 is the first day of the month.
        hours (int): hours.
        minutes (int): minutes.
        seconds (int): seconds.
    """
    month, day, hours, minutes, seconds = structure.date_time

    # Unknown month names map to 0.
    month = timelib.MONTH_DICT.get(month.lower(), 0)

    if month != 0 and month < self._last_month:
      # Gap detected between years.
      self._xchat_year += 1

    return (self._xchat_year, month, day, hours, minutes, seconds)

  def _ParseHeader(self, parser_mediator, structure):
    """Parses a log header.

    A "BEGIN" header sets the session year, an "END" header clears it. Any
    other log action is logged at debug level and produces no event.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.
    """
    # The first element (the weekday) is not used.
    _, month, day, hours, minutes, seconds, year = structure.date_time

    month = timelib.MONTH_DICT.get(month.lower(), 0)

    time_elements_tuple = (year, month, day, hours, minutes, seconds)

    try:
      date_time = dfdatetime_time_elements.TimeElements(
          time_elements_tuple=time_elements_tuple)
      date_time.is_local_time = True
    except ValueError:
      parser_mediator.ProduceExtractionError(
          'invalid date time value: {0!s}'.format(structure.date_time))
      return

    self._last_month = month

    event_data = XChatLogEventData()

    if structure.log_action[0] == 'BEGIN':
      self._xchat_year = year
      event_data.text = 'XChat start logging'

    elif structure.log_action[0] == 'END':
      self._xchat_year = None
      event_data.text = 'XChat end logging'

    else:
      logger.debug('Unknown log action: {0:s}.'.format(
          ' '.join(structure.log_action)))
      return

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_ADDED,
        time_zone=parser_mediator.timezone)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def _ParseLogLine(self, parser_mediator, structure):
    """Parses a log line.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.
    """
    # No session year is known: it is None initially and after an END
    # header, and 0 after an unrecognized (localized) header.
    if not self._xchat_year:
      return

    time_elements_tuple = self._GetTimeElementsTuple(structure)

    try:
      date_time = dfdatetime_time_elements.TimeElements(
          time_elements_tuple=time_elements_tuple)
      date_time.is_local_time = True
    except ValueError:
      parser_mediator.ProduceExtractionError(
          'invalid date time value: {0!s}'.format(structure.date_time))
      return

    self._last_month = time_elements_tuple[1]

    event_data = XChatLogEventData()
    event_data.nickname = structure.nickname
    # The text string contains multiple unnecessary whitespaces that need to
    # be removed, thus the split and re-join.
    event_data.text = ' '.join(structure.text.split())

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_ADDED,
        time_zone=parser_mediator.timezone)
    parser_mediator.ProduceEventWithEventData(event, event_data)

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a log record structure and produces events.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): identifier of the structure of tokens.
      structure (pyparsing.ParseResults): structure of tokens derived from
          a line of a text file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in ('header', 'header_signature', 'logline'):
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    if key == 'logline':
      self._ParseLogLine(parser_mediator, structure)

    elif key == 'header':
      self._ParseHeader(parser_mediator, structure)

    elif key == 'header_signature':
      # If this key is matched (after others keys failed) we got a different
      # localized header and we should stop parsing until a new good header
      # is found. Parsing is stopped by setting xchat_year to 0.
      # Note that the code assumes that LINE_STRUCTURES will be used in the
      # exact order as defined!
      logger.warning('Unknown locale header.')
      self._xchat_year = 0

  def VerifyStructure(self, parser_mediator, line):
    """Verify that this file is a XChat log file.

    The line must match the header structure and carry a valid date and time.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      line (str): line from a text file.

    Returns:
      bool: True if the line is in the expected format, False if not.
    """
    try:
      structure = self._HEADER.parseString(line)
    except pyparsing.ParseException:
      logger.debug('Not a XChat log file')
      return False

    # The first element (the weekday) is not used.
    _, month, day, hours, minutes, seconds, year = structure.date_time

    month = timelib.MONTH_DICT.get(month.lower(), 0)

    time_elements_tuple = (year, month, day, hours, minutes, seconds)

    try:
      dfdatetime_time_elements.TimeElements(
          time_elements_tuple=time_elements_tuple)
    except ValueError:
      logger.debug('Not a XChat log file, invalid date and time: {0!s}'.format(
          structure.date_time))
      return False

    return True
class PopularityContestParser(text_parser.PyparsingSingleLineTextParser):
  """Parse popularity contest log files."""

  NAME = u'popularity_contest'
  DESCRIPTION = u'Parser for popularity contest log files.'

  _ASCII_PRINTABLES = pyparsing.printables
  # Every non-whitespace character with a code point below 65536.
  _UNICODE_PRINTABLES = u''.join(
      unichr(character) for character in xrange(65536)
      if not unichr(character).isspace())

  MRU = pyparsing.Word(_UNICODE_PRINTABLES).setResultsName(u'mru')
  PACKAGE = pyparsing.Word(_ASCII_PRINTABLES).setResultsName(u'package')
  TAG = pyparsing.QuotedString(u'<', endQuoteChar=u'>').setResultsName(u'tag')
  TIMESTAMP = text_parser.PyparsingConstants.INTEGER.setResultsName(
      u'timestamp')

  HEADER = (
      pyparsing.Literal(u'POPULARITY-CONTEST-').suppress() +
      text_parser.PyparsingConstants.INTEGER.setResultsName(u'session') +
      pyparsing.Literal(u'TIME:').suppress() + TIMESTAMP +
      pyparsing.Literal(u'ID:').suppress() +
      pyparsing.Word(pyparsing.alphanums, exact=32).setResultsName(u'id') +
      pyparsing.SkipTo(pyparsing.LineEnd()).setResultsName(u'details'))

  FOOTER = (
      pyparsing.Literal(u'END-POPULARITY-CONTEST-').suppress() +
      text_parser.PyparsingConstants.INTEGER.setResultsName(u'session') +
      pyparsing.Literal(u'TIME:').suppress() + TIMESTAMP)

  LOG_LINE = (
      TIMESTAMP.setResultsName(u'atime') +
      TIMESTAMP.setResultsName(u'ctime') +
      (PACKAGE + TAG | PACKAGE + MRU + pyparsing.Optional(TAG)))

  LINE_STRUCTURES = [
      (u'logline', LOG_LINE),
      (u'header', HEADER),
      (u'footer', FOOTER),
  ]

  _ENCODING = u'UTF-8'

  def _ParseLogLine(self, parser_mediator, structure):
    """Produces events for the access and change times of a log line.

    Lines without an <mru> value are ignored. One event is produced for
    <atime> and one for <ctime>, in that order, each only when non-zero.

    Args:
      parser_mediator: A parser mediator object (instance of ParserMediator).
      structure: the log line structure object (instance of
                 pyparsing.ParseResults).
    """
    # Required fields are <mru> and <atime> and we are not interested in
    # log lines without <mru>.
    if not structure.mru:
      return

    # The <atime> field (as <ctime>) is always present but could be 0.
    # A zero timestamp (the <NOFILES> case) is skipped without logging.
    # TODO: not doing any check on <tag> fields, even if only informative
    # probably it could be better to check for the expected values.
    timestamps = (
        (structure.atime, eventdata.EventTimestamp.ACCESS_TIME),
        (structure.ctime, eventdata.EventTimestamp.ENTRY_MODIFICATION_TIME))

    for timestamp, description in timestamps:
      if not timestamp:
        continue
      event_object = PopularityContestEvent(
          timestamp, description, structure.package, structure.mru,
          tag=structure.tag)
      parser_mediator.ProduceEvent(event_object)

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a log record structure and produces events.

    Args:
      parser_mediator: A parser mediator object (instance of ParserMediator).
      key: An identification string indicating the name of the parsed
           structure.
      structure: A pyparsing.ParseResults object from a line in the
                 log file.
    """
    # TODO: Add anomaly objects for abnormal timestamps, such as when the log
    # timestamp is greater than the session start.
    if key == u'logline':
      self._ParseLogLine(parser_mediator, structure)
      return

    if key == u'header':
      if not structure.timestamp:
        logging.debug(
            u'PopularityContestParser, header with invalid timestamp.')
        return

      event_object = PopularityContestSessionEvent(
          structure.timestamp, u'{0!s}'.format(structure.session), u'start',
          details=structure.details, hostid=structure.id)
      parser_mediator.ProduceEvent(event_object)
      return

    if key == u'footer':
      if not structure.timestamp:
        logging.debug(
            u'PopularityContestParser, footer with invalid timestamp.')
        return

      event_object = PopularityContestSessionEvent(
          structure.timestamp, u'{0!s}'.format(structure.session), u'end')
      parser_mediator.ProduceEvent(event_object)
      return

    logging.warning(
        u'PopularityContestParser, unknown structure: {0:s}.'.format(key))

  def VerifyStructure(self, parser_mediator, line):
    """Verify that this file is a Popularity Contest log file.

    Args:
      parser_mediator: A parser mediator object (instance of ParserMediator).
      line: A single line from the text file.

    Returns:
      True if this is the correct parser, False otherwise.
    """
    try:
      header_struct = self.HEADER.parseString(line)
    except pyparsing.ParseException:
      logging.debug(u'Not a Popularity Contest log file, invalid header')
      return False

    if timelib.Timestamp.FromPosixTime(header_struct.timestamp):
      return True

    logging.debug(u'Invalid Popularity Contest log file header timestamp.')
    return False
class SSHSyslogPlugin(interface.SyslogPlugin):
  """A plugin for creating events from syslog message produced by SSH.

  Three message shapes are recognised, each with its own grammar: an accepted
  login, a failed authentication and an opened connection.
  """

  NAME = 'ssh'
  DATA_FORMAT = 'SSH syslog line'
  REPORTER = 'sshd'

  _AUTHENTICATION_METHOD = (
      pyparsing.Keyword('password') | pyparsing.Keyword('publickey'))

  # Named grammar fragments shared by the message grammars below; each one
  # stores its match under a results name equal to its dictionary key.
  _PYPARSING_COMPONENTS = {
      'address': text_parser.PyparsingConstants.IP_ADDRESS.setResultsName(
          'address'),
      'authentication_method': _AUTHENTICATION_METHOD.setResultsName(
          'authentication_method'),
      'fingerprint': pyparsing.Combine(
          pyparsing.Literal('RSA ') +
          pyparsing.Word(':' + pyparsing.hexnums)).setResultsName(
              'fingerprint'),
      'port': pyparsing.Word(pyparsing.nums, max=5).setResultsName('port'),
      'protocol': pyparsing.Literal('ssh2').setResultsName('protocol'),
      'username': pyparsing.Word(pyparsing.alphanums).setResultsName(
          'username'),
  }

  # "Accepted <method> for <user> from <address> port <port> ssh2", with an
  # optional ": RSA <fingerprint>" suffix.
  _LOGIN_GRAMMAR = (
      pyparsing.Literal('Accepted') +
      _PYPARSING_COMPONENTS['authentication_method'] +
      pyparsing.Literal('for') +
      _PYPARSING_COMPONENTS['username'] +
      pyparsing.Literal('from') +
      _PYPARSING_COMPONENTS['address'] +
      pyparsing.Literal('port') +
      _PYPARSING_COMPONENTS['port'] +
      _PYPARSING_COMPONENTS['protocol'] +
      pyparsing.Optional(
          pyparsing.Literal(':') +
          _PYPARSING_COMPONENTS['fingerprint']) +
      pyparsing.StringEnd())

  # "Failed <method> for <user> from <address> port <port>".
  _FAILED_CONNECTION_GRAMMAR = (
      pyparsing.Literal('Failed') +
      _PYPARSING_COMPONENTS['authentication_method'] +
      pyparsing.Literal('for') +
      _PYPARSING_COMPONENTS['username'] +
      pyparsing.Literal('from') +
      _PYPARSING_COMPONENTS['address'] +
      pyparsing.Literal('port') +
      _PYPARSING_COMPONENTS['port'] +
      pyparsing.StringEnd())

  # "Connection from <address> port <port>".
  _OPENED_CONNECTION_GRAMMAR = (
      pyparsing.Literal('Connection from') +
      _PYPARSING_COMPONENTS['address'] +
      pyparsing.Literal('port') +
      _PYPARSING_COMPONENTS['port'] +
      pyparsing.LineEnd())

  # Pairs of (grammar key, grammar); the keys are the values _ParseMessage
  # accepts.
  MESSAGE_GRAMMARS = [
      ('login', _LOGIN_GRAMMAR),
      ('failed_connection', _FAILED_CONNECTION_GRAMMAR),
      ('opened_connection', _OPENED_CONNECTION_GRAMMAR),
  ]

  def _ParseMessage(self, parser_mediator, key, date_time, tokens):
    """Produces an event from a syslog body that matched one of the grammars.

    The event data type is selected by the grammar key, its attributes are
    filled from the tokens (missing tokens become None) and a single written
    time event is produced through the parser mediator.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the matching grammar.
      date_time (dfdatetime.DateTimeValues): date and time values.
      tokens (dict[str, str]): tokens derived from a syslog message based on
          the defined grammar.

    Raises:
      ValueError: If an unknown key is provided.
    """
    if key == 'login':
      event_data = SSHLoginEventData()
    elif key == 'failed_connection':
      event_data = SSHFailedConnectionEventData()
    elif key == 'opened_connection':
      event_data = SSHOpenedConnectionEventData()
    else:
      # Format with !s rather than :s so that a non-string key (for example
      # None) still results in the documented ValueError instead of a
      # TypeError raised by the format specification.
      raise ValueError('Unknown grammar key: {0!s}'.format(key))

    event_data.address = tokens.get('address')
    event_data.authentication_method = tokens.get('authentication_method')
    event_data.body = tokens.get('body')
    event_data.fingerprint = tokens.get('fingerprint')
    event_data.hostname = tokens.get('hostname')
    event_data.pid = tokens.get('pid')
    event_data.protocol = tokens.get('protocol')
    event_data.port = tokens.get('port')
    event_data.reporter = tokens.get('reporter')
    event_data.severity = tokens.get('severity')
    event_data.username = tokens.get('username')

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_WRITTEN,
        time_zone=parser_mediator.timezone)
    parser_mediator.ProduceEventWithEventData(event, event_data)