def VerifyFile(self, file_object):
    """Determines whether a file-like object looks like a PLSRecall.dat file.

    Args:
      file_object: file that we want to check.

    Returns:
      True if this is a valid PLSRecall.dat file, otherwise False.
    """
    file_object.seek(0, os.SEEK_SET)

    # The file consists of equally sized (4125 bytes) TRecallRecord
    # structures. Note that the query value is free form.
    try:
        record = self.PLS_STRUCT.parse_stream(file_object)
    except (IOError, construct.FieldError):
        return False

    # Sanity check a few values of the parsed record.
    try:
        record_timestamp = timelib.Timestamp.FromDelphiTime(record.TimeStamp)
    except ValueError:
        return False

    if record_timestamp <= 0:
        return False

    # Reject timestamps that are more than six years into the future. Six
    # years is an arbitrary bound, there is no guarantee it catches everything.
    # TODO: Add a check for similarly valid value back in time. Maybe if the
    # timestamp is before 1980 we are pretty sure it is invalid?
    # TODO: This is a very flaky assumption. Find a better one.
    latest_plausible_timestamp = (
        timelib.Timestamp.GetNow() + self._SIX_YEARS_IN_MICRO_SECONDS)
    if record_timestamp > latest_plausible_timestamp:
        return False

    # TODO: Add other verification checks here. For instance make sure
    # that the query actually looks like a SQL query. This structure produces
    # a lot of false positives and thus we need additional verification to
    # make sure we are not parsing non-PLSRecall files. Another check might
    # be to make sure the username looks legitimate, or the sequence number,
    # or the database name.
    # For now all three fields need to pass the "is this a text" test.
    for field_name in ('Username', 'Query', 'Database'):
        if not utils.IsText(getattr(record, field_name)):
            return False

    # The first word of the query field has to be one of the allowed
    # query keywords.
    leading_word = record.Query.partition(b' ')[0]
    return leading_word.lower() in self._PLS_KEYWORD
def VerifyFile(self, file_object):
    """Determines whether a file-like object looks like a PLSRecall.dat file.

    Args:
      file_object: file that we want to check.

    Returns:
      True if this is a valid PLSRecall.dat file, otherwise False.
    """
    file_object.seek(0, os.SEEK_SET)

    # The file consists of equally sized (4125 bytes) TRecallRecord
    # structures. Note that the query value is free form.
    try:
        record = self.PLS_STRUCT.parse_stream(file_object)
    except (IOError, construct.FieldError):
        return False

    # Sanity check a few values of the parsed record.
    try:
        record_timestamp = timelib.Timestamp.FromDelphiTime(record.TimeStamp)
    except ValueError:
        return False

    if record_timestamp <= 0:
        return False

    # TODO: Add other verification checks here. For instance make sure
    # that the query actually looks like a SQL query. This structure produces
    # a lot of false positives and thus we need additional verification to
    # make sure we are not parsing non-PLSRecall files. Another check might
    # be to make sure the username looks legitimate, or the sequence number,
    # or the database name.
    # For now all three fields need to pass the "is this a text" test.
    for field_name in ('Username', 'Query', 'Database'):
        if not utils.IsText(getattr(record, field_name)):
            return False

    return True
def testIsText(self):
    """Checks IsText against known text and non-text inputs."""
    # Each entry pairs the assertion to apply with the input handed to
    # IsText; the entries are evaluated in the order listed.
    expectations = (
        (self.assertTrue, 'thisi My Weird ASCII and non whatever string.'),
        (self.assertTrue, u'Plaso Síar Og Raðar Þessu'),
        (self.assertFalse, '\x01\62LSO\xFF'),
        (self.assertTrue, 'T\x00h\x00i\x00s\x00\x20\x00'),
        (self.assertTrue, 'Ascii\x00'),
        (self.assertFalse, 'Ascii Start then...\x00\x99\x23'),
    )

    for check, bytes_in in expectations:
        check(utils.IsText(bytes_in))
def testIsText(self):
    """Checks the IsText function against known text and non-text inputs."""
    # Each entry pairs the assertion to apply with the input handed to
    # IsText; the entries are evaluated in the order listed.
    expectations = (
        (self.assertTrue, b'this is My Weird ASCII and non whatever string.'),
        (self.assertTrue, 'Plaso Síar Og Raðar Þessu'),
        (self.assertFalse, b'\x01\\62LSO\xFF'),
        (self.assertTrue, b'T\x00h\x00i\x00s\x00\x20\x00'),
        (self.assertTrue, b'Ascii\x00'),
        (self.assertFalse, b'Ascii Open then...\x00\x99\x23'),
    )

    for check, bytes_in in expectations:
        check(utils.IsText(bytes_in))
def _ReadRecord(self, text_file_object, max_line_length=0): """Return a single record from an Opera global_history file. A single record consists of four lines, with each line as: Title of page (or the URL if not there). Website URL. Timestamp in POSIX time. Popularity index (-1 if first time visited). Args: text_file_object: A text file object (instance of dfvfs.TextFile). max_line_length: An integer that denotes the maximum byte length for each line read. Returns: A tuple of: title, url, timestamp, popularity_index. Raises: errors.NotAText: If the file being read is not a text file. """ if max_line_length: title_raw = text_file_object.readline(max_line_length) if len(title_raw) == max_line_length and not title_raw.endswith('\n'): return None, None, None, None if not utils.IsText(title_raw): raise errors.NotAText(u'Title line is not a text.') title = title_raw.strip() else: title = text_file_object.readline().strip() if not title: return None, None, None, None url = text_file_object.readline().strip() if not url: return None, None, None, None timestamp_line = text_file_object.readline().strip() popularity_line = text_file_object.readline().strip() try: timestamp = int(timestamp_line, 10) except ValueError: if len(timestamp_line) > 30: timestamp_line = timestamp_line[0:30] logging.debug(u'Unable to read in timestamp [{!r}]'.format( timestamp_line)) return None, None, None, None try: popularity_index = int(popularity_line, 10) except ValueError: try: logging.debug(u'Unable to read in popularity index[{}]'.format( popularity_line)) except UnicodeDecodeError: logging.debug( u'Unable to read in popularity index [unable to print ' u'bad line]') return None, None, None, None # Try to get the data into unicode. try: title_unicode = title.decode('utf-8') except UnicodeDecodeError: partial_title = title.decode('utf-8', 'ignore') title_unicode = u'Warning: partial line, starts with: {}'.format( partial_title) return title_unicode, url, timestamp, popularity_index
def Parse(self, parser_context, file_entry):
    """Parse a text file using a pyparsing definition.

    Parsed events are not returned; every event is handed to
    parser_context.ProduceEvent().

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).

    Raises:
      UnableToParseFile: when no line structures are declared, the buffered
                         data is not text or VerifyStructure() rejects it.
    """
    self.file_entry = file_entry

    # Validate the parser definition before opening the file so that this
    # failure cannot leave an open file object behind.
    if not self.LINE_STRUCTURES:
        raise errors.UnableToParseFile(
            u'Line structure undeclared, unable to proceed.')

    file_object = file_entry.GetFileObject()
    try:
        file_object.seek(0, os.SEEK_SET)

        self._buffer = ''
        self._FillBuffer(file_object)

        if not utils.IsText(self._buffer):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        if not self.VerifyStructure(parser_context, self._buffer):
            raise errors.UnableToParseFile('Wrong file structure.')

        # Set the offset to the beginning of the file.
        self._current_offset = 0

        # Consume the buffer until no more data is available.
        while self._buffer:
            # Initialize pyparsing objects.
            tokens = None
            start = 0
            end = 0
            structure_key = None

            # Try to parse the buffer using all the line structures.
            for key, structure in self.LINE_STRUCTURES:
                try:
                    parsed_structure = next(
                        structure.scanString(self._buffer, maxMatches=1),
                        None)
                except pyparsing.ParseException:
                    continue

                if not parsed_structure:
                    continue

                tokens, start, end = parsed_structure

                # Only want to parse the structure if it starts
                # at the beginning of the buffer.
                if start == 0:
                    structure_key = key
                    break

            if tokens and not start:
                parsed_event = self.ParseRecord(
                    parser_context, structure_key, tokens)
                if parsed_event:
                    parsed_event.offset = self._current_offset
                    parser_context.ProduceEvent(
                        parsed_event, parser_name=self.NAME,
                        file_entry=file_entry)

                # Drop the matched data from the front of the buffer.
                self._current_offset += end
                self._buffer = self._buffer[end:]
            else:
                # Nothing matched at the start of the buffer: skip a line.
                old_line = self._NextLine(file_object)
                if old_line:
                    logging.warning(
                        u'Unable to parse log line: {0:s}'.format(
                            repr(old_line)))

            # Re-fill the buffer.
            self._FillBuffer(file_object)
    finally:
        # Release the file object on every path, including the
        # UnableToParseFile errors raised above.
        file_object.close()
def Parse(self, parser_context, file_entry):
    """Extract data from a text file using a pyparsing definition.

    Parsed events are not returned; every event is handed to
    parser_context.ProduceEvent().

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).

    Raises:
      UnableToParseFile: when no line structures are declared, no first line
                         can be read, the first line is not text or
                         VerifyStructure() rejects it.
    """
    # TODO: find a more elegant way for this; currently the mac_wifi and
    # syslog parser seem to rely on this member.
    self.file_entry = file_entry

    # TODO: self._line_structures is a work-around and this needs
    # a structural fix.
    # Checked before the file is opened so that this failure cannot leave
    # an open file object behind.
    if not self._line_structures:
        raise errors.UnableToParseFile(
            u'Line structure undeclared, unable to proceed.')

    file_object = file_entry.GetFileObject()
    try:
        file_object.seek(0, os.SEEK_SET)
        text_file_object = text_file.TextFile(file_object)

        line = self._ReadLine(
            parser_context, file_entry, text_file_object,
            max_len=self.MAX_LINE_LENGTH, quiet=True)
        if not line:
            raise errors.UnableToParseFile(u'Not a text file.')

        if len(line) == self.MAX_LINE_LENGTH or len(
                line) == self.MAX_LINE_LENGTH - 1:
            logging.debug((
                u'Trying to read a line and reached the maximum allowed length of '
                u'{0:d}. The last few bytes of the line are: {1:s} [parser '
                u'{2:s}]').format(
                    self.MAX_LINE_LENGTH, repr(line[-10:]), self.NAME))

        if not utils.IsText(line):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        if not self.VerifyStructure(parser_context, line):
            raise errors.UnableToParseFile('Wrong file structure.')

        # Set the offset to the beginning of the file.
        self._current_offset = 0

        # Read every line in the text file.
        while line:
            parsed_structure = None
            use_key = None

            # Try to parse the line using all the line structures.
            for key, structure in self.LINE_STRUCTURES:
                try:
                    parsed_structure = structure.parseString(line)
                except pyparsing.ParseException:
                    pass
                if parsed_structure:
                    use_key = key
                    break

            if parsed_structure:
                parsed_event = self.ParseRecord(
                    parser_context, use_key, parsed_structure)
                if parsed_event:
                    parsed_event.offset = self._current_offset
                    parser_context.ProduceEvent(
                        parsed_event, parser_name=self.NAME,
                        file_entry=file_entry)
            else:
                logging.warning(
                    u'Unable to parse log line: {0:s}'.format(line))

            self._current_offset = text_file_object.get_offset()
            line = self._ReadLine(parser_context, file_entry, text_file_object)
    finally:
        # Release the file object on every path, including the
        # UnableToParseFile errors raised above.
        file_object.close()
def Parse(self, parser_context, file_entry):
    """Extract data from a text file.

    Parsed events are not returned; every event is handed to
    parser_context.ProduceEvent().

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).

    Raises:
      UnableToParseFile: when the first 40 bytes are not text, too many
                         errors occur before a line was parsed successfully
                         or no line could be parsed at all.
    """
    path_spec_printable = u'{0:s}:{1:s}'.format(
        file_entry.path_spec.type_indicator, file_entry.name)
    file_object = file_entry.GetFileObject()

    try:
        self.file_entry = file_entry
        # TODO: this is necessary since we inherit from
        # lexer.SelfFeederMixIn.
        self.file_object = file_object

        # Start by checking, is this a text file or not? Before we proceed
        # any further.
        file_object.seek(0, os.SEEK_SET)
        if not utils.IsText(file_object.read(40)):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        file_object.seek(0, os.SEEK_SET)

        error_count = 0
        file_verified = False

        # We need to clear out few values in the Lexer before continuing.
        # There might be some leftovers from previous run.
        self.error = 0
        self.buffer = ''

        while True:
            _ = self.NextToken()

            if self.state == 'INITIAL':
                self.entry_offset = getattr(self, 'next_entry_offset', 0)
                self.next_entry_offset = file_object.tell() - len(self.buffer)

            # Give up when the lexer error count reaches twice MAX_LINES
            # before any line was parsed successfully.
            if not file_verified and self.error >= self.MAX_LINES * 2:
                logging.debug(
                    u'Lexer error count: {0:d} and current state {1:s}'.format(
                        self.error, self.state))
                raise errors.UnableToParseFile(
                    u'[{0:s}] unsupported file: {1:s}.'.format(
                        self.NAME, path_spec_printable))

            if self.line_ready:
                try:
                    event_object = self.ParseLine(parser_context)
                    parser_context.ProduceEvent(
                        event_object, parser_name=self.NAME,
                        file_entry=file_entry)
                    file_verified = True

                except errors.TimestampNotCorrectlyFormed as exception:
                    error_count += 1
                    if file_verified:
                        logging.debug(
                            u'[{0:s} VERIFIED] Error count: {1:d} and ERROR: {2:d}'
                            .format(path_spec_printable, error_count,
                                    self.error))
                        # The "!s" conversion is used since a format
                        # specification such as ":s" is not supported by
                        # exception objects on Python 3.
                        logging.warning(
                            u'[{0:s}] Unable to parse timestamp with error: {1!s}'
                            .format(self.NAME, exception))

                    else:
                        logging.debug((
                            u'[{0:s} EVALUATING] Error count: {1:d} and ERROR: '
                            u'{2:d})').format(
                                path_spec_printable, error_count, self.error))

                        if error_count >= self.MAX_LINES:
                            raise errors.UnableToParseFile(
                                u'[{0:s}] unsupported file: {1:s}.'.format(
                                    self.NAME, path_spec_printable))

                finally:
                    self.ClearValues()

            if self.Empty():
                # Try to fill the buffer to prevent the parser from ending
                # prematurely.
                self.Feed()

            if self.Empty():
                break

        if not file_verified:
            raise errors.UnableToParseFile(
                u'[{0:s}] unable to parse file: {1:s}.'.format(
                    self.NAME, path_spec_printable))

        file_offset = file_object.get_offset()
        if file_offset < file_object.get_size():
            logging.error((
                u'{0:s} prematurely terminated parsing: {1:s} at offset: '
                u'0x{2:08x}.').format(
                    self.NAME, path_spec_printable, file_offset))
    finally:
        # Single point of release: the file object is closed exactly once
        # on success and on every error path.
        file_object.close()
def ParseFileObject(self, parser_mediator, file_object, **kwargs):
    """Parses a text file-like object line by line with pyparsing.

    Args:
      parser_mediator: a parser mediator object (instance of ParserMediator).
      file_object: a file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    file_entry = parser_mediator.GetFileEntry()
    # TODO: find a more elegant way for this; currently the mac_wifi and
    # syslog parser seem to rely on this member.
    self.file_entry = file_entry

    # TODO: self._line_structures is a work-around and this needs
    # a structural fix.
    if not self._line_structures:
        raise errors.UnableToParseFile(
            u'Line structure undeclared, unable to proceed.')

    text_file_object = text_file.TextFile(file_object)

    line = self._ReadLine(
        parser_mediator, file_entry, text_file_object,
        max_len=self.MAX_LINE_LENGTH, quiet=True)
    if not line:
        raise errors.UnableToParseFile(u'Not a text file.')

    # A first line at (or one short of) the maximum length is only logged.
    if len(line) in (self.MAX_LINE_LENGTH, self.MAX_LINE_LENGTH - 1):
        line_tail = repr(line[-10:])
        logging.debug((
            u'Trying to read a line and reached the maximum allowed length of '
            u'{0:d}. The last few bytes of the line are: {1:s} [parser '
            u'{2:s}]').format(self.MAX_LINE_LENGTH, line_tail, self.NAME))

    if not utils.IsText(line):
        raise errors.UnableToParseFile(
            u'Not a text file, unable to proceed.')

    if not self.VerifyStructure(parser_mediator, line):
        raise errors.UnableToParseFile(u'Wrong file structure.')

    # Offsets are tracked starting at the beginning of the file.
    self._current_offset = 0

    # Process the file one line at a time.
    while line:
        matched_key = None
        matched_structure = None

        # The first line structure that yields a non-empty result wins.
        for key, structure in self.LINE_STRUCTURES:
            try:
                candidate = structure.parseString(line)
            except pyparsing.ParseException:
                continue

            if candidate:
                matched_key = key
                matched_structure = candidate
                break

        if not matched_structure:
            printable_line = line
            if len(printable_line) > 80:
                printable_line = u'{0:s}...'.format(printable_line[0:77])
            parser_mediator.ProduceParseError(
                u'Unable to parse log line: {0:s} at offset {1:d}'.format(
                    repr(printable_line), self._current_offset))
        else:
            parsed_event = self.ParseRecord(
                parser_mediator, matched_key, matched_structure)
            if parsed_event:
                parsed_event.offset = self._current_offset
                parser_mediator.ProduceEvent(parsed_event)

        self._current_offset = text_file_object.get_offset()
        line = self._ReadLine(parser_mediator, file_entry, text_file_object)
def ParseFileObject(self, parser_mediator, file_object, **kwargs):
    """Parses a text file-like object using a pyparsing definition.

    Records are handed to ParseRecord(); data that cannot be parsed is
    reported through parser_mediator.ProduceParseError().

    Args:
      parser_mediator: a parser mediator object (instance of ParserMediator).
      file_object: a file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    if not self.LINE_STRUCTURES:
        raise errors.UnableToParseFile(u'Missing line structures.')

    self._text_reader.Reset()

    try:
        self._text_reader.ReadLines(file_object)
    except UnicodeDecodeError as exception:
        # The "!s" conversion is used since a format specification such as
        # ":s" is not supported by exception objects on Python 3, which
        # would replace this error by a TypeError.
        raise errors.UnableToParseFile(
            u'Not a text file, with error: {0!s}'.format(exception))

    if not utils.IsText(self._text_reader.lines):
        raise errors.UnableToParseFile(
            u'Not a text file, unable to proceed.')

    if not self.VerifyStructure(parser_mediator, self._text_reader.lines):
        raise errors.UnableToParseFile(u'Wrong file structure.')

    # Using parseWithTabs() overrides Pyparsing's default replacement of tabs
    # with spaces to SkipAhead() the correct number of bytes after a match.
    for _, structure in self.LINE_STRUCTURES:
        structure.parseWithTabs()

    # Read every line in the text file.
    while self._text_reader.lines:
        # Initialize pyparsing objects.
        tokens = None
        start = 0
        end = 0

        key = None

        # Try to parse the line using all the line structures.
        for key, structure in self.LINE_STRUCTURES:
            try:
                parsed_structure = next(
                    structure.scanString(
                        self._text_reader.lines, maxMatches=1), None)
            except pyparsing.ParseException:
                continue

            if not parsed_structure:
                continue

            tokens, start, end = parsed_structure

            # Only want to parse the structure if it starts
            # at the beginning of the buffer.
            if start == 0:
                break

        if tokens and start == 0:
            self.ParseRecord(parser_mediator, key, tokens)
            self._text_reader.SkipAhead(file_object, end)

        else:
            odd_line = self._text_reader.ReadLine(file_object)
            if odd_line:
                # Long lines are shortened to keep the error readable.
                if len(odd_line) > 80:
                    odd_line = u'{0:s}...'.format(odd_line[0:77])
                parser_mediator.ProduceParseError(
                    u'Unable to parse log line: {0:s}'.format(
                        repr(odd_line)))

        try:
            self._text_reader.ReadLines(file_object)
        except UnicodeDecodeError as exception:
            parser_mediator.ProduceParseError(
                u'Unable to read lines from file with error: {0!s}'.format(
                    exception))
def ParseFileObject(self, parser_mediator, file_object, **kwargs):
    """Parses a text file-like object using a pyparsing definition.

    Events returned by ParseRecord() are handed to
    parser_mediator.ProduceEvent().

    Args:
      parser_mediator: A parser mediator object (instance of ParserMediator).
      file_object: A file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    if not self.LINE_STRUCTURES:
        raise errors.UnableToParseFile(u'Missing line structures.')

    self._text_reader.Reset()

    try:
        self._text_reader.ReadLines(file_object)
    except UnicodeDecodeError as exception:
        # The "!s" conversion is used since a format specification such as
        # ":s" is not supported by exception objects on Python 3, which
        # would replace this error by a TypeError.
        raise errors.UnableToParseFile(
            u'Not a text file, with error: {0!s}'.format(exception))

    if not utils.IsText(self._text_reader.lines):
        raise errors.UnableToParseFile(
            u'Not a text file, unable to proceed.')

    if not self.VerifyStructure(parser_mediator, self._text_reader.lines):
        raise errors.UnableToParseFile(u'Wrong file structure.')

    # Read every line in the text file.
    while self._text_reader.lines:
        # Initialize pyparsing objects.
        tokens = None
        start = 0
        end = 0

        key = None

        # Try to parse the line using all the line structures.
        for key, structure in self.LINE_STRUCTURES:
            try:
                parsed_structure = next(
                    structure.scanString(
                        self._text_reader.lines, maxMatches=1), None)
            except pyparsing.ParseException:
                continue

            if not parsed_structure:
                continue

            tokens, start, end = parsed_structure

            # Only want to parse the structure if it starts
            # at the beginning of the buffer.
            if start == 0:
                break

        if tokens and start == 0:
            parsed_event = self.ParseRecord(parser_mediator, key, tokens)
            if parsed_event:
                # TODO: need a reliable way to handle this.
                # parsed_event.offset = self._text_reader.line_offset
                parser_mediator.ProduceEvent(parsed_event)

            self._text_reader.SkipAhead(file_object, end)

        else:
            odd_line = self._text_reader.ReadLine(file_object)
            if odd_line:
                logging.warning(u'Unable to parse log line: {0:s}'.format(
                    repr(odd_line)))

        try:
            self._text_reader.ReadLines(file_object)
        except UnicodeDecodeError as exception:
            parser_mediator.ProduceParseError(
                u'Unable to read lines from file with error: {0!s}'.format(
                    exception))
def ParseFileObject(self, parser_mediator, file_object, **kwargs):
    """Parses a text file-like object using a lexer.

    Args:
      parser_mediator: a parser mediator object (instance of ParserMediator).
      file_object: a file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    file_entry = parser_mediator.GetFileEntry()
    path_spec_printable = u'{0:s}:{1:s}'.format(
        file_entry.path_spec.type_indicator, file_entry.name)

    # TODO: this is necessary since we inherit from lexer.SelfFeederMixIn.
    self.file_object = file_object

    self._file_verified = False

    # Start by checking, is this a text file or not? Before we proceed
    # any further.
    file_object.seek(0, os.SEEK_SET)
    if not utils.IsText(file_object.read(40)):
        raise errors.UnableToParseFile(
            u'Not a text file, unable to proceed.')

    file_object.seek(0, os.SEEK_SET)

    error_count = 0

    # We need to clear out few values in the Lexer before continuing.
    # There might be some leftovers from previous run.
    self.error = 0
    self.buffer = b''

    while True:
        _ = self.NextToken()

        if self.state == u'INITIAL':
            self.entry_offset = self.next_entry_offset
            self.next_entry_offset = file_object.tell() - len(self.buffer)

        # Give up when the lexer error count reaches twice MAX_LINES before
        # any line was parsed successfully.
        if not self._file_verified and self.error >= self.MAX_LINES * 2:
            logging.debug(
                u'Lexer error count: {0:d} and current state {1:s}'.format(
                    self.error, self.state))
            raise errors.UnableToParseFile(
                u'[{0:s}] unsupported file: {1:s}.'.format(
                    self.NAME, path_spec_printable))

        if self.line_ready:
            try:
                self.ParseLine(parser_mediator)
                self._file_verified = True

            except errors.TimestampError as exception:
                error_count += 1
                if self._file_verified:
                    logging.debug(
                        u'[{0:s} VERIFIED] Error count: {1:d} and ERROR: {2:d}'
                        .format(path_spec_printable, error_count, self.error))
                    # The "!s" conversion is used since a format
                    # specification such as ":s" is not supported by
                    # exception objects on Python 3.
                    logging.warning(
                        u'[{0:s}] Unable to parse timestamp with error: {1!s}'
                        .format(self.NAME, exception))

                else:
                    logging.debug((
                        u'[{0:s} EVALUATING] Error count: {1:d} and ERROR: '
                        u'{2:d})').format(
                            path_spec_printable, error_count, self.error))

                    if error_count >= self.MAX_LINES:
                        raise errors.UnableToParseFile(
                            u'[{0:s}] unsupported file: {1:s}.'.format(
                                self.NAME, path_spec_printable))

            finally:
                self.ClearValues()

        if self.Empty():
            # Try to fill the buffer to prevent the parser from ending
            # prematurely.
            self.Feed()

        if self.Empty():
            break

    if not self._file_verified:
        raise errors.UnableToParseFile(
            u'[{0:s}] unable to parse file: {1:s}.'.format(
                self.NAME, path_spec_printable))

    file_offset = file_object.get_offset()
    if file_offset < file_object.get_size():
        parser_mediator.ProduceParseError((
            u'{0:s} prematurely terminated parsing: {1:s} at offset: '
            u'0x{2:08x}.').format(
                self.NAME, path_spec_printable, file_offset))
def Parse(self, file_entry):
    """Extract data from a text file using a pyparsing definition.

    This is a generator: the file is only opened and read once iteration
    starts, and the file object is closed when iteration ends, fails or the
    generator is closed early.

    Args:
      file_entry: A file entry object.

    Yields:
      An event object (EventObject) that contains the parsed attributes.

    Raises:
      UnableToParseFile: when no line structures are declared, no first line
                         can be read, the first line is not text or
                         VerifyStructure() rejects it.
    """
    # TODO: find a more elegant way for this; currently the mac_wifi and
    # syslog parser seem to rely on this member.
    self.file_entry = file_entry

    # Checked before the file is opened so that this failure cannot leave
    # an open file object behind.
    if not self.LINE_STRUCTURES:
        raise errors.UnableToParseFile(
            u'Line structure undeclared, unable to proceed.')

    file_object = file_entry.GetFileObject()
    try:
        file_object.seek(0, os.SEEK_SET)
        text_file_object = text_file.TextFile(file_object)

        line = self._ReadLine(text_file_object, self.MAX_LINE_LENGTH, True)
        if not line:
            raise errors.UnableToParseFile(u'Not a text file.')

        if len(line) == self.MAX_LINE_LENGTH or len(
                line) == self.MAX_LINE_LENGTH - 1:
            logging.debug((
                u'Trying to read a line and reached the maximum allowed length of '
                '{}. The last few bytes of the line are: {} [parser {}]').format(
                    self.MAX_LINE_LENGTH, repr(line[-10:]), self.parser_name))

        if not utils.IsText(line):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        if not self.VerifyStructure(line):
            raise errors.UnableToParseFile('Wrong file structure.')

        # Set the offset to the beginning of the file.
        self._current_offset = 0

        # Read every line in the text file.
        while line:
            parsed_structure = None
            use_key = None

            # Try to parse the line using all the line structures.
            for key, structure in self.LINE_STRUCTURES:
                try:
                    parsed_structure = structure.parseString(line)
                except pyparsing.ParseException:
                    pass
                if parsed_structure:
                    use_key = key
                    break

            if parsed_structure:
                parsed_event = self.ParseRecord(use_key, parsed_structure)
                if parsed_event:
                    parsed_event.offset = self._current_offset
                    yield parsed_event
            else:
                logging.warning(
                    u'Unable to parse log line: {}'.format(line))

            self._current_offset = text_file_object.get_offset()
            line = self._ReadLine(text_file_object)
    finally:
        # Release the file object on every path, including the
        # UnableToParseFile errors raised above.
        file_object.close()
def ParseFileObject(self, parser_mediator, file_object, **kwargs):
    """Parses a text file-like object with multi-line pyparsing structures.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      file_object (dfvfs.FileIO): file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    if not self.LINE_STRUCTURES:
        raise errors.UnableToParseFile('Missing line structures.')

    text_reader = self._text_reader
    text_reader.Reset()

    try:
        text_reader.ReadLines(file_object)
    except UnicodeDecodeError as exception:
        raise errors.UnableToParseFile(
            'Not a text file, with error: {0!s}'.format(exception))

    if not utils.IsText(text_reader.lines):
        raise errors.UnableToParseFile(
            'Not a text file, unable to proceed.')

    if not self.VerifyStructure(parser_mediator, text_reader.lines):
        raise errors.UnableToParseFile('Wrong file structure.')

    # Using parseWithTabs() overrides Pyparsing's default replacement of tabs
    # with spaces to SkipAhead() the correct number of bytes after a match.
    for key, structure in self.LINE_STRUCTURES:
        structure.parseWithTabs()

    # Keep going while there is buffered text and no abort was requested.
    while text_reader.lines and not parser_mediator.abort:
        # Result of the most recent match, if any.
        tokens = None
        start = 0
        end = 0

        key = None

        # Try the line structures in order until one matches at the very
        # beginning of the buffered text.
        for key, structure in self.LINE_STRUCTURES:
            try:
                matches = structure.scanString(
                    text_reader.lines, maxMatches=1)
                first_match = next(matches, None)
            except pyparsing.ParseException:
                continue

            if not first_match:
                continue

            tokens, start, end = first_match

            if start == 0:
                break

        if not tokens or start != 0:
            # Nothing usable at the start of the buffer: skip one line.
            odd_line = text_reader.ReadLine(file_object)
            if odd_line:
                if len(odd_line) > 80:
                    odd_line = '{0:s}...'.format(odd_line[:77])
                parser_mediator.ProduceExtractionError(
                    'unable to parse log line: {0:s}'.format(
                        repr(odd_line)))

        else:
            try:
                self.ParseRecord(parser_mediator, key, tokens)
            except (errors.ParseError, errors.TimestampError) as exception:
                parser_mediator.ProduceExtractionError(
                    'unable parse record: {0:s} with error: {1!s}'.format(
                        key, exception))

            text_reader.SkipAhead(file_object, end)

        try:
            text_reader.ReadLines(file_object)
        except UnicodeDecodeError as exception:
            parser_mediator.ProduceExtractionError(
                'unable to read lines with error: {0!s}'.format(exception))
def ParseFileObject(self, parser_mediator, file_object, **kwargs):
    """Parses a text file-like object line by line with pyparsing.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      file_object (dfvfs.FileIO): file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    # TODO: self._line_structures is a work-around and this needs
    # a structural fix.
    if not self._line_structures:
        raise errors.UnableToParseFile(
            'Line structure undeclared, unable to proceed.')

    # A parser specific encoding takes precedence over the codepage.
    encoding = self._ENCODING or parser_mediator.codepage
    text_file_object = text_file.TextFile(file_object, encoding=encoding)

    try:
        line = self._ReadLine(text_file_object, max_len=self.MAX_LINE_LENGTH)
    except UnicodeDecodeError:
        raise errors.UnableToParseFile(
            'Not a text file or encoding not supported.')

    if not line:
        raise errors.UnableToParseFile('Not a text file.')

    # A first line at (or one short of) the maximum length is only logged.
    if len(line) in (self.MAX_LINE_LENGTH, self.MAX_LINE_LENGTH - 1):
        line_tail = repr(line[-10:])
        logging.debug((
            'Trying to read a line and reached the maximum allowed length of '
            '{0:d}. The last few bytes of the line are: {1:s} [parser '
            '{2:s}]').format(self.MAX_LINE_LENGTH, line_tail, self.NAME))

    if not utils.IsText(line):
        raise errors.UnableToParseFile(
            'Not a text file, unable to proceed.')

    if not self.VerifyStructure(parser_mediator, line):
        raise errors.UnableToParseFile('Wrong file structure.')

    # Offsets are tracked starting at the beginning of the file.
    self._current_offset = 0

    # Process the file one line at a time until it is exhausted or an
    # abort was requested.
    while line and not parser_mediator.abort:
        matched_key = None
        matched_structure = None

        # The first line structure that yields a non-empty result wins.
        for key, structure in self.LINE_STRUCTURES:
            try:
                candidate = structure.parseString(line)
            except pyparsing.ParseException:
                continue

            if candidate:
                matched_key = key
                matched_structure = candidate
                break

        if not matched_structure:
            printable_line = line
            if len(printable_line) > 80:
                printable_line = '{0:s}...'.format(printable_line[:77])
            parser_mediator.ProduceExtractionError(
                'unable to parse log line: {0:s} at offset {1:d}'.format(
                    repr(printable_line), self._current_offset))
        else:
            parsed_event = self.ParseRecord(
                parser_mediator, matched_key, matched_structure)
            if parsed_event:
                parsed_event.offset = self._current_offset
                parser_mediator.ProduceEvent(parsed_event)

        self._current_offset = text_file_object.get_offset()

        try:
            line = self._ReadLine(text_file_object)
        except UnicodeDecodeError:
            # A line that cannot be decoded ends the parsing of this file.
            parser_mediator.ProduceExtractionError(
                'unable to read and decode log line at offset {0:d}'.
                format(self._current_offset))
            break