Example #1
0
  def VerifyFile(self, file_object):
    """Determines whether a file-like object contains PLSRecall.dat data.

    The first record is read from the start of the file; its timestamp,
    its text fields and the leading word of its query are sanity checked.

    Args:
      file_object: file that we want to check.

    Returns:
      True if this is a valid PLSRecall.dat file, otherwise False.
    """
    file_object.seek(0, os.SEEK_SET)

    # A PLSRecall.dat file is made up of equally sized (4125 bytes)
    # TRecallRecord PL/SQL structures. Note that the query value is free form.
    try:
      record = self.PLS_STRUCT.parse_stream(file_object)
    except (IOError, construct.FieldError):
      return False

    # Sanity check a few values of the record, starting with its timestamp.
    try:
      record_time = timelib.Timestamp.FromDelphiTime(record.TimeStamp)
    except ValueError:
      return False

    if record_time <= 0:
      return False

    # Reject timestamps that lie more than six years in the future. Six years
    # is an arbitrary bound, so this is not guaranteed to catch everything.
    # TODO: Add a comparable lower bound; a timestamp before 1980 is most
    # likely invalid.
    # TODO: This is a very flaky assumption. Find a better one.
    latest_plausible_time = (
        timelib.Timestamp.GetNow() + self._SIX_YEARS_IN_MICRO_SECONDS)
    if record_time > latest_plausible_time:
      return False

    # TODO: Add other verification checks here, for instance that the query
    # really looks like SQL or that the username, sequence number or database
    # name look legitimate. This structure produces a lot of false positives.
    # For now all three fields only have to pass the "is this text" test.
    for field_name in ('Username', 'Query', 'Database'):
      if not utils.IsText(getattr(record, field_name)):
        return False

    # The first word of the query has to be one of the allowed keywords.
    leading_word = record.Query.partition(b' ')[0]
    return leading_word.lower() in self._PLS_KEYWORD
Example #2
0
    def VerifyFile(self, file_object):
        """Determines whether a file-like object contains PLSRecall.dat data.

        The first record is read from the start of the file; its timestamp
        and its text fields are sanity checked.

        Args:
          file_object: file that we want to check.

        Returns:
          True if this is a valid PLSRecall.dat file, otherwise False.
        """
        file_object.seek(0, os.SEEK_SET)

        # A PLSRecall.dat file is made up of equally sized (4125 bytes)
        # TRecallRecord PL-SQL structures. Note that the query value is free
        # form.
        try:
            record = self.PLS_STRUCT.parse_stream(file_object)
        except (IOError, construct.FieldError):
            return False

        # Sanity check a few values of the record, starting with its
        # timestamp.
        try:
            record_time = timelib.Timestamp.FromDelphiTime(record.TimeStamp)
        except ValueError:
            return False

        if record_time <= 0:
            return False

        # TODO: Add other verification checks here, for instance that the
        # query really looks like SQL or that the username, sequence number or
        # database name look legitimate. This structure produces a lot of
        # false positives. For now all three fields only have to pass the
        # "is this text" test.
        return all(
            utils.IsText(getattr(record, field_name))
            for field_name in ('Username', 'Query', 'Database'))
Example #3
0
    def testIsText(self):
        """Tests utils.IsText with text-like and binary-like input."""
        # Pairs of (input, whether IsText is expected to accept it); the
        # pairs are checked in order.
        test_cases = [
            ('thisi My Weird ASCII and non whatever string.', True),
            (u'Plaso Síar Og Raðar Þessu', True),
            ('\x01\62LSO\xFF', False),
            ('T\x00h\x00i\x00s\x00\x20\x00', True),
            ('Ascii\x00', True),
            ('Ascii Start then...\x00\x99\x23', False),
        ]

        for candidate, expected_text in test_cases:
            if expected_text:
                self.assertTrue(utils.IsText(candidate))
            else:
                self.assertFalse(utils.IsText(candidate))
Example #4
0
  def testIsText(self):
    """Tests utils.IsText with text-like and binary-like input."""
    # Pairs of (input, whether IsText is expected to accept it); the pairs
    # are checked in order.
    test_cases = [
        (b'this is My Weird ASCII and non whatever string.', True),
        ('Plaso Síar Og Raðar Þessu', True),
        (b'\x01\\62LSO\xFF', False),
        (b'T\x00h\x00i\x00s\x00\x20\x00', True),
        (b'Ascii\x00', True),
        (b'Ascii Open then...\x00\x99\x23', False),
    ]

    for candidate, expected_text in test_cases:
      if expected_text:
        self.assertTrue(utils.IsText(candidate))
      else:
        self.assertFalse(utils.IsText(candidate))
Example #5
0
  def _ReadRecord(self, text_file_object, max_line_length=0):
    """Reads one four-line record from an Opera global_history file.

    The four lines of a record are, in order:
      Title of page (or the URL if not there).
      Website URL.
      Timestamp in POSIX time.
      Popularity index (-1 if first time visited).

    Args:
      text_file_object: A text file object (instance of dfvfs.TextFile).
      max_line_length: An integer that denotes the maximum byte
                       length for the title line; 0 reads the line unbounded
                       and skips the text check.

    Returns:
      A tuple of: title, url, timestamp, popularity_index. All four values
      are None when no complete, well-formed record could be read.

    Raises:
      errors.NotAText: If the file being read is not a text file.
    """
    no_record = (None, None, None, None)

    if not max_line_length:
      title = text_file_object.readline().strip()
    else:
      raw_title = text_file_object.readline(max_line_length)
      # A full-length read without a line ending means the line is longer
      # than allowed.
      reached_limit = len(raw_title) == max_line_length
      if reached_limit and not raw_title.endswith('\n'):
        return no_record
      if not utils.IsText(raw_title):
        raise errors.NotAText(u'Title line is not a text.')
      title = raw_title.strip()

    if not title:
      return no_record

    url = text_file_object.readline().strip()
    if not url:
      return no_record

    # Both remaining lines are consumed before either one is validated.
    timestamp_text = text_file_object.readline().strip()
    popularity_text = text_file_object.readline().strip()

    try:
      timestamp = int(timestamp_text, 10)
    except ValueError:
      # Only the first 30 characters of the bad line are logged.
      logging.debug(u'Unable to read in timestamp [{!r}]'.format(
          timestamp_text[:30]))
      return no_record

    try:
      popularity_index = int(popularity_text, 10)
    except ValueError:
      try:
        logging.debug(u'Unable to read in popularity index[{}]'.format(
            popularity_text))
      except UnicodeDecodeError:
        logging.debug(
            u'Unable to read in popularity index [unable to print '
            u'bad line]')
      return no_record

    # Convert the title into Unicode; fall back to the decodable part.
    try:
      title_unicode = title.decode('utf-8')
    except UnicodeDecodeError:
      title_unicode = u'Warning: partial line, starts with: {}'.format(
          title.decode('utf-8', 'ignore'))

    return title_unicode, url, timestamp, popularity_index
Example #6
0
    def Parse(self, parser_context, file_entry):
        """Parse a text file using a pyparsing definition.

        Parsed events are passed to parser_context.ProduceEvent(); nothing is
        returned or yielded. The file object opened from the file entry is
        closed again before this method returns or raises.

        Args:
          parser_context: A parser context object (instance of ParserContext).
          file_entry: A file entry object (instance of dfvfs.FileEntry).

        Raises:
          UnableToParseFile: when no line structures are declared, the buffer
                             does not look like text or the structure cannot
                             be verified.
        """
        self.file_entry = file_entry

        file_object = file_entry.GetFileObject()

        # The try/finally makes sure the file object is released on every
        # path, including the UnableToParseFile raises below.
        try:
            if not self.LINE_STRUCTURES:
                raise errors.UnableToParseFile(
                    u'Line structure undeclared, unable to proceed.')

            file_object.seek(0, os.SEEK_SET)

            self._buffer = ''
            self._FillBuffer(file_object)

            if not utils.IsText(self._buffer):
                raise errors.UnableToParseFile(
                    u'Not a text file, unable to proceed.')

            if not self.VerifyStructure(parser_context, self._buffer):
                raise errors.UnableToParseFile('Wrong file structure.')

            # Set the offset to the beginning of the file.
            self._current_offset = 0

            # Read every line in the text file.
            while self._buffer:
                # Initialize pyparsing objects.
                tokens = None
                start = 0
                end = 0

                structure_key = None

                # Try to parse the line using all the line structures.
                for key, structure in self.LINE_STRUCTURES:
                    try:
                        parsed_structure = next(
                            structure.scanString(self._buffer, maxMatches=1),
                            None)
                    except pyparsing.ParseException:
                        continue
                    if not parsed_structure:
                        continue

                    tokens, start, end = parsed_structure

                    # Only want to parse the structure if it starts
                    # at the beginning of the buffer.
                    if start == 0:
                        structure_key = key
                        break

                if tokens and not start:
                    parsed_event = self.ParseRecord(
                        parser_context, structure_key, tokens)
                    if parsed_event:
                        parsed_event.offset = self._current_offset
                        parser_context.ProduceEvent(parsed_event,
                                                    parser_name=self.NAME,
                                                    file_entry=file_entry)

                    # Drop the matched part from the front of the buffer.
                    self._current_offset += end
                    self._buffer = self._buffer[end:]
                else:
                    # Nothing matched at the start of the buffer: skip a line.
                    old_line = self._NextLine(file_object)
                    if old_line:
                        logging.warning(
                            u'Unable to parse log line: {0:s}'.format(
                                repr(old_line)))

                # Re-fill the buffer.
                self._FillBuffer(file_object)
        finally:
            file_object.close()
Example #7
0
    def Parse(self, parser_context, file_entry):
        """Extract data from a text file using a pyparsing definition.

        Parsed events are passed to parser_context.ProduceEvent(); nothing is
        returned or yielded. The file object opened from the file entry is
        closed again before this method returns or raises.

        Args:
          parser_context: A parser context object (instance of ParserContext).
          file_entry: A file entry object (instance of dfvfs.FileEntry).

        Raises:
          UnableToParseFile: when no line structures are declared, the first
                             line cannot be read, is not text or does not
                             pass the structure verification.
        """
        # TODO: find a more elegant way for this; currently the mac_wifi and
        # syslog parser seem to rely on this member.
        self.file_entry = file_entry

        file_object = file_entry.GetFileObject()

        # The try/finally makes sure the file object is released on every
        # path, including the UnableToParseFile raises below.
        try:
            # TODO: self._line_structures is a work-around and this needs
            # a structural fix.
            if not self._line_structures:
                raise errors.UnableToParseFile(
                    u'Line structure undeclared, unable to proceed.')

            file_object.seek(0, os.SEEK_SET)
            text_file_object = text_file.TextFile(file_object)

            line = self._ReadLine(parser_context,
                                  file_entry,
                                  text_file_object,
                                  max_len=self.MAX_LINE_LENGTH,
                                  quiet=True)
            if not line:
                raise errors.UnableToParseFile(u'Not a text file.')

            if len(line) == self.MAX_LINE_LENGTH or len(
                    line) == self.MAX_LINE_LENGTH - 1:
                logging.debug((
                    u'Trying to read a line and reached the maximum allowed '
                    u'length of {0:d}. The last few bytes of the line are: '
                    u'{1:s} [parser {2:s}]').format(
                        self.MAX_LINE_LENGTH, repr(line[-10:]), self.NAME))

            if not utils.IsText(line):
                raise errors.UnableToParseFile(
                    u'Not a text file, unable to proceed.')

            if not self.VerifyStructure(parser_context, line):
                raise errors.UnableToParseFile('Wrong file structure.')

            # Set the offset to the beginning of the file.
            self._current_offset = 0
            # Read every line in the text file.
            while line:
                parsed_structure = None
                use_key = None
                # Try to parse the line using all the line structures; the
                # first structure that yields a non-empty result wins.
                for key, structure in self.LINE_STRUCTURES:
                    try:
                        parsed_structure = structure.parseString(line)
                    except pyparsing.ParseException:
                        pass
                    if parsed_structure:
                        use_key = key
                        break

                if parsed_structure:
                    parsed_event = self.ParseRecord(parser_context, use_key,
                                                    parsed_structure)
                    if parsed_event:
                        parsed_event.offset = self._current_offset
                        parser_context.ProduceEvent(parsed_event,
                                                    parser_name=self.NAME,
                                                    file_entry=file_entry)
                else:
                    logging.warning(
                        u'Unable to parse log line: {0:s}'.format(line))

                self._current_offset = text_file_object.get_offset()
                line = self._ReadLine(
                    parser_context, file_entry, text_file_object)
        finally:
            file_object.close()
Example #8
0
    def Parse(self, parser_context, file_entry):
        """Extract data from a text file.

        Parsed events are passed to parser_context.ProduceEvent(); nothing is
        returned or yielded. The file object opened from the file entry is
        closed again before this method returns or raises.

        Args:
          parser_context: A parser context object (instance of ParserContext).
          file_entry: A file entry object (instance of dfvfs.FileEntry).

        Raises:
          UnableToParseFile: when the file does not start with text, too many
                             errors occur before a first line was parsed or
                             no line could be parsed at all.
        """
        path_spec_printable = u'{0:s}:{1:s}'.format(
            file_entry.path_spec.type_indicator, file_entry.name)
        file_object = file_entry.GetFileObject()

        self.file_entry = file_entry
        # TODO: this is necessary since we inherit from lexer.SelfFeederMixIn.
        self.file_object = file_object

        # A single try/finally releases the file object on every path,
        # including the early "not a text file" raise and unexpected errors.
        try:
            # Start by checking, is this a text file or not? Before we proceed
            # any further.
            file_object.seek(0, os.SEEK_SET)
            if not utils.IsText(file_object.read(40)):
                raise errors.UnableToParseFile(
                    u'Not a text file, unable to proceed.')

            file_object.seek(0, os.SEEK_SET)

            error_count = 0
            file_verified = False
            # We need to clear out few values in the Lexer before continuing.
            # There might be some leftovers from previous run.
            self.error = 0
            self.buffer = ''

            while True:
                _ = self.NextToken()

                if self.state == 'INITIAL':
                    self.entry_offset = getattr(self, 'next_entry_offset', 0)
                    self.next_entry_offset = (
                        file_object.tell() - len(self.buffer))

                # Give up when the lexer keeps failing before any line was
                # parsed successfully.
                if not file_verified and self.error >= self.MAX_LINES * 2:
                    logging.debug(
                        u'Lexer error count: {0:d} and current state '
                        u'{1:s}'.format(self.error, self.state))
                    raise errors.UnableToParseFile(
                        u'[{0:s}] unsupported file: {1:s}.'.format(
                            self.NAME, path_spec_printable))

                if self.line_ready:
                    try:
                        event_object = self.ParseLine(parser_context)
                        parser_context.ProduceEvent(event_object,
                                                    parser_name=self.NAME,
                                                    file_entry=file_entry)

                        file_verified = True

                    except errors.TimestampNotCorrectlyFormed as exception:
                        error_count += 1
                        if file_verified:
                            logging.debug((
                                u'[{0:s} VERIFIED] Error count: {1:d} and '
                                u'ERROR: {2:d}').format(
                                    path_spec_printable, error_count,
                                    self.error))
                            logging.warning((
                                u'[{0:s}] Unable to parse timestamp with '
                                u'error: {1:s}').format(self.NAME, exception))

                        else:
                            logging.debug((
                                u'[{0:s} EVALUATING] Error count: {1:d} and '
                                u'ERROR: {2:d})').format(
                                    path_spec_printable, error_count,
                                    self.error))

                            if error_count >= self.MAX_LINES:
                                raise errors.UnableToParseFile(
                                    u'[{0:s}] unsupported file: {1:s}.'.format(
                                        self.NAME, path_spec_printable))

                    finally:
                        self.ClearValues()

                if self.Empty():
                    # Try to fill the buffer to prevent the parser from ending
                    # prematurely.
                    self.Feed()

                if self.Empty():
                    break

            if not file_verified:
                raise errors.UnableToParseFile(
                    u'[{0:s}] unable to parse file: {1:s}.'.format(
                        self.NAME, path_spec_printable))

            file_offset = file_object.get_offset()
            if file_offset < file_object.get_size():
                logging.error((
                    u'{0:s} prematurely terminated parsing: {1:s} at offset: '
                    u'0x{2:08x}.').format(
                        self.NAME, path_spec_printable, file_offset))
        finally:
            file_object.close()
Example #9
0
    def ParseFileObject(self, parser_mediator, file_object, **kwargs):
        """Parses a text file-like object line by line with pyparsing.

        Events are produced through the parser mediator; lines that match
        none of the line structures are reported as parse errors.

        Args:
          parser_mediator: a parser mediator object (instance of
                           ParserMediator).
          file_object: a file-like object.

        Raises:
          UnableToParseFile: when the file cannot be parsed.
        """
        file_entry = parser_mediator.GetFileEntry()

        # TODO: find a more elegant way for this; currently the mac_wifi and
        # syslog parser seem to rely on this member.
        self.file_entry = file_entry

        # TODO: self._line_structures is a work-around and this needs
        # a structural fix.
        if not self._line_structures:
            raise errors.UnableToParseFile(
                u'Line structure undeclared, unable to proceed.')

        text_file_object = text_file.TextFile(file_object)

        # The first line is read with a length limit and used to decide
        # whether this parser applies to the file at all.
        line = self._ReadLine(
            parser_mediator, file_entry, text_file_object,
            max_len=self.MAX_LINE_LENGTH, quiet=True)
        if not line:
            raise errors.UnableToParseFile(u'Not a text file.')

        if len(line) in (self.MAX_LINE_LENGTH, self.MAX_LINE_LENGTH - 1):
            logging.debug((
                u'Trying to read a line and reached the maximum allowed length of '
                u'{0:d}. The last few bytes of the line are: {1:s} [parser '
                u'{2:s}]').format(
                    self.MAX_LINE_LENGTH, repr(line[-10:]), self.NAME))

        if not utils.IsText(line):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        if not self.VerifyStructure(parser_mediator, line):
            raise errors.UnableToParseFile(u'Wrong file structure.')

        # Offsets are tracked relative to the beginning of the file.
        self._current_offset = 0

        while line:
            matched_key = None
            matched_tokens = None

            # The first line structure with a non-empty parse result wins.
            for key, structure in self.LINE_STRUCTURES:
                try:
                    candidate = structure.parseString(line)
                except pyparsing.ParseException:
                    continue
                if candidate:
                    matched_key = key
                    matched_tokens = candidate
                    break

            if not matched_tokens:
                # Long lines are shortened before they are reported.
                printable_line = line
                if len(printable_line) > 80:
                    printable_line = u'{0:s}...'.format(printable_line[0:77])
                parser_mediator.ProduceParseError(
                    u'Unable to parse log line: {0:s} at offset {1:d}'.format(
                        repr(printable_line), self._current_offset))
            else:
                parsed_event = self.ParseRecord(
                    parser_mediator, matched_key, matched_tokens)
                if parsed_event:
                    parsed_event.offset = self._current_offset
                    parser_mediator.ProduceEvent(parsed_event)

            self._current_offset = text_file_object.get_offset()
            line = self._ReadLine(
                parser_mediator, file_entry, text_file_object)
Example #10
0
    def ParseFileObject(self, parser_mediator, file_object, **kwargs):
        """Parses a text file-like object using a pyparsing definition.

        The text reader buffer is scanned with every line structure; a match
        at the very start of the buffer is handed to ParseRecord, otherwise
        one line is dropped and reported as a parse error.

        Args:
          parser_mediator: a parser mediator object (instance of
                           ParserMediator).
          file_object: a file-like object.

        Raises:
          UnableToParseFile: when the file cannot be parsed.
        """
        if not self.LINE_STRUCTURES:
            raise errors.UnableToParseFile(u'Missing line structures.')

        text_reader = self._text_reader
        text_reader.Reset()

        try:
            text_reader.ReadLines(file_object)
        except UnicodeDecodeError as exception:
            raise errors.UnableToParseFile(
                u'Not a text file, with error: {0:s}'.format(exception))

        if not utils.IsText(text_reader.lines):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        if not self.VerifyStructure(parser_mediator, text_reader.lines):
            raise errors.UnableToParseFile(u'Wrong file structure.')

        # Using parseWithTabs() overrides Pyparsing's default replacement of tabs
        # with spaces to SkipAhead() the correct number of bytes after a match.
        for _, line_structure in self.LINE_STRUCTURES:
            line_structure.parseWithTabs()

        # Keep going for as long as the reader has buffered text.
        while text_reader.lines:
            match_tokens = None
            match_start = 0
            match_end = 0
            match_key = None

            # Scan the buffer with each line structure in turn.
            for match_key, line_structure in self.LINE_STRUCTURES:
                try:
                    scan_result = next(
                        line_structure.scanString(
                            text_reader.lines, maxMatches=1), None)
                except pyparsing.ParseException:
                    continue

                if not scan_result:
                    continue

                match_tokens, match_start, match_end = scan_result

                # A match only counts when it begins at the start of the
                # buffer.
                if match_start == 0:
                    break

            if match_tokens and match_start == 0:
                self.ParseRecord(parser_mediator, match_key, match_tokens)

                text_reader.SkipAhead(file_object, match_end)

            else:
                skipped_line = text_reader.ReadLine(file_object)
                if skipped_line:
                    # Long lines are shortened before they are reported.
                    if len(skipped_line) > 80:
                        skipped_line = u'{0:s}...'.format(skipped_line[0:77])
                    parser_mediator.ProduceParseError(
                        u'Unable to parse log line: {0:s}'.format(
                            repr(skipped_line)))

            try:
                text_reader.ReadLines(file_object)
            except UnicodeDecodeError as exception:
                parser_mediator.ProduceParseError(
                    u'Unable to read lines from file with error: {0:s}'.format(
                        exception))
Example #11
0
    def ParseFileObject(self, parser_mediator, file_object, **kwargs):
        """Parses a text file-like object using a pyparsing definition.

        The text reader buffer is scanned with every line structure; a match
        at the very start of the buffer is turned into an event, otherwise
        one line is dropped and logged as a warning.

        Args:
          parser_mediator: A parser mediator object (instance of
                           ParserMediator).
          file_object: A file-like object.

        Raises:
          UnableToParseFile: when the file cannot be parsed.
        """
        if not self.LINE_STRUCTURES:
            raise errors.UnableToParseFile(u'Missing line structures.')

        text_reader = self._text_reader
        text_reader.Reset()

        try:
            text_reader.ReadLines(file_object)
        except UnicodeDecodeError as exception:
            raise errors.UnableToParseFile(
                u'Not a text file, with error: {0:s}'.format(exception))

        if not utils.IsText(text_reader.lines):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        if not self.VerifyStructure(parser_mediator, text_reader.lines):
            raise errors.UnableToParseFile(u'Wrong file structure.')

        # Keep going for as long as the reader has buffered text.
        while text_reader.lines:
            match_tokens = None
            match_start = 0
            match_end = 0
            match_key = None

            # Scan the buffer with each line structure in turn.
            for match_key, line_structure in self.LINE_STRUCTURES:
                try:
                    scan_result = next(
                        line_structure.scanString(
                            text_reader.lines, maxMatches=1), None)
                except pyparsing.ParseException:
                    continue

                if not scan_result:
                    continue

                match_tokens, match_start, match_end = scan_result

                # A match only counts when it begins at the start of the
                # buffer.
                if match_start == 0:
                    break

            if match_tokens and match_start == 0:
                event_object = self.ParseRecord(
                    parser_mediator, match_key, match_tokens)
                if event_object:
                    # TODO: need a reliable way to handle this.
                    # event_object.offset = self._text_reader.line_offset
                    parser_mediator.ProduceEvent(event_object)

                text_reader.SkipAhead(file_object, match_end)

            else:
                skipped_line = text_reader.ReadLine(file_object)
                if skipped_line:
                    logging.warning(u'Unable to parse log line: {0:s}'.format(
                        repr(skipped_line)))

            try:
                text_reader.ReadLines(file_object)
            except UnicodeDecodeError as exception:
                parser_mediator.ProduceParseError(
                    u'Unable to read lines from file with error: {0:s}'.format(
                        exception))
Example #12
0
    def ParseFileObject(self, parser_mediator, file_object, **kwargs):
        """Parses a text file-like object using a lexer.

        The first 40 bytes are checked for being text, after which tokens are
        read with NextToken() until the lexer buffer stays empty. Every ready
        line is passed to ParseLine(); the file counts as verified once one
        line was parsed without a timestamp error.

        Args:
          parser_mediator: a parser mediator object (instance of
                           ParserMediator).
          file_object: a file-like object.

        Raises:
          UnableToParseFile: when the file cannot be parsed.
        """
        file_entry = parser_mediator.GetFileEntry()
        # Printable "<type indicator>:<name>" form used in log and error
        # messages.
        path_spec_printable = u'{0:s}:{1:s}'.format(
            file_entry.path_spec.type_indicator, file_entry.name)

        # TODO: this is necessary since we inherit from lexer.SelfFeederMixIn.
        self.file_object = file_object
        self._file_verified = False

        # Start by checking, is this a text file or not? Before we proceed
        # any further.
        file_object.seek(0, os.SEEK_SET)
        if not utils.IsText(file_object.read(40)):
            raise errors.UnableToParseFile(
                u'Not a text file, unable to proceed.')

        # Rewind so that the lexer starts at the beginning of the file.
        file_object.seek(0, os.SEEK_SET)

        # Counts timestamp errors raised by ParseLine(); self.error is a
        # separate counter that is only read here after being reset below.
        error_count = 0
        # We need to clear out few values in the Lexer before continuing.
        # There might be some leftovers from previous run.
        self.error = 0
        self.buffer = b''

        while True:
            _ = self.NextToken()

            if self.state == u'INITIAL':
                # Remember where the current entry started and compute where
                # the next one starts: file position minus unconsumed buffer.
                self.entry_offset = self.next_entry_offset
                self.next_entry_offset = file_object.tell() - len(self.buffer)

            # Give up when self.error reaches twice MAX_LINES before any line
            # was parsed successfully.
            if not self._file_verified and self.error >= self.MAX_LINES * 2:
                logging.debug(
                    u'Lexer error count: {0:d} and current state {1:s}'.format(
                        self.error, self.state))
                raise errors.UnableToParseFile(
                    u'[{0:s}] unsupported file: {1:s}.'.format(
                        self.NAME, path_spec_printable))

            if self.line_ready:
                try:
                    self.ParseLine(parser_mediator)
                    self._file_verified = True

                except errors.TimestampError as exception:
                    error_count += 1
                    if self._file_verified:
                        # Once verified, a bad timestamp is only logged.
                        logging.debug(
                            u'[{0:s} VERIFIED] Error count: {1:d} and ERROR: {2:d}'
                            .format(path_spec_printable, error_count,
                                    self.error))
                        logging.warning(
                            u'[{0:s}] Unable to parse timestamp with error: {1:s}'
                            .format(self.NAME, exception))

                    else:
                        logging.debug((
                            u'[{0:s} EVALUATING] Error count: {1:d} and ERROR: '
                            u'{2:d})').format(path_spec_printable, error_count,
                                              self.error))

                        # Too many timestamp errors before the first good
                        # line: treat the file as unsupported.
                        if error_count >= self.MAX_LINES:
                            raise errors.UnableToParseFile(
                                u'[{0:s}] unsupported file: {1:s}.'.format(
                                    self.NAME, path_spec_printable))

                finally:
                    # Runs on success, on a handled timestamp error and when
                    # UnableToParseFile propagates.
                    self.ClearValues()

            if self.Empty():
                # Try to fill the buffer to prevent the parser from ending prematurely.
                self.Feed()

            # Still empty after feeding: stop reading tokens.
            if self.Empty():
                break

        if not self._file_verified:
            raise errors.UnableToParseFile(
                u'[{0:s}] unable to parse file: {1:s}.'.format(
                    self.NAME, path_spec_printable))

        # Report a parse error when the loop ended before the end of the file.
        file_offset = file_object.get_offset()
        if file_offset < file_object.get_size():
            parser_mediator.ProduceParseError(
                (u'{0:s} prematurely terminated parsing: {1:s} at offset: '
                 u'0x{2:08x}.').format(self.NAME, path_spec_printable,
                                       file_offset))
Example #13
0
  def Parse(self, file_entry):
    """Extract data from a text file using a pyparsing definition.

    This is a generator. The file object opened from the file entry is
    closed when the generator finishes, raises or is closed by its consumer.

    Args:
      file_entry: A file entry object.

    Yields:
      An event object (EventObject) that contains the parsed
      attributes.

    Raises:
      UnableToParseFile: when no line structures are declared, the first line
                         cannot be read, is not text or does not pass the
                         structure verification.
    """
    # TODO: find a more elegant way for this; currently the mac_wifi and
    # syslog parser seem to rely on this member.
    self.file_entry = file_entry

    file_object = file_entry.GetFileObject()

    # The try/finally makes sure the file object is released on every path:
    # the UnableToParseFile raises below, errors while parsing and a consumer
    # that stops iterating early.
    try:
      if not self.LINE_STRUCTURES:
        raise errors.UnableToParseFile(
            u'Line structure undeclared, unable to proceed.')

      file_object.seek(0, os.SEEK_SET)
      text_file_object = text_file.TextFile(file_object)

      line = self._ReadLine(text_file_object, self.MAX_LINE_LENGTH, True)
      if not line:
        raise errors.UnableToParseFile(u'Not a text file.')

      if len(line) == self.MAX_LINE_LENGTH or len(
          line) == self.MAX_LINE_LENGTH - 1:
        logging.debug((
            u'Trying to read a line and reached the maximum allowed length of '
            '{}. The last few bytes of the line are: {} [parser {}]').format(
                self.MAX_LINE_LENGTH, repr(line[-10:]), self.parser_name))

      if not utils.IsText(line):
        raise errors.UnableToParseFile(u'Not a text file, unable to proceed.')

      if not self.VerifyStructure(line):
        raise errors.UnableToParseFile('Wrong file structure.')

      # Set the offset to the beginning of the file.
      self._current_offset = 0
      # Read every line in the text file.
      while line:
        parsed_structure = None
        use_key = None
        # Try to parse the line using all the line structures; the first
        # structure that yields a non-empty result wins.
        for key, structure in self.LINE_STRUCTURES:
          try:
            parsed_structure = structure.parseString(line)
          except pyparsing.ParseException:
            pass
          if parsed_structure:
            use_key = key
            break

        if parsed_structure:
          parsed_event = self.ParseRecord(use_key, parsed_structure)
          if parsed_event:
            parsed_event.offset = self._current_offset
            yield parsed_event
        else:
          logging.warning(u'Unable to parse log line: {}'.format(line))

        self._current_offset = text_file_object.get_offset()
        line = self._ReadLine(text_file_object)
    finally:
      file_object.close()
Example #14
0
    def ParseFileObject(self, parser_mediator, file_object, **kwargs):
        """Parses a text file-like object using the pyparsing line structures.

        The text reader buffer is scanned with every structure in
        LINE_STRUCTURES. A structure is only accepted when its first match
        begins at the start of the buffer; the matched tokens are passed to
        ParseRecord and the reader skips past the match. When nothing
        matches at the start, one line is read from the reader and reported
        as an extraction error.

        Args:
          parser_mediator (ParserMediator): mediates interactions between
              parsers and other components, such as storage and dfvfs.
          file_object (dfvfs.FileIO): file-like object.

        Raises:
          UnableToParseFile: when the file cannot be parsed.
        """
        if not self.LINE_STRUCTURES:
            raise errors.UnableToParseFile('Missing line structures.')

        self._text_reader.Reset()

        try:
            self._text_reader.ReadLines(file_object)
        except UnicodeDecodeError as exception:
            raise errors.UnableToParseFile(
                'Not a text file, with error: {0!s}'.format(exception))

        looks_like_text = utils.IsText(self._text_reader.lines)
        if not looks_like_text:
            raise errors.UnableToParseFile(
                'Not a text file, unable to proceed.')

        structure_verified = self.VerifyStructure(
            parser_mediator, self._text_reader.lines)
        if not structure_verified:
            raise errors.UnableToParseFile('Wrong file structure.')

        # Using parseWithTabs() overrides Pyparsing's default replacement of
        # tabs with spaces to SkipAhead() the correct number of bytes after
        # a match.
        for _, line_structure in self.LINE_STRUCTURES:
            line_structure.parseWithTabs()

        # Keep going until the buffer is drained or an abort is requested.
        while self._text_reader.lines and not parser_mediator.abort:
            matched_key = None
            matched_tokens = None
            match_end = 0

            # Try the line structures in order; stop at the first one whose
            # match starts at the beginning of the buffer.
            for structure_key, line_structure in self.LINE_STRUCTURES:
                try:
                    first_match = next(
                        line_structure.scanString(
                            self._text_reader.lines, maxMatches=1),
                        None)
                except pyparsing.ParseException:
                    first_match = None

                if first_match:
                    candidate_tokens, match_start, candidate_end = first_match
                    if match_start == 0:
                        matched_key = structure_key
                        matched_tokens = candidate_tokens
                        match_end = candidate_end
                        break

            if not matched_tokens:
                # Nothing usable at the start of the buffer: consume a
                # single line and report it, truncated to 80 characters.
                unparsed_line = self._text_reader.ReadLine(file_object)
                if unparsed_line:
                    if len(unparsed_line) > 80:
                        unparsed_line = '{0:s}...'.format(unparsed_line[:77])
                    parser_mediator.ProduceExtractionError(
                        'unable to parse log line: {0:s}'.format(
                            repr(unparsed_line)))

            else:
                try:
                    self.ParseRecord(
                        parser_mediator, matched_key, matched_tokens)
                except (errors.ParseError, errors.TimestampError) as exception:
                    parser_mediator.ProduceExtractionError(
                        'unable parse record: {0:s} with error: {1!s}'.format(
                            matched_key, exception))

                self._text_reader.SkipAhead(file_object, match_end)

            # Refill the buffer; a decoding problem is reported but does
            # not stop the loop.
            try:
                self._text_reader.ReadLines(file_object)
            except UnicodeDecodeError as exception:
                parser_mediator.ProduceExtractionError(
                    'unable to read lines with error: {0!s}'.format(exception))
Example #15
0
    def ParseFileObject(self, parser_mediator, file_object, **kwargs):
        """Parses a text file-like object one line at a time with pyparsing.

        The first line is used to decide whether the file is text and has
        the expected structure. After that every line is tried against the
        line structures in order; the first non-empty parse result is
        passed to ParseRecord and any resulting event is produced with the
        offset at which its line started. Lines no structure can parse are
        reported as extraction errors.

        Args:
          parser_mediator (ParserMediator): mediates interactions between
              parsers and other components, such as storage and dfvfs.
          file_object (dfvfs.FileIO): file-like object.

        Raises:
          UnableToParseFile: when the file cannot be parsed.
        """
        # TODO: self._line_structures is a work-around and this needs
        # a structural fix.
        if not self._line_structures:
            raise errors.UnableToParseFile(
                'Line structure undeclared, unable to proceed.')

        # A parser specific encoding takes precedence over the codepage
        # of the mediator.
        codec = self._ENCODING or parser_mediator.codepage
        text_file_object = text_file.TextFile(file_object, encoding=codec)

        try:
            current_line = self._ReadLine(
                text_file_object, max_len=self.MAX_LINE_LENGTH)
        except UnicodeDecodeError:
            raise errors.UnableToParseFile(
                'Not a text file or encoding not supported.')

        if not current_line:
            raise errors.UnableToParseFile('Not a text file.')

        first_line_length = len(current_line)
        if first_line_length in (
                self.MAX_LINE_LENGTH, self.MAX_LINE_LENGTH - 1):
            logging.debug((
                'Trying to read a line and reached the maximum allowed length of '
                '{0:d}. The last few bytes of the line are: {1:s} [parser '
                '{2:s}]').format(self.MAX_LINE_LENGTH,
                                 repr(current_line[-10:]), self.NAME))

        if not utils.IsText(current_line):
            raise errors.UnableToParseFile(
                'Not a text file, unable to proceed.')

        if not self.VerifyStructure(parser_mediator, current_line):
            raise errors.UnableToParseFile('Wrong file structure.')

        # Parsing starts at the beginning of the file.
        self._current_offset = 0

        # Handle one line per iteration until the input runs out or an
        # abort is requested.
        while current_line and not parser_mediator.abort:
            matched_key = None
            parse_result = None

            # The first structure that produces a non-empty result wins.
            for structure_key, line_structure in self.LINE_STRUCTURES:
                try:
                    candidate = line_structure.parseString(current_line)
                except pyparsing.ParseException:
                    continue
                if candidate:
                    matched_key = structure_key
                    parse_result = candidate
                    break

            if parse_result is None:
                # Report the unparseable line, truncated to 80 characters.
                printable_line = current_line
                if len(printable_line) > 80:
                    printable_line = '{0:s}...'.format(printable_line[:77])
                parser_mediator.ProduceExtractionError(
                    'unable to parse log line: {0:s} at offset {1:d}'.format(
                        repr(printable_line), self._current_offset))
            else:
                parsed_event = self.ParseRecord(
                    parser_mediator, matched_key, parse_result)
                if parsed_event:
                    parsed_event.offset = self._current_offset
                    parser_mediator.ProduceEvent(parsed_event)

            # Remember where the next line starts before reading it.
            self._current_offset = text_file_object.get_offset()

            try:
                current_line = self._ReadLine(text_file_object)
            except UnicodeDecodeError:
                parser_mediator.ProduceExtractionError(
                    'unable to read and decode log line at offset {0:d}'.
                    format(self._current_offset))
                break