Esempio n. 1
0
 def test_guess_file_date_format(self):
     """Test the `dates.guess_file_date_format` function.

     Each sample string is written to its own temporary file, the date
     format of that file is guessed, and the guess is compared against the
     expected ``strptime``-style format string. The temporary file is
     removed even when the guess raises or the assertion fails.
     """
     # (sample file data, expected format of that sample)
     data_formats = [
         ('2010/01/28 12:34:56 PM',      '%Y/%m/%d %I:%M:%S %p'),
         ('01/28/10 1:25:49 PM',         '%m/%d/%y %I:%M:%S %p'),
         ('01/28/2010 13:25:49.123',     '%m/%d/%Y %H:%M:%S.%f'),
         ('2010/08/30 13:57:14 blah',    '%Y/%m/%d %H:%M:%S'),
         ('8/30/2010 13:57 blah',        '%m/%d/%Y %H:%M'),
         ('8/30/2010 1:57:00 PM blah',   '%m/%d/%Y %I:%M:%S %p'),
     ]
     for data, expected in data_formats:
         # Create a temporary file containing the data
         filename = write_tempfile(data)
         try:
             actual = dates.guess_file_date_format(filename)
             self.assertEqual(actual, expected)
         finally:
             # Always remove the temp file, so a failing sample does not
             # leave stray files behind
             os.unlink(filename)
Esempio n. 2
0
File: utils.py Progetto: a-e/csvsee
def grep_files(filenames, matches, dateformat='guess', resolution=60,
               show_progress=True):
    """Search all the given files for matching text, and return a list of
    ``(timestamp, counts)`` for each match, where ``timestamp`` is a
    ``datetime``, and ``counts`` is a dictionary of ``{match: count}``,
    counting the number of times each match was found during intervals of
    ``resolution`` seconds.

    filenames
        Iterable of paths of the text files to read, line by line.
    matches
        Regular expression strings to search for in each line. Each
        expression is counted at most once per line.
    dateformat
        Format passed to ``dates.date_chop`` for finding timestamps. If
        empty or ``'guess'``, the format is guessed from the first file
        read, and that guess is reused for all remaining files.
    resolution
        Interval length in seconds, passed through to ``dates.date_chop``.
    show_progress
        If true, draw a progress bar on stdout for each file; otherwise
        just print the name of each file as it is read.

    Lines without a parseable timestamp are counted under the most recent
    timestamp seen in the same file, or under 1970-01-01 if the file has
    not yielded any timestamp yet. Empty lines are skipped.

    The result is sorted by timestamp.
    """
    # Counts of each match, used as a template for each row
    row_temp = [(match, 0) for match in matches]
    rows = {}

    # Compile regular expressions for matches
    # (Shaves off a little bit of execution time)
    compiled_matches = [re.compile(expr) for expr in matches]

    # Read each line of each file
    for filename in filenames:
        # Show progress bar?
        if show_progress:
            num_lines = line_count(filename)
            progress = ProgressBar(num_lines, prefix=filename, units='lines')
        # No progress bar, just print the filename being read
        else:
            print("Reading %s" % filename)

        # Guess date format?
        # NOTE: once guessed, `dateformat` is no longer 'guess', so the
        # format found in the first file is applied to every later file.
        if not dateformat or dateformat == 'guess':
            dateformat = dates.guess_file_date_format(filename)

        # HACK: Fake timestamp in case no real timestamps are ever found
        timestamp = datetime(1970, 1, 1)

        # Use a context manager so the file handle is closed promptly,
        # even if parsing raises part-way through
        with open(filename, 'r') as infile:
            # line_num is the 1-based number of the current line
            for line_num, line in enumerate(infile, 1):
                # Update progress bar every 1000 lines, and on the last line
                if show_progress:
                    if line_num % 1000 == 0 or line_num == num_lines:
                        progress.update(line_num)
                        sys.stdout.write('\r' + str(progress))
                        sys.stdout.flush()

                # Remove leading/trailing whitespace and newlines
                line = line.strip()

                # If line is empty, skip it
                if not line:
                    continue

                # See if this line has a timestamp
                try:
                    line_timestamp = dates.date_chop(
                        line, dateformat, resolution)
                # No timestamp found, stick with the current one
                except dates.CannotParse:
                    pass
                # New timestamp found, switch to it
                else:
                    timestamp = line_timestamp

                # If this datestamp hasn't appeared before, add it
                if timestamp not in rows:
                    rows[timestamp] = dict(row_temp)

                # Count the number of each match in this line
                for expr in compiled_matches:
                    if expr.search(line):
                        rows[timestamp][expr.pattern] += 1

        # If using progress bar, print a newline
        if show_progress:
            sys.stdout.write('\n')

    # Return a list of (timestamp, {counts}) tuples, sorted by timestamp.
    # `items()` (rather than the Python 2-only `iteritems()`) works on both
    # Python 2 and Python 3.
    return sorted(rows.items())