def test_guess_file_date_format(self):
    """Test the `dates.guess_file_date_format` function.

    Each sample string is written to a temporary file, the date format of
    that file is guessed, and the guess is compared to the expected
    ``strptime``-style format string.
    """
    # Sample file data, and expected format of that sample
    data_formats = [
        ('2010/01/28 12:34:56 PM', '%Y/%m/%d %I:%M:%S %p'),
        ('01/28/10 1:25:49 PM', '%m/%d/%y %I:%M:%S %p'),
        ('01/28/2010 13:25:49.123', '%m/%d/%Y %H:%M:%S.%f'),
        ('2010/08/30 13:57:14 blah', '%Y/%m/%d %H:%M:%S'),
        ('8/30/2010 13:57 blah', '%m/%d/%Y %H:%M'),
        ('8/30/2010 1:57:00 PM blah', '%m/%d/%Y %I:%M:%S %p'),
    ]
    for data, expected in data_formats:
        # Create a temporary file containing the data
        filename = write_tempfile(data)
        try:
            actual = dates.guess_file_date_format(filename)
            self.assertEqual(actual, expected)
        finally:
            # Remove the temp file even when the guess raises or the
            # assertion fails, so a failing case leaves nothing behind
            os.unlink(filename)
def grep_files(filenames, matches, dateformat='guess', resolution=60,
               show_progress=True):
    """Search all the given files for matching text, and return a list of
    ``(timestamp, counts)`` for each match, where ``timestamp`` is a
    ``datetime``, and ``counts`` is a dictionary of ``{match: count}``,
    counting the number of times each match was found during intervals of
    ``resolution`` seconds.

    filenames
        Iterable of paths of the text files to read.
    matches
        List of regular expression strings to search for in each line.
    dateformat
        Format passed to ``dates.date_chop`` for each line. If empty or
        ``'guess'``, the format is guessed from the first file read and
        then reused for every following file.
    resolution
        Interval size in seconds, passed to ``dates.date_chop``.
    show_progress
        If true, draw a progress bar on stdout for each file; otherwise
        just print the name of each file as it is read.

    Lines that appear before the first parseable timestamp in a file are
    counted under the placeholder timestamp ``datetime(1970, 1, 1)``.
    """
    # Counts of each match, used as a template for each row
    row_temp = [(match, 0) for match in matches]
    rows = {}

    # Compile regular expressions for matches
    # (Shaves off a little bit of execution time)
    compiled_matches = [re.compile(expr) for expr in matches]

    # Read each line of each file
    for filename in filenames:
        # Show progress bar?
        if show_progress:
            num_lines = line_count(filename)
            progress = ProgressBar(num_lines, prefix=filename, units='lines')
        # No progress bar, just print the filename being read
        else:
            print("Reading %s" % filename)

        # Guess date format?
        # NOTE: once guessed, ``dateformat`` is no longer 'guess', so the
        # format found for the first file is applied to all later files.
        if not dateformat or dateformat == 'guess':
            dateformat = dates.guess_file_date_format(filename)

        # HACK: Fake timestamp in case no real timestamps are ever found
        timestamp = datetime(1970, 1, 1)

        # Use a context manager so the file is closed even if parsing fails
        with open(filename, 'r') as infile:
            # Line numbers start at 1
            for line_num, line in enumerate(infile, 1):
                # Update progress bar every 1000 lines, and on the last line
                if show_progress:
                    if line_num % 1000 == 0 or line_num == num_lines:
                        progress.update(line_num)
                        sys.stdout.write('\r' + str(progress))
                        sys.stdout.flush()

                # Remove leading/trailing whitespace and newlines
                line = line.strip()
                # If line is empty, skip it
                if not line:
                    continue

                # See if this line has a timestamp
                try:
                    line_timestamp = dates.date_chop(
                        line, dateformat, resolution)
                # No timestamp found, stick with the current one
                except dates.CannotParse:
                    pass
                # New timestamp found, switch to it
                else:
                    timestamp = line_timestamp

                # If this datestamp hasn't appeared before, add it
                if timestamp not in rows:
                    rows[timestamp] = dict(row_temp)

                # Count the number of each match in this line
                for expr in compiled_matches:
                    if expr.search(line):
                        rows[timestamp][expr.pattern] += 1

        # If using progress bar, print a newline
        if show_progress:
            sys.stdout.write('\n')

    # Return a sorted list of (timestamp, {counts}) tuples.
    # ``items()`` works on both Python 2 and 3 (``iteritems()`` is 2-only);
    # timestamps are unique keys, so the dicts are never compared.
    return sorted(rows.items())