def execute(module_name):
    """
    Given a list of files with gdb logs, process all of them. Process the
    files containing the events. Return True if no error is detected.

    The events have the form:

    -BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gdb prueba
    r
    where
    q
    -END

    [('name', 'debugger'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'gdb'),
     ('invocation', command),
     ('session_cmds', session commands),
     ('session_end', datetime when session ended)]
    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    "%Y-%m-%d %H:%M:%S")
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split("/")[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, "r", "utf-8", errors="replace")
        line_number = 0
        session_cmds = []
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, "+",
                sys.stderr.flush()

            # Skip the empty lines
            if line == "\n":
                continue

            # If the user id appears in the command, anonymize it
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line
            fields = line[:-1].split()

            # Beginning of log. Catch command invocation and dates
            if re.match("^\-BEGIN .+$", line):
                fields = line.split()
                try:
                    dtime = datetime.datetime.strptime(" ".join(fields[1:3]),
                                                       "%Y-%m-%d %H:%M:%S")
                    session_end = \
                        datetime.datetime.strptime(" ".join(fields[3:5]),
                                                   "%Y-%m-%d %H:%M:%S")
                except ValueError, e:
                    print >> sys.stderr, "WARNING: In file", filename
                    print >> sys.stderr, "Ignoring:", line
                command = " ".join(fields[5:])
                session_cmds = []
                continue

            # If not the end of an event, accumulate the line and keep looping
            if not re.match("^\-END$", line):
                session_cmds.append(line[:-1])
                continue

            # At this point we have the complete information about the event
            if dtime <= last_event:
                # Event is older than what has been recorded in
                # detect_new_files. Skip
                continue

            # If out of the time window, ignore
            if dtime < from_date or dtime > until_date:
                continue

            # If there is a filter function and it returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime
            if dtime > new_last_event:
                new_last_event = dtime

            event = ("gdb", dtime, anon_user_id,
                     [("program", "gdb"),
                      ("command", command),
                      ("session_cmds", '"' + ";".join(session_cmds) + '"'),
                      ("session_end", session_end)])
            try:
                event_output.out(event)
            except Exception, e:
                print "Exception while processing", filename, ":", line_number
                print str(e)
                sys.exit(1)
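# Minimal sketch (doctest form, not part of the module) of how the -BEGIN
# header above is sliced; the sample line comes from the docstring:
#
#   >>> header = '-BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gdb prueba'
#   >>> fields = header.split()
#   >>> ' '.join(fields[1:3])
#   '2011-10-14 11:09:29'
#   >>> ' '.join(fields[3:5])
#   '2011-10-14 11:15:06'
#   >>> ' '.join(fields[5:])
#   '/usr/bin/gdb prueba'
#
# Every line between the header and -END is accumulated in session_cmds.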
def execute(module_name):
    """
    Given a list of files with Apache logs, process those that contain the
    mark word and produce events of the form:

    [('name', 'embedded_question_correct'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL),
     ('ip', IP),
     ('question_id', id)]

    and, with the same structure, 'embedded_question_incorrect',
    'embedded_question_blank' and 'embedded_question_show'.
    """

    global clf_re
    global filter_function
    global remap_pairs

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the remap_pairs evaluated from the options
    remap_pairs = eval('[' + rule_manager.get_property(None, module_name,
                                                       'remap_pairs') + ']')
    remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs]

    # Fetch the word that flags an embedded question, to use it later
    mark_word = rule_manager.get_property(None, module_name, 'mark_word')

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        data_in = codecs.open(filename, 'r', encoding='utf8',
                              errors='replace')
        old = ''
        counter = 0
        for line in data_in:
            counter += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            line = line[:-1]
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty string' + line)

            # Translate date time of the event
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6],
                                               '%d/%b/%Y:%H:%M:%S')

            if dtime <= last_event:
                # Event is older than what has been recorded in
                # detect_new_files. Skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # Split the url to match and see if it has the mark word
            (method, url, protocol) = fields[4].split()

            # Only 404s that have the mark word as substring are accepted
            if fields[5] != '404' or url.find(mark_word) == -1:
                continue

            # If there is a filter function and it returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime
            if dtime > new_last_event:
                new_last_event = dtime

            # At this point we have an event of an embedded question
            event_pairs = process_log_line(url, mark_word)
            for (event_suffix, question_id) in event_pairs:
                event = ('embedded_question_' + event_suffix, dtime,
                         anonymize.find_or_encode_string(fields[2]),
                         [('application', 'unknown'),
                          ('url', url),
                          ('ip', fields[0]),
                          ('question_id', question_id)])
                event_output.out(event)

        data_in.close()

        detect_new_files.update(None, module_name + '//' + filename,
                                [new_last_event])

    print >> sys.stderr
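# The global 'clf_re' is defined elsewhere in this module. A plausible
# definition (an assumption, not the verbatim pattern) compatible with the
# group indices used above (0 = IP, 2 = user, 3 = timestamp, 4 = request,
# 5 = status) would be the usual Common Log Format regular expression:
#
#   clf_re = re.compile(r'(\S+) (\S+) (\S+) \[([^\]]+)\] "([^"]*)" (\d{3}) (\S+)')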
def execute(module_name):
    """
    Given a list of files with firefox logs, process all of them. Process
    the files containing the events. Return True if no error is detected.

    [('name', 'visit_url'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'firefox'),
     ('invocation', URL)]
    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors='replace')
        line_number = 0
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # If the user id appears in the line, anonymize it
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the line into date, time and URL
            fields = line[:-1].split()

            # If the line does not have the expected fields, ignore it
            if len(fields) != 3:
                print >> sys.stderr, 'WARNING: In file', filename
                print >> sys.stderr, 'Ignoring:', line
                continue

            dtime = datetime.datetime.strptime(' '.join(fields[0:2]).strip(),
                                               '%Y-%m-%d %H:%M:%S')

            if dtime <= last_event:
                # Event is older than what has been recorded in
                # detect_new_files. Skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # If there is a filter function and it returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('visit_url', dtime, anon_user_id,
                     [('application', 'firefox'),
                      ('invocation', fields[2])])
            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)

        data_in.close()

        detect_new_files.update(None, module_name + '//' + filename,
                                [new_last_event])
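# Sketch of the expected input (illustrative line, not taken from a real
# log): each line holds exactly three whitespace-separated fields, a date,
# a time and the visited URL, e.g.
#
#   2011-10-14 11:09:29 http://www.example.org/index.html
#
# so ' '.join(fields[0:2]) is the timestamp and fields[2] the URL.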
def execute(module_name):
    """
    Given a list of files with svn apache logs, process all of them. The
    logs are produced with the following apache configuration commands:

    CustomLog [destination log file] "%t %u %{SVN-REPOS-NAME}e %{SVN-ACTION}e"
              env=SVN-ACTION

    Sample:

    [03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args

    For each line in the file, the following event structure is produced:

    [('name', 'svn_' + svn_action),  # svn action is update, diff, etc.
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('repository', repository name),
     ('directory', directory),       # Optional
     ('revision', r???),             # Optional
     ('comment', (max 256 chars))]   # Only if commit and repository given
    """

    global filter_function

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, "debug"))

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the flag to see if the commits need to be processed
    process_commits = \
        rule_manager.get_property(None, module_name, "process_commits") == ""

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    "%Y-%m-%d %H:%M:%S")
        new_last_event = last_event

        data_in = codecs.open(filename, "r", encoding="utf8",
                              errors="replace")
        old = ""
        counter = 0
        for line in data_in:
            # Advance counters and print progress character if needed
            counter += 1
            total_counter += 1
            if total_counter % mark_lines == 0:
                print >> sys.stderr, "+",
                sys.stderr.flush()

            # Chop line into fields. At least the timestamp (two fields),
            # user, repository and action are needed.
            line = line[:-1]
            fields = line.split()
            if len(fields) < 5:
                raise ValueError("Erroneous log line:" + line)

            # Get the event type to quickly detect if we need to skip it
            event_type = fields[4]
            if (not process_commits) and event_type == "commit":
                continue

            # Translate date time of the event and check if within the
            # processing interval
            dtime = datetime.datetime.strptime(fields[0][1:],
                                               "%d/%b/%Y:%H:%M:%S")
            if dtime <= last_event:
                # Event is older than what has been recorded in
                # detect_new_files. Skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # If there is a filter function and it returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime
            if dtime > new_last_event:
                new_last_event = dtime

            # Create the first three pairs of the event
            event = ("svn_" + event_type, dtime,
                     anonymize.find_or_encode_string(fields[2]),
                     [("repository", fields[3])])

            # Structure of the different events
            #
            # checkout-or-export /path r62 depth=infinity
            # commit harry r100
            # diff /path r15:20 depth=infinity ignore-ancestry
            # get-dir /trunk r17 text
            # get-file /path r20 props
            # get-file-revs /path r12:15 include-merged-revisions
            # get-mergeinfo (/path1 /path2)
            # lock /path steal
            # log (/path1,/path2) r20:90 discover-changed-paths revprops=()
            # replay /path r19
            # change-rev-prop r50 propertyname
            # rev-proplist r34
            # status /path r62 depth=infinity
            # switch /pathA /pathB@50 depth=infinity
            # unlock /path break
            # update /path r17 send-copyfrom-args
            if event_type == "checkout-or-export":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            elif event_type == "commit":
                event[3].append(("revision", fields[5]))
                # Fetch the log message if svn_client is not None
                if svn_client != None:
                    pass
            elif event_type == "diff":
                event[3].append(("location", fields[5] + " " + fields[6]))
            elif event_type == "get-dir" or event_type == "get-file" or \
                    event_type == "update":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            elif event_type == "get-file-revs":
                event[3].append(("revision", "r" + fields[6].split(":")[1]))
                event[3].append(("location", fields[5]))
            elif event_type == "lock" or event_type == "unlock":
                event[3].append(("location", fields[5]))
            elif event_type == "log":
                event[3].append(("location", fields[5]))

            event_output.out(event)

        data_in.close()

        detect_new_files.update(None, module_name + "//" + filename,
                                [new_last_event])

    print >> sys.stderr
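# Sketch: the sample line in the docstring splits into
#
#   fields = ['[03/Mar/2012:11:43:55', '+0100]', 'abel', 'asteams-en',
#             'update', '/Teams/Team_09', 'r3960', 'send-copyfrom-args']
#
# so fields[0][1:] is the timestamp with the opening bracket stripped,
# fields[2] the user, fields[3] the repository, fields[4] the action and
# fields[5:] the action-specific arguments listed in the comment block above.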
def execute(module_name):
    """
    Given a list of files with gcc logs, process all of them. Process the
    files containing the events. Return True if no error is detected.

    The events have the form:

    -BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gcc prueba
    r
    where
    q
    -END

    [('name', 'compiler'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'gcc'),
     ('invocation', command),
     ('messages', message extract)]
    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    message_lines = int(rule_manager.get_property(None, module_name,
                                                  'message_lines'))

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors='replace')
        line_number = 0
        messages = []
        begin_fields = []
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # Skip the empty lines
            if line == '\n':
                continue

            # If the user id appears in the command, anonymize it
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line
            fields = line[:-1].split()

            # Beginning of log. Catch command invocation and dates
            if re.match('^\-BEGIN .+$', line):
                begin_fields = line.split()
                try:
                    dtime = \
                        datetime.datetime.strptime(' '.join(begin_fields[4:6]),
                                                   '%Y-%m-%d %H:%M:%S')
                except ValueError, e:
                    print >> sys.stderr, 'WARNING: In file', filename
                    print >> sys.stderr, 'Ignoring:', line
                command = ' '.join(begin_fields[6:])
                messages = []
                continue

            # If not the end of an event, accumulate the line (up to
            # message_lines of them) and keep looping
            if not re.match('^\-END$', line):
                if len(messages) < message_lines:
                    messages.append(line[:-1])
                continue

            # At this point we have the complete information about the event
            if dtime <= last_event:
                # Event is older than what has been recorded in
                # detect_new_files. Skip
                continue

            # If out of the time window, ignore
            if dtime < from_date or dtime > until_date:
                continue

            # If there is a filter function and it returns None, skip this event
            if filter_function != None and \
                    filter_function(begin_fields) == None:
                continue

            # Record the event with the highest datetime
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('gcc', dtime, anon_user_id,
                     [('program', 'gcc'),
                      ('command', command),
                      ('messages', '"' + '|||'.join(messages) + '"')])
            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)
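# Minimal sketch of the message serialization above: with message_lines = 2
# and three diagnostics in the log body, only the first two are kept and
# joined with the '|||' separator (values are illustrative):
#
#   >>> messages = ['a.c:3: error: x undeclared', 'a.c:7: warning: unused y']
#   >>> '"' + '|||'.join(messages) + '"'
#   '"a.c:3: error: x undeclared|||a.c:7: warning: unused y"'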
def execute(module_name):
    """
    Given a list of files with bash logs, process all of them.

    [('name', 'bashcmd'),
     ('datetime', datetime),
     ('user', anonymize(user_id)),
     ('program', program),
     ('command', command)]
    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Commands that, even though they appear in the bash log, require
    # special processing and are therefore handled elsewhere
    skip_commands = set(['gcc', 'valgrind', 'gdb', 'kate', 'kdevelop',
                         '/usr/bin/gcc', '/usr/bin/valgrind',
                         '/usr/bin/gdb', '/usr/bin/kate',
                         '/usr/bin/kdevelop'])

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors='replace')
        line_number = 0
        # Default stamp in case the file does not start with a timestamp line
        stamp = datetime.datetime.min
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # Detect and skip empty lines, MS-DOS empty lines and
            # comment-only lines
            if re.match('^[ ]*\n$', line) or re.match('^\r\n$', line) or \
                    re.match('^#[ ]*\n$', line):
                continue

            # Detect a timestamp line of the form '#<seconds since epoch>'
            if re.match('^#[0-9]+', line):
                seconds = float(line.split('#')[1])
                stamp = datetime.datetime.fromtimestamp(seconds)
                continue

            if stamp <= last_event:
                # Event is older than what has been recorded in
                # detect_new_files. Skip
                continue

            if stamp < from_date or stamp > until_date:
                # Ignore event because it is outside the given window
                continue

            # If the user id appears in the command, anonymize it
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line to find out if it is one of the special
            # commands (gcc, valgrind, gdb, kate, kdevelop). If so, skip the
            # processing because it is done in another specific function.
            fields = line.split()

            # If something weird happened and there are no fields, ignore
            # the line
            if len(fields) == 0:
                continue

            # Skip the special commands
            if os.path.basename(fields[0]) in skip_commands:
                continue

            # If there is a filter function and it returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime
            if stamp > new_last_event:
                new_last_event = stamp

            event = ('bashcmd', stamp, anon_user_id,
                     [('program', fields[0]),
                      ('command', line[:-1])])
            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)

        data_in.close()

        detect_new_files.update(None, module_name + '//' + filename,
                                [new_last_event])
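# Sketch of the expected input: with bash's HISTTIMEFORMAT set, the history
# file interleaves '#<epoch seconds>' stamps with the commands they apply
# to (illustrative excerpt):
#
#   #1318582169
#   gcc -Wall main.c
#
# The stamp line is converted with datetime.datetime.fromtimestamp(), which
# yields local time, so the exact datetime depends on the machine's zone.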
def execute(module_name):
    """
    Given a list of files with kate logs, process all of them. Process the
    files containing the events. Return True if no error is detected.

    The events have the form:

    0 2011-10-05 18:30:02 2011-10-05 18:30:15 '/usr/bin/kate' 'numero.dat'

    Where the fields are:

    - status
    - start date/time
    - stop date/time
    - program name
    - file (optional)

    [('name', 'text_editor'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'kate'),
     ('invocation', command (program name + file))]
    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists) the date of the last event
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors='replace')
        line_number = 0
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # If the user id appears in the line, anonymize it
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the line into status, dates, program name and file
            fields = line[:-1].split()

            # If the line does not have the expected fields, ignore it
            if len(fields) < 6:
                print >> sys.stderr, 'WARNING: In file', filename
                print >> sys.stderr, 'Not enough fields:', line
                continue

            try:
                dtime = datetime.datetime.strptime(' '.join(fields[1:3]).strip(),
                                                   '%Y-%m-%d %H:%M:%S')
            except ValueError, e:
                print >> sys.stderr, 'WARNING: In file', filename
                print >> sys.stderr, 'Incorrect fields:', line
                continue

            if dtime <= last_event:
                # Event is older than what has been recorded in
                # detect_new_files. Skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # If there is a filter function and it returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Strip the quotes around the program name and the optional file
            cmd = fields[5][1:-1]
            if len(fields) == 7:
                cmd = cmd + ' ' + fields[6][1:-1]

            # Record the event with the highest datetime
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('text_editor', dtime, anon_user_id,
                     [('program', 'kate'),
                      ('command', cmd)])
            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)
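# Minimal sketch (doctest form) of how the sample line in the docstring is
# sliced; fields[5] and fields[6] carry surrounding quotes that [1:-1] strips:
#
#   >>> line = "0 2011-10-05 18:30:02 2011-10-05 18:30:15 '/usr/bin/kate' 'numero.dat'"
#   >>> fields = line.split()
#   >>> ' '.join(fields[1:3])
#   '2011-10-05 18:30:02'
#   >>> fields[5][1:-1] + ' ' + fields[6][1:-1]
#   '/usr/bin/kate numero.dat'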
    # If there is a filter function and it returns None, skip this event
    if filter_function != None and filter_function(fields) == None:
        return new_last_event

    # Record the event with the highest datetime
    if dtime > new_last_event:
        new_last_event = dtime

    # Create the event data structure
    event = (event_type, dtime, anonymize.find_or_encode_string(fields[3]),
             [('application', 'moodle'),
              ('community', fields[0]),
              ('ip', fields[2]),
              ('resource', fields[5])])

    try:
        event_output.out(event)
    except Exception, e:
        print 'Exception while processing', filename, ':', counter
        print str(e)
        sys.exit(1)

    return new_last_event

def main():
    """
    Script that, given a file with CSV Moodle events, processes them and
    prepares them to be dumped.

    script [options] logfile logfile ...

    Options:
def execute(module_name):
    """
    Process the files contained in the given repository.

    [('name', 'svn_commit'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('program', 'svn'),
     ('repository', repository name),
     ('comment', (max 256 chars))]
    """

    global svn_client
    global filter_function
    global svn_special_event_comment
    global svn_special_event_names

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    repository = rule_manager.get_property(None, module_name, 'repository')
    if repository == '':
        # No data available, no initialization done
        return

    repository_root = \
        svn_client.info2(repository,
                         depth=pysvn.depth.empty)[0][1]['repos_root_URL']

    repository_name = rule_manager.get_property(None, module_name,
                                                'repository_name')

    # Fetch all the files in the given repository
    dir_info = svn_client.list(repository, depth=pysvn.depth.immediates)

    # Select only those that are directories and match the given expression
    dir_info = [x[0]['repos_path'][1:] for x in dir_info
                if x[0]['kind'] == pysvn.node_kind.dir]
    source_dirs = fnmatch.filter(dir_info,
                                 rule_manager.get_property(None, module_name,
                                                           'files'))

    # Dump the dirs being processed
    if debug != 0:
        print >> sys.stderr, repository_root, ':', len(source_dirs),
        print >> sys.stderr, 'svn dirs being processed.'

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Set the date/times to ask for the logs
    if from_date != None:
        seconds = calendar.timegm(from_date.utctimetuple())
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.date, seconds)
    else:
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.head)
    if until_date != None:
        seconds = calendar.timegm(until_date.utctimetuple())
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.date, seconds)
    else:
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.number, 0)

    msg_size = int(rule_manager.get_property(None, module_name, 'msg_length'))

    # Loop over the directories and collect all the logs
    all_logs = []
    for directory_name in source_dirs:
        # Slurp all the logs in the server
        all_logs.extend(svn_client.log(os.path.join(repository_root,
                                                    directory_name),
                                       revision_start=revision_start,
                                       revision_end=revision_end))

    # Dump the number of logs being processed
    if debug != 0:
        print >> sys.stderr, len(all_logs), 'logs being processed.'

    # Loop over all the log elements
    total_counter = 0
    mark_lines = len(all_logs) / 40 + 1
    for log_data in all_logs:
        # Count the logs to print the mark string on the screen
        total_counter += 1
        if total_counter % mark_lines == 0:
            print >> sys.stderr, '+',
            sys.stderr.flush()

        # Fetch the three important fields: author, date/time and msg
        anon_user_id = anonymize.find_or_encode_string(log_data['author'])
        dtime = datetime.datetime.fromtimestamp(log_data['date'])
        msg = unicode(log_data['message'], 'utf-8')
        # The truncation is done after decoding so the string is cut at a
        # safe location (and not in the middle of a multi-byte utf-8
        # character)
        msg = msg[:msg_size]

        if dtime < from_date or dtime > until_date:
            # Ignore event because it is outside the given window
            continue

        # If there is a filter function and it returns None, skip this event
        if filter_function != None and \
                filter_function([log_data['author'], log_data['date'],
                                 log_data['message']]) == None:
            continue

        # Map well-known commit comments to special event names; fall back
        # to a plain 'svn_commit'
        try:
            special_idx = svn_special_event_comment.index(msg)
            event_name = svn_special_event_names[special_idx]
        except ValueError, e:
            event_name = 'svn_commit'

        event = (event_name, dtime, anon_user_id,
                 [('program', 'svn'),
                  ('repository', repository_name),
                  ('comment', msg)])
        try:
            event_output.out(event)
        except Exception, e:
            print 'Exception while processing', module_name
            print str(e)
            sys.exit(1)
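# The special-event lookup above assumes two parallel lists kept in sync;
# hypothetical values (not from the source) to show the mechanism:
#
#   svn_special_event_comment = ['Initial import', 'Automatic merge']
#   svn_special_event_names   = ['svn_import', 'svn_merge']
#
# A commit whose truncated message equals one of the known comments is
# renamed accordingly; anything else falls back to 'svn_commit'.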
def execute(module_name):
    """
    Given a list of files with Apache logs, process all of them.

    [('name', 'visit_url'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL),
     ('ip', IP)]
    """

    global clf_re
    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0],
                                                    '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        data_in = codecs.open(filename, 'r', encoding='utf8',
                              errors='replace')
        old = ''
        counter = 0
        for line in data_in:
            counter += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            line = line[:-1]
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty string' + line)

            # Translate date time of the event
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6],
                                               '%d/%b/%Y:%H:%M:%S')

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            (method, url, protocol) = fields[4].split()
            event = ('visit_url', dtime,
                     anonymize.find_or_encode_string(fields[2]),
                     [('application', 'unknown'),
                      ('url', url),
                      ('ip', fields[0])])
            event_output.out(event)

        data_in.close()

        detect_new_files.update(None, module_name + '//' + filename,
                                [new_last_event])

    print >> sys.stderr
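# Minimal sketch (doctest form) of the timestamp handling above: the [:-6]
# slice drops the ' +0100'-style timezone suffix before strptime:
#
#   >>> stamp = '03/Mar/2012:11:43:55 +0100'
#   >>> stamp.strip()[:-6]
#   '03/Mar/2012:11:43:55'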