Beispiel #1
# Process the firefox log files registered for module_name: every usable line
# becomes a 'visit_url' event and the newest timestamp per file is remembered.
#
# NOTE(review): this copy of the function is damaged. The docstring delimiters
# and several statements are missing (see the notes below), so it does not
# parse as written.
def execute(module_name):
    Given a list of files with firefox logs, process all of them. 

    Process the files containing the events. Return True if no error is

    [('name', 'visit_url'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'firefox'),
     ('invocation', URL)]


    global filter_function

    # Date window: events before from_date or after until_date are ignored
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    # NOTE(review): the right-hand side of this assignment is missing after
    # the line continuation.
    (files, total_lines, mark_lines) = \

    # Loop over all the files; total_counter counts lines across all of them
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # The user id is the second-to-last component of the '/'-separated
        # path, i.e. the name of the directory containing the file
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        # NOTE(review): the call that opens the file is truncated; only its
        # trailing arguments survive.
        data_in =, 'r', 'utf-8', errors = 'replace')

        line_number = 0
        for line in data_in:
            line_number += 1
            total_counter += 1
            # Print a progress mark on stderr every mark_lines lines
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',

            # See if the user id appears in the line, if so, anonymize it.
            # Note that re.sub treats user_id as a regular expression.
            # NOTE(review): the condition of this "if" is truncated.
            if, line):
                line = re.sub(user_id, anon_user_id, line)

            # Drop the last character of the line (presumably the newline)
            # and split on whitespace: fields[0:2] are parsed below as date
            # and time, fields[2] is reported as the invocation.
            fields = line[:-1].split()

            # Lines that do not have exactly three fields are reported on
            # stderr.
            # NOTE(review): no statement skipping such a line is visible, so
            # execution falls through to the parsing below.
            if len(fields) != 3:
                print >> sys.stderr, 'WARNING: In file', filename
                print >> sys.stderr, 'Ignoring:', line

            dtime = datetime.datetime.strptime(' '.join(fields[0:2]).strip(), 
                                               '%Y-%m-%d %H:%M:%S')
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                # NOTE(review): the statement that performs the skip is
                # missing in this copy.

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                # NOTE(review): the skipping statement is missing here too.
            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                # NOTE(review): the body of this "if" is missing.

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # Event tuple: (name, datetime, anonymized user, attribute pairs)
            event = ('visit_url', dtime, anon_user_id,
                     [('application', 'firefox'), ('invocation',  fields[2])])

            # NOTE(review): the "try:" block this handler belongs to is
            # missing in this copy.
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
        # NOTE(review): this call is cut off after its second argument.
        detect_new_files.update(None, module_name + '//' + filename, 
Beispiel #2
            if filter_function != None and filter_function(fields) == None:

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            event = (
                    ("program", "gdb"),
                    ("command", command),
                    ("session_cmds", '"' + ";".join(session_cmds) + '"'),
                    ("session_end", session_end),

            except Exception, e:
                print "Exception while processing", filename, ":", line_number
                print str(e)

        detect_new_files.update(None, module_name + "//" + filename, [new_last_event])

    print >> sys.stderr
Beispiel #3
# Scan the Apache log files registered for module_name and build
# 'embedded_question_*' events from the requests that were answered with 404
# and whose URL contains the configured mark word.
#
# NOTE(review): this copy of the function is damaged. The docstring delimiters
# and several statements are missing (see the notes below), so it does not
# parse as written.
def execute(module_name):
    Given a list of files with Apache logs, process all of them that contain the
    word mark word and produce the following events:

    [('name', 'embedded_question_correct'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]
    [('name', 'embedded_question_incorrect'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]
    [('name', 'embedded_question_blank'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]
    [('name', 'embedded_question_show'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]

    global clf_re
    global filter_function
    global remap_pairs

    # Date window: events before from_date or after until_date are ignored
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    # NOTE(review): the right-hand side of this assignment is missing after
    # the line continuation.
    (files, total_lines, mark_lines) = \

    # Get the remap_pairs evaluated from the options, then compile the first
    # item of every pair as a regular expression.
    # NOTE(review): eval() is applied to a configuration value; make sure the
    # property can only come from a trusted source. The end of the eval
    # expression is also missing in this copy.
    remap_pairs = eval('[' + rule_manager.get_property(None, module_name,
                                                       'remap_pairs') + \
    remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs]

    # Fetch the word to detect an embeddedq to use it later
    mark_word = rule_manager.get_property(None, module_name, 'mark_word')

    # Loop over all the files; total_counter counts lines across all of them
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # NOTE(review): the call that opens the file is truncated; only its
        # trailing arguments survive.
        data_in =, 'r', encoding = 'utf8', 
                              errors = 'replace')
        old = ''
        counter = 0
        for line in data_in:

            counter += 1
            total_counter += 1
            # Print a progress mark on stderr every mark_lines lines
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',

            # Drop the last character (presumably the newline) and split the
            # line with the clf_re regular expression.
            # NOTE(review): match() returns None for a line that does not
            # match, in which case .groups() raises AttributeError.
            line = line[:-1]
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty string' + line)

            # Translate date time of the event. The last six characters of the
            # field are dropped before parsing (presumably the time zone
            # offset -- confirm).
            # NOTE(review): the format argument and the closing parenthesis
            # are missing in this copy.
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6], 

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                # NOTE(review): the statement that performs the skip is
                # missing in this copy.

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                # NOTE(review): the skipping statement is missing here too.
            # Split the url to match and see if it has the mark word. The
            # unpacking requires the field to have exactly three
            # whitespace-separated parts.
            (method, url, protocol) = fields[4].split()

            # Only 404 that have the mark word substring are accepted
            # NOTE(review): the body of this "if" is missing.
            if fields[5] != '404' or url.find(mark_word) == -1:

            # If there is a filter function and returns None, skip this event
            # NOTE(review): the body of this "if" is missing.
            if filter_function != None and filter_function(fields) == None:

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # At this point we have an event of an embedded question.

            # One event is built per (suffix, question id) pair returned by
            # process_log_line.
            # NOTE(review): the description at the top names the last
            # attribute 'block_id' while the code uses 'question_id'; the
            # visible tuple also carries neither datetime nor user, and no
            # statement emitting the event is visible.
            event_pairs = process_log_line(url, mark_word)
            for (event_suffix, question_id) in event_pairs:
                event = ('embedded_question_' + event_suffix, 
                         [('application', 'unknown'), 
                          ('url', url),
                          ('ip', fields[0]),
                          ('question_id', question_id)])

        # NOTE(review): this call is cut off after its second argument.
        detect_new_files.update(None, module_name + '//' + filename, 

    # Terminate the line of progress marks on stderr
    print >> sys.stderr
Beispiel #4
# Process the svn Apache log files registered for module_name and build one
# 'svn_<action>' event per accepted line.
#
# NOTE(review): this copy of the function is damaged. The docstring delimiters
# and several statements are missing (see the notes below), so it does not
# parse as written.
def execute(module_name):
    Given a list of files with svn apache logs, process all of them. The logs are produced with the following apache configuration commands:

    CustomLog [destionation log file]  "%t %u %{SVN-REPOS-NAME}e %{SVN-ACTION}e" env=SVN-ACTION


    [03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args

    For each line in the file, the following event structure is produced

    [('name', 'svn_' + svn_action), # Svn action is update, diff, etc. 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('repository', repository name),
     ('directory', directory) (optional), 
     ('revision', r??? (optional)),
     ('comment', (max 256 chars)) # Only if commit and repository given]

    global filter_function

    # Get the level of debug (not used in the visible part of the body)
    debug = int(rule_manager.get_property(None, module_name, "debug"))

    # Date window: events before from_date or after until_date are ignored
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = rules_common.files_to_process(module_name)

    # Get the flag to see if the commits need to be processed
    # NOTE(review): as written the flag is True only when the property is the
    # empty string -- confirm that this is the intended polarity.
    process_commits = rule_manager.get_property(None, module_name, "process_commits") == ""

    # Loop over all the files; total_counter counts lines across all of them
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], "%Y-%m-%d %H:%M:%S")
        new_last_event = last_event

        # NOTE(review): the call that opens the file is truncated; only its
        # trailing arguments survive.
        data_in =, "r", encoding="utf8", errors="replace")
        old = ""
        counter = 0
        for line in data_in:

            # Advance counters and print progress character if needed
            counter += 1
            total_counter += 1
            if total_counter % mark_lines == 0:
                print >>sys.stderr, "+",

            # Chop line into fields. With the sample line shown above the
            # fields are: [0] date/time with a leading "[", [1] time zone
            # offset, [2] user, [3] repository, [4] action, [5...] arguments.
            line = line[:-1]
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Erroneous log line:" + line)

            # Get the event type to quickly detect if we need to skip it
            # NOTE(review): the check above only guarantees three fields, but
            # index 4 is read here. The body of the "if" is also missing.
            event_type = fields[4]
            if (not process_commits) and event_type == "commit":

            # Translate date time of the event and check if within process
            # interval. The leading "[" is dropped; the time zone offset
            # lives in fields[1] and is not taken into account.
            dtime = datetime.datetime.strptime(fields[0][1:], "%d/%b/%Y:%H:%M:%S")
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                # NOTE(review): the statement that performs the skip is
                # missing in this copy.
            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                # NOTE(review): the skipping statement is missing here too.

            # If there is a filter function and returns None, skip this event
            # NOTE(review): the body of this "if" is missing.
            if filter_function != None and filter_function(fields) == None:

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # Create the first three pairs of the event
            # NOTE(review): the tuple is truncated in this copy. The code
            # below appends to event[3], so the attribute list is expected to
            # be the fourth item of a four-item tuple.
            event = (
                "svn_" + event_type,
                [("repository", fields[3])],

            # Structure of the different events
            # checkout-or-export /path r62 depth=infinity
            # commit harry r100
            # diff /path r15:20 depth=infinity ignore-ancestry
            # get-dir /trunk r17 text
            # get-file /path r20 props
            # get-file-revs /path r12:15 include-merged-revisions
            # get-mergeinfo (/path1 /path2)
            # lock /path steal
            # log (/path1,/path2) r20:90 discover-changed-paths revprops=()
            # replay /path r19
            # change-rev-prop r50 propertyname
            # rev-proplist r34
            # status /path r62 depth=infinity
            # switch /pathA /pathB@50 depth=infinity
            # unlock /path break
            # update /path r17 send-copyfrom-args
            #
            # Event types not listed in the chain below get no extra
            # attributes.
            if event_type == "checkout-or-export":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            if event_type == "commit":
                event[3].append(("revision", fields[5]))
                # Fetch the log message if svn_client is not None
                # NOTE(review): the body of this "if" is missing.
                if svn_client != None:
            elif event_type == "diff":
                event[3].append(("location", fields[5] + " " + fields[6]))
            elif event_type == "get-dir" or event_type == "get-file" or event_type == "update":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            elif event_type == "get-file-revs":
                # Keep only the part after the colon of the revision range,
                # e.g. "r12:15" is recorded as "r15"
                event[3].append(("revision", "r" + fields[6].split(":")[1]))
                event[3].append(("location", fields[5]))
            elif event_type == "lock" or event_type == "unlock":
                event[3].append(("location", fields[5]))
            elif event_type == "log":
                event[3].append(("location", fields[5]))

            # NOTE(review): no statement emitting the event is visible here.

        # Store the newest timestamp seen for this file
        detect_new_files.update(None, module_name + "//" + filename, [new_last_event])

    # Terminate the line of progress marks on stderr
    print >>sys.stderr
Beispiel #5
# Process the bash history files registered for module_name and build one
# 'bashcmd' event per command line, using the preceding "#<number>" line as
# the timestamp.
#
# NOTE(review): this copy of the function is damaged. The docstring delimiters
# and several statements are missing (see the notes below), so it does not
# parse as written.
def execute(module_name):
    Given a list of files with bash logs, process all of them. 

    [('name', 'bashcmd'), 
     ('datetime', datetime),
     ('user', anonymize(user_id)),
     ('program', program),
     ('command', command)]

    global filter_function

    # Date window: events before from_date or after until_date are ignored
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    # NOTE(review): the right-hand side of this assignment is missing after
    # the line continuation.
    (files, total_lines, mark_lines) = \

    # Commands that, even though they appear as bash commands, require special
    # processing and are therefore handled somewhere else.
    # NOTE(review): the membership test below uses os.path.basename(), whose
    # result never contains a '/', so the '/usr/bin/...' entries cannot match.
    skip_commands = set(['gcc', 'valgrind', 'gdb', 'kate', 'kdevelop',
                         '/usr/bin/gcc', '/usr/bin/valgrind', '/usr/bin/gdb', 
                         '/usr/bin/kate', '/usr/bin/kdevelop'])

    # Loop over all the files; total_counter counts lines across all of them
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # The user id is the second-to-last component of the '/'-separated
        # path, i.e. the name of the directory containing the file
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        # NOTE(review): the call that opens the file is truncated; only its
        # trailing arguments survive.
        data_in =, 'r', 'utf-8', errors = 'replace')

        line_number = 0
        for line in data_in:
            line_number += 1
            total_counter += 1

            # Print a progress mark on stderr every mark_lines lines
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',

            # Detect and skip empty lines, MS-DOS empty lines, # only
            # NOTE(review): the body of this "if" is missing.
            if re.match('^[ ]*\n$', line) or re.match('^\r\n$', line) or \
                    re.match('^#[ ]*\n$', line):

            # Detect timestamp: a line starting with "#" followed by digits.
            # NOTE(review): fromtimestamp() interprets its argument as
            # seconds, despite the variable name. No statement moving on to
            # the next line is visible after the stamp is stored.
            if re.match('^#[0-9]+', line):
                milliseconds = float(line.split('#')[1])
                stamp = datetime.datetime.fromtimestamp(milliseconds)

            # NOTE(review): stamp is only assigned in the branch above, so it
            # is undefined if a command line comes before any timestamp line.
            if stamp <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                # NOTE(review): the statement that performs the skip is
                # missing in this copy.

            if stamp < from_date or stamp > until_date:
                # Ignore event because it is outside the given window
                # NOTE(review): the skipping statement is missing here too.
            # See if the user id appears in the command, if so, anonymize.
            # Note that re.sub treats user_id as a regular expression.
            # NOTE(review): the condition of this "if" is truncated.
            if, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line to find out if it is one of the special
            # commands: gcc, valgrind, gdb, kate, kdevelop. If so, skip the
            # processing because it is done in other specific function.
            fields = line.split()

            # If something weird happened and there are no fields, ignore
            # the line
            # NOTE(review): the body of this "if" is missing.
            if len(fields) == 0:

            # Process the command line

            # Skip certain commands
            # NOTE(review): the body of this "if" is missing.
            if os.path.basename(fields[0]) in skip_commands:
            # If there is a filter function and returns None, skip this event
            # NOTE(review): the body of this "if" is missing.
            if filter_function != None and filter_function(fields) == None:

            # Record the event with the highest datetime.
            if stamp > new_last_event:
                new_last_event = stamp

                # Event tuple: (name, datetime, anonymized user, attribute
                # pairs); the command is the line without its last character.
                # NOTE(review): the extra indentation and the "except" below
                # suggest that a "try:" line is missing before this
                # statement; as it stands it belongs to the "if" above.
                event = ('bashcmd', stamp, anon_user_id, 
                         [('program', fields[0]), ('command',  line[:-1])])

            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
        # NOTE(review): this call is cut off after its second argument.
        detect_new_files.update(None, module_name + '//' + filename, 
Beispiel #6
# Process one tab-separated file line by line and return the updated line
# counter.
#
# datetime_fmt is not described in the text below and is not used in the
# visible part of the body.
#
# NOTE(review): this copy of the function is damaged. The docstring delimiters
# and several statements are missing (see the notes below), so it does not
# parse as written.
def process_csv_file(module_name, filename, mark_lines, total_counter, 
                     last_event, from_date, until_date, datetime_fmt):
    Receives the following parameters:
    - module_name: to record the modification of the file.
    - filename: file to process in CSV format
    - mark_lines: the number of lines to process to print out a mark
    - total_counter: total number of lines to be processed
    - last_event: the last event processed by this function in this file
    - from_date - to_date: the date limits to process events

    Returns the total_counter updated with the processed lines.
    - Open the file
    - Loop over each line 
      - Mark a line if needed
      - Split the line into fields
      - Check if the date/time of the event is allowed
      - Store the new_last_event
      - Dump the event 
    - Close the file
    - Update the info in the detect_new_files


    global filter_function

    new_last_event = last_event

    # NOTE(review): the call that opens the file is truncated; only its
    # trailing arguments survive.
    data_in =, 'r', encoding = 'utf8', errors = 'replace')
    # old accumulates the text of lines that end in a carriage return
    old = ''
    counter = 0
    for line in data_in:
        counter += 1
        total_counter += 1
        # Print a progress mark on stderr every mark_lines lines
        if total_counter % mark_lines == 0:
            print >> sys.stderr, '+',

        # Drop the last character of the line (presumably the newline)
        line = line[:-1]
        # Detect, accumulate \x0D to be removed
        # NOTE(review): line[-1] raises IndexError when the chopped line is
        # empty. No statement moving on to the next line is visible after the
        # accumulation.
        if line[-1] == '\x0D':
            old = old + line[:-1]

        # If there is something in old, dump it
        # NOTE(review): only the reset of old is visible; whatever used the
        # accumulated text is missing in this copy.
        if old != '':
            old = ''

        # Check the number of fields and skip lines without 6 fields
        # NOTE(review): the body of this "if" is missing.
        fields = line.split('\t')
        if len(fields) != 6:

        # Dump event and remember the last one
        # NOTE(review): this call is cut off after its first argument.
        new_last_event = check_data_and_dump_event(fields,

    # NOTE(review): this call is cut off after its second argument.
    detect_new_files.update(None, module_name + '//' + filename, 

    return total_counter
Beispiel #7
# Process the Apache log files registered for module_name and build one
# 'visit_url' event per accepted line.
#
# NOTE(review): this copy of the function is damaged. The docstring delimiters
# and several statements are missing (see the notes below), so it does not
# parse as written.
def execute(module_name):
    Given a list of files with Apache logs, process all of them. 

    [('name', 'visit_url'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP)]

    global clf_re
    global filter_function

    # Date window: events before from_date or after until_date are ignored
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    # NOTE(review): the right-hand side of this assignment is missing after
    # the line continuation.
    (files, total_lines, mark_lines) = \

    # Loop over all the files; total_counter counts lines across all of them
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # NOTE(review): the call that opens the file is truncated; only its
        # trailing arguments survive.
        data_in =, 'r', encoding = 'utf8', 
                              errors = 'replace')
        old = ''
        counter = 0
        for line in data_in:

            counter += 1
            total_counter += 1
            # Print a progress mark on stderr every mark_lines lines
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',

            # Drop the last character (presumably the newline) and split the
            # line with the clf_re regular expression.
            # NOTE(review): match() returns None for a line that does not
            # match, in which case .groups() raises AttributeError.
            line = line[:-1]
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty string' + line)

            # Translate date time of the event. The last six characters of the
            # field are dropped before parsing (presumably the time zone
            # offset -- confirm).
            # NOTE(review): the format argument and the closing parenthesis
            # are missing in this copy.
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6], 

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                # NOTE(review): the statement that performs the skip is
                # missing in this copy.

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                # NOTE(review): the skipping statement is missing here too.
            # If there is a filter function and returns None, skip this event
            # NOTE(review): the body of this "if" is missing.
            if filter_function != None and filter_function(fields) == None:

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # The request field must consist of exactly three
            # whitespace-separated parts, otherwise the unpacking raises
            # ValueError.
            (method, url, protocol) = fields[4].split()

            # NOTE(review): unlike the description at the top, the visible
            # tuple carries no user, and no statement emitting the event is
            # visible.
            event = ('visit_url', dtime,
                     [('application', 'unknown'), 
                      ('url', url),
                      ('ip', fields[0])])

        # NOTE(review): this call is cut off after its second argument.
        detect_new_files.update(None, module_name + '//' + filename, 

    # Terminate the line of progress marks on stderr
    print >> sys.stderr