Example #1
def execute(module_name):
    """
    Given a list of files with gdb logs, process all of them. 

    Process the files containing the events. Return True if no error is
    detected. The events in the log have the form:

-BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gdb prueba
r
where
q
-END

    [('name', 'debugger'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'gdb'),
     ('invocation', command),
     ('session_cmds', session commands),
     ('session_end', datetime when session ended)]

    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], "%Y-%m-%d %H:%M:%S")
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split("/")[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, "r", "utf-8", errors="replace")

        line_number = 0
        session_cmds = []
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, "+",
                sys.stderr.flush()

            # Skip the empty lines
            if line == "\n":
                continue

            # See if the user id appears in the command, if so, anonymize
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line
            fields = line[:-1].split()

            # Beginning of log. Catch command invocation and dates
            if re.match("^\-BEGIN .+$", line):
                fields = line.split()
                try:
                    dtime = datetime.datetime.strptime(" ".join(fields[1:3]), "%Y-%m-%d %H:%M:%S")
                    session_end = datetime.datetime.strptime(" ".join(fields[3:5]), "%Y-%m-%d %H:%M:%S")

                except ValueError, e:
                    print >> sys.stderr, "WARNING: In file", filename
                    print >> sys.stderr, "Ignoring:", line

                command = " ".join(fields[5:])
                session_cmds = []
                continue

            # If not the end of an event, concatenate the line and keep looping
            if not re.match("^\-END$", line):
                session_cmds.append(line[:-1])
                continue

            # At this point we have the complete information about the event

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            # If out of time window, ignore
            if dtime < from_date or dtime > until_date:
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(begin_fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            event = (
                "gdb",
                dtime,
                anon_user_id,
                [
                    ("program", "gdb"),
                    ("command", command),
                    ("session_cmds", '"' + ";".join(session_cmds) + '"'),
                    ("session_end", session_end),
                ],
            )

            try:
                event_output.out(event)
            except Exception, e:
                print "Exception while processing", filename, ":", line_number
                print str(e)
                sys.exit(1)

        data_in.close()
        detect_new_files.update(None, module_name + "//" + filename,
                                [new_last_event])
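# A minimal, self-contained sketch (not part of the module) of the
# -BEGIN/-END session format parsed above. The sample log is the one shown
# in the docstring; parse_sessions is an illustrative name.
import datetime

def parse_sessions(lines):
    # Yield (start, end, command, session_cmds) per -BEGIN/-END block
    start = end = command = None
    session_cmds = []
    for line in lines:
        if line.startswith("-BEGIN "):
            fields = line.split()
            start = datetime.datetime.strptime(" ".join(fields[1:3]),
                                               "%Y-%m-%d %H:%M:%S")
            end = datetime.datetime.strptime(" ".join(fields[3:5]),
                                             "%Y-%m-%d %H:%M:%S")
            command = " ".join(fields[5:])
            session_cmds = []
        elif line == "-END":
            yield (start, end, command, session_cmds)
        elif line != "":
            session_cmds.append(line)

sample = """-BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gdb prueba
r
where
q
-END""".split("\n")
for session in parse_sessions(sample):
    print session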
Example #2
def execute(module_name):
    """
    Given a list of files with Apache logs, process those lines containing
    the configured mark word and produce the following events:

    [('name', 'embedded_question_correct'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('question_id', id)]
            
    [('name', 'embedded_question_incorrect'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('question_id', id)]
            
    [('name', 'embedded_question_blank'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('question_id', id)]
            
    [('name', 'embedded_question_show'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('question_id', id)]
            
    """

    global clf_re
    global filter_function
    global remap_pairs

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the remap_pairs evaluated from the options
    remap_pairs = eval('[' + rule_manager.get_property(None, module_name,
                                                       'remap_pairs') + \
                           ']')
    remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs]

    # Fetch the word to detect an embeddedq to use it later
    mark_word = rule_manager.get_property(None, module_name, 'mark_word')

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        data_in = codecs.open(filename, 'r', encoding = 'utf8', 
                              errors = 'replace')
        old = ''
        counter = 0
        for line in data_in:

            counter += 1
            total_counter += 1
            
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            line = line[:-1]
            
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty user field in log line: ' + line)

            # Translate date time of the event
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6], 
                                               '%d/%b/%Y:%H:%M:%S')

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue
            
            # Split the url to match and see if it has the mark word
            (method, url, protocol) = fields[4].split()

            # Only 404 that have the mark word substring are accepted
            if fields[5] != '404' or url.find(mark_word) == -1:
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # At this point we have an event of an embedded question.

            event_pairs = process_log_line(url, mark_word)
            
            for (event_suffix, question_id) in event_pairs:
                event = ('embedded_question_' + event_suffix, 
                         dtime,
                         anonymize.find_or_encode_string(fields[2]),
                         [('application', 'unknown'), 
                          ('url', url),
                          ('ip', fields[0]),
                          ('question_id', question_id)])
                
                event_output.out(event)

        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename, 
                                [new_last_event])

    print >> sys.stderr
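# Hypothetical stand-in for the global clf_re used above: an Apache common
# log format pattern whose groups line up with the indices the code reads
# (0: ip, 2: user, 3: timestamp, 4: request, 5: status). The real pattern
# is defined elsewhere in the module; this one is only a sketch.
import re
import datetime

clf_re = re.compile(r'(\S+) (\S+) (\S+) \[([^\]]+)\] "([^"]*)" (\d{3}) (\S+)')

line = '10.0.0.1 - jdoe [03/Mar/2012:11:43:55 +0100] ' \
       '"GET /page?embeddedq-1 HTTP/1.1" 404 512'
fields = clf_re.match(line).groups()
dtime = datetime.datetime.strptime(fields[3].strip()[:-6],
                                   '%d/%b/%Y:%H:%M:%S')
(method, url, protocol) = fields[4].split()
print dtime, fields[2], url, fields[5]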
Example #3
def execute(module_name):
    """
    Given a list of files with firefox logs, process all of them. 

    Process the files containing the events. Return True if no error is
    detected.

    [('name', 'visit_url'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'firefox'),
     ('invocation', URL)]

    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors = 'replace')

        line_number = 0
        for line in data_in:
            line_number += 1
            total_counter += 1
            
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # See if the user id appears in the command, if so, anonymize
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the line into its three fields: date, time and URL
            fields = line[:-1].split()

            # If the line does not have exactly three fields, warn and
            # ignore it
            if len(fields) != 3:
                print >> sys.stderr, 'WARNING: In file', filename
                print >> sys.stderr, 'Ignoring:', line
                continue

            dtime = datetime.datetime.strptime(' '.join(fields[0:2]).strip(), 
                                               '%Y-%m-%d %H:%M:%S')
            
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue
            
            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('visit_url', dtime, anon_user_id,
                     [('application', 'firefox'), ('invocation',  fields[2])])

            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)
            
        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename, 
                                [new_last_event])
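# Illustrative only: one firefox log line in the whitespace-separated
# "date time URL" layout the loop above expects (the anonymous user id is
# a placeholder).
import datetime

line = '2011-10-14 11:09:29 http://www.example.org/index.html\n'
fields = line[:-1].split()
assert len(fields) == 3
dtime = datetime.datetime.strptime(' '.join(fields[0:2]).strip(),
                                   '%Y-%m-%d %H:%M:%S')
event = ('visit_url', dtime, 'user_0001',
         [('application', 'firefox'), ('invocation', fields[2])])
print event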
Example #4
def execute(module_name):
    """
    Given a list of files with svn apache logs, process all of them. The
    logs are produced with the following apache configuration command:

    CustomLog [destination log file]  "%t %u %{SVN-REPOS-NAME}e %{SVN-ACTION}e" env=SVN-ACTION

Sample:

    [03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args

    For each line in the file, the following event structure is produced

    [('name', 'svn_' + svn_action), # Svn action is update, diff, etc. 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('repository', repository name),
     ('directory', directory) (optional), 
     ('revision', r??? (optional)),
     ('comment', (max 256 chars)) # Only if commit and repository given]
    """

    global filter_function

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, "debug"))

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = rules_common.files_to_process(module_name)

    # Get the flag to see if the commits need to be processed
    process_commits = rule_manager.get_property(None, module_name, "process_commits") == ""

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], "%Y-%m-%d %H:%M:%S")
        new_last_event = last_event

        data_in = codecs.open(filename, "r", encoding="utf8", errors="replace")
        old = ""
        counter = 0
        for line in data_in:

            # Advance counters and print progress character if needed
            counter += 1
            total_counter += 1
            if total_counter % mark_lines == 0:
                print >>sys.stderr, "+",
                sys.stderr.flush()

            # Chop line into fields
            line = line[:-1]
            fields = line.split()
            if len(fields) < 5:
                raise ValueError("Erroneous log line: " + line)

            # Get the event type to quickly detect if we need to skip it
            event_type = fields[4]
            if (not process_commits) and event_type == "commit":
                continue

            # Translate date time of the event and check if within process
            # interval
            dtime = datetime.datetime.strptime(fields[0][1:], "%d/%b/%Y:%H:%M:%S")
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue
            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # Create the first three pairs of the event
            event = (
                "svn_" + event_type,
                dtime,
                anonymize.find_or_encode_string(fields[2]),
                [("repository", fields[3])],
            )

            # Structure of the different events
            #
            # checkout-or-export /path r62 depth=infinity
            # commit harry r100
            # diff /path r15:20 depth=infinity ignore-ancestry
            # get-dir /trunk r17 text
            # get-file /path r20 props
            # get-file-revs /path r12:15 include-merged-revisions
            # get-mergeinfo (/path1 /path2)
            # lock /path steal
            # log (/path1,/path2) r20:90 discover-changed-paths revprops=()
            # replay /path r19
            # change-rev-prop r50 propertyname
            # rev-proplist r34
            # status /path r62 depth=infinity
            # switch /pathA /pathB@50 depth=infinity
            # unlock /path break
            # update /path r17 send-copyfrom-args
            if event_type == "checkout-or-export":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            if event_type == "commit":
                event[3].append(("revision", fields[5]))
                # Fetch the log message if svn_client is not None
                if svn_client != None:
                    pass
            elif event_type == "diff":
                event[3].append(("location", fields[5] + " " + fields[6]))
            elif event_type == "get-dir" or event_type == "get-file" or event_type == "update":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            elif event_type == "get-file-revs":
                event[3].append(("revision", "r" + fields[6].split(":")[1]))
                event[3].append(("location", fields[5]))
            elif event_type == "lock" or event_type == "unlock":
                event[3].append(("location", fields[5]))
            elif event_type == "log":
                event[3].append(("location", fields[5]))

            event_output.out(event)

        data_in.close()
        detect_new_files.update(None, module_name + "//" + filename, [new_last_event])

    print >>sys.stderr
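# A sketch (not the module's code) of the same per-action dispatch
# expressed as a table: svn action -> index of each optional field in the
# split log line. Simplified: the code above also concatenates two fields
# for "diff" and rewrites the revision range for "get-file-revs".
ACTION_FIELDS = {
    'checkout-or-export': {'location': 5, 'revision': 6},
    'commit': {'revision': 5},
    'diff': {'location': 5},
    'get-dir': {'location': 5, 'revision': 6},
    'get-file': {'location': 5, 'revision': 6},
    'update': {'location': 5, 'revision': 6},
    'lock': {'location': 5},
    'unlock': {'location': 5},
    'log': {'location': 5},
}

def extra_pairs(event_type, fields):
    # Return the optional (name, value) pairs for one svn log line
    layout = ACTION_FIELDS.get(event_type, {})
    return [(name, fields[idx]) for (name, idx) in sorted(layout.items())]

line = '[03/Mar/2012:11:43:55 +0100] abel asteams-en update ' \
       '/Teams/Team_09 r3960 send-copyfrom-args'
print extra_pairs('update', line.split())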
Example #5
def execute(module_name):
    """
    Given a list of files with gcc logs, process all of them. 

    Process the files containing the events. Return True if no error is
    detected. The events in the log have the form:

-BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gcc prueba
r
where
q
-END

    [('name', 'compiler'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'gcc'),
     ('invocation', command),
     ('messages', message extract)]

    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    message_lines = int(rule_manager.get_property(None, module_name, 
                                                  'message_lines'))

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors = 'replace')

        line_number = 0
        messages = []
        begin_fields = []
        for line in data_in:
            line_number += 1
            total_counter += 1
            
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # Skip the empty lines
            if line == '\n':
                continue

            # See if the user id appears in the command, if so, anonymize
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line
            fields = line[:-1].split()

            # Beginning of log. Catch command invocation and dates
            if re.match('^\-BEGIN .+$', line):
                begin_fields = line.split()
                try:
                    dtime = \
                        datetime.datetime.strptime(' '.join(begin_fields[4:6]),
                                                   '%Y-%m-%d %H:%M:%S')
                    
                except ValueError, e:
                    print >> sys.stderr, 'WARNING: In file', filename
                    print >> sys.stderr, 'Ignoring:', line
                    continue

                command = ' '.join(begin_fields[6:])
                messages = []
                continue

            # If not the end of an event, concatenate the line and keep looping
            if not re.match('^\-END$', line):
                if len(messages) < message_lines:
                    messages.append(line[:-1])
                continue
                
            # At this point we have the complete information about the event
            
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            # If out of time window, ignore
            if dtime < from_date or dtime > until_date:
                continue
            
            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(begin_fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('gcc', dtime, anon_user_id,
                     [('program', 'gcc'),
                      ('command',  command),
                      ('messages',  '"' + '|||'.join(messages) + '"')])

            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)

        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename, 
                                [new_last_event])
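# The last_event / new_last_event bookkeeping repeated in every rule is an
# incremental checkpoint: a file may be reprocessed in full, but only
# events newer than the newest one seen on the previous run are emitted.
# A minimal sketch of the pattern (names illustrative, not the module's API):
def incremental(events, last_event):
    # Return (fresh_events, new_checkpoint) from (dtime, payload) pairs
    new_last_event = last_event
    fresh = []
    for (dtime, payload) in events:
        if dtime <= last_event:
            continue                  # emitted on a previous run: skip
        fresh.append((dtime, payload))
        if dtime > new_last_event:
            new_last_event = dtime    # newest event seen so far
    return (fresh, new_last_event)

# detect_new_files.update(...) then persists new_last_event so that the
# next run resumes from this checkpoint.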
Example #6
def execute(module_name):
    """
    Given a list of files with bash logs, process all of them. 

    [('name', 'bashcmd'), 
     ('datetime', datetime),
     ('user', anonymize(user_id)),
     ('program', program),
     ('command', command)]
    
    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Commands that, even though they appear in the bash history, require
    # special processing and are therefore handled elsewhere.
    skip_commands = set(['gcc', 'valgrind', 'gdb', 'kate', 'kdevelop',
                         '/usr/bin/gcc', '/usr/bin/valgrind', '/usr/bin/gdb', 
                         '/usr/bin/kate', '/usr/bin/kdevelop'])

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors = 'replace')

        line_number = 0
        # Guard: history files may not start with a timestamp line
        stamp = datetime.datetime.min
        for line in data_in:
            line_number += 1
            total_counter += 1

            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            
            # Detect and skip empty lines, MS-DOS empty lines, # only
            if re.match('^[ ]*\n$', line) or re.match('^\r\n$', line) or \
                    re.match('^#[ ]*\n$', line):
                continue

            # Detect a timestamp line of the form "#<seconds since epoch>"
            if re.match('^#[0-9]+', line):
                seconds = float(line.split('#')[1])
                stamp = datetime.datetime.fromtimestamp(seconds)
                continue

            if stamp <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if stamp < from_date or stamp > until_date:
                # Ignore event because it is outside the given window
                continue
            
            # See if the user id appears in the command, if so, anonymize
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line to find out if it is one of the special
            # commands: gcc, valgrind, gdb, kate, kdevelop. If so, skip the
            # processing because it is done in other specific function.
            fields = line.split()

            # If something weird happened and there are no fields, ignore
            # the line
            if len(fields) == 0:
                continue

            # Process the command line

            # Skip certain commands
            if os.path.basename(fields[0]) in skip_commands:
                continue
            
            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if stamp > new_last_event:
                new_last_event = stamp

            event = ('bashcmd', stamp, anon_user_id, 
                     [('program', fields[0]), ('command',  line[:-1])])

            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)
                
        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename, 
                                [new_last_event])
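# Illustrative fragment of a bash history file written with HISTTIMEFORMAT
# enabled: a "#<seconds since epoch>" line precedes each command, which is
# exactly what the timestamp branch above detects.
import datetime
import os

history = """#1318583369
gcc -o prueba prueba.c
#1318583375
ls -l
""".split('\n')

skip_commands = set(['gcc', '/usr/bin/gcc'])
for line in history:
    if line.startswith('#') and line[1:].isdigit():
        stamp = datetime.datetime.fromtimestamp(float(line[1:]))
        continue
    fields = line.split()
    if len(fields) == 0 or os.path.basename(fields[0]) in skip_commands:
        continue
    print stamp, fields[0], line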
Example #7
def execute(module_name):
    """
    Given a list of files with kate logs, process all of them. 

    Process the files containing the events. Return True if no error is
    detected. The events in the log have the form:

0 2011-10-05 18:30:02 2011-10-05 18:30:15 '/usr/bin/kate' 'numero.dat'

    Where the fields are:
      - status
      - start date/time
      - stop date/time
      - program name
      - file (optional)

    [('name', 'text_editor'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'kate'),
     ('invocation', command (program name + file))]

    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors = 'replace')

        line_number = 0
        for line in data_in:
            line_number += 1
            total_counter += 1
            
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # See if the user id appears in the command, if so, anonymize
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the line into fields: status, start date/time, stop
            # date/time, program name and optional file
            fields = line[:-1].split()

            # If the line does not have enough fields, warn and ignore it
            if len(fields) < 6:
                print >> sys.stderr, 'WARNING: In file', filename
                print >> sys.stderr, 'Not enough fields:', line
                continue

            try:
                dtime = datetime.datetime.strptime(' '.join(fields[1:3]).strip(), 
                                                   '%Y-%m-%d %H:%M:%S')
            except ValueError, e:
                print >> sys.stderr, 'WARNING: In file', filename
                print >> sys.stderr, 'Incorrect fields:', line
                continue
            
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue
            
            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            cmd = fields[5][1:-1]
            if len(fields) == 7:
                cmd = cmd + ' ' + fields[6][1:-1]

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('text_editor', dtime, anon_user_id,
                     [('program', 'kate'), ('command',  cmd)])

            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)

        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename, 
                                [new_last_event])
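# Illustrative only: one kate log line in the format documented above,
# showing how the quoted program name and optional file are recombined.
import datetime

line = "0 2011-10-05 18:30:02 2011-10-05 18:30:15 '/usr/bin/kate' 'numero.dat'\n"
fields = line[:-1].split()
dtime = datetime.datetime.strptime(' '.join(fields[1:3]), '%Y-%m-%d %H:%M:%S')
cmd = fields[5][1:-1]                 # strip the surrounding single quotes
if len(fields) == 7:
    cmd = cmd + ' ' + fields[6][1:-1]
print dtime, cmd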
Example #8
    # If there is a filter function and returns None, skip this event
    if filter_function != None and filter_function(fields) == None:
        return new_last_event
    
    # Record the event with the highest datetime.
    if dtime > new_last_event:
        new_last_event = dtime

    # Create the event data structure
    event = (event_type, dtime, 
             anonymize.find_or_encode_string(fields[3]),
             [('application', 'moodle'), ('community', fields[0]), 
              ('ip', fields[2]), ('resource', fields[5])])
    
    try:
        event_output.out(event)
    except Exception, e:
        print 'Exception while processing', filename, ':', counter
        print str(e)
        sys.exit(1)

    return new_last_event
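# Hypothetical sketch of one Moodle CSV row and its mapping to the event
# tuple built above. Only the indices actually used in the fragment come
# from the source (0: community, 2: ip, 3: user, 5: resource); the
# date/time position, the event name and the user encoding are assumed.
import datetime

fields = ['course101', '2012-03-03 11:43:55', '10.0.0.1',
          'jdoe', 'resource view', 'page 42']
dtime = datetime.datetime.strptime(fields[1], '%Y-%m-%d %H:%M:%S')
event = ('moodle_resource_view', dtime, 'anon_' + fields[3],
         [('application', 'moodle'), ('community', fields[0]),
          ('ip', fields[2]), ('resource', fields[5])])
print event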

def main():
    """
    Script that given a file with CSV Moodle events, it process them and
    prepares to be dumped.

    script [options] logfile logfile ...

    Options:
Example #9
def execute(module_name):
    """
    Process the files contained in the given repository.

    [('name', 'svn_commit'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('program', 'svn'),
     ('repository', repository name),
     ('comment', (max 256 chars))]
    """

    global svn_client
    global filter_function
    global svn_special_event_comment
    global svn_special_event_names

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    repository = rule_manager.get_property(None, module_name, 'repository')
    if repository == '':
        # No data available, no initialization done
        return

    repository_root = \
        svn_client.info2(repository, 
                         depth = pysvn.depth.empty)[0][1]['repos_root_URL']

    repository_name = rule_manager.get_property(None, module_name, 
                                                'repository_name')

    # Fetch all the files in the given repository
    dir_info = svn_client.list(repository, depth = pysvn.depth.immediates)

    # Select only those that are directories and match the given expression
    dir_info = [x[0]['repos_path'][1:] for x in dir_info \
                    if x[0]['kind'] == pysvn.node_kind.dir]
    source_dirs = fnmatch.filter(dir_info, 
                                 rule_manager.get_property(None, module_name, 
                                                           'files'))

    # Dump the dirs being processed
    if debug != 0:
        print >> sys.stderr, repository_root, ':', len(source_dirs), 
        print >> sys.stderr, 'svndirs being processed.'

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Set the date/times to ask for the logs
    if from_date != None:
        seconds = calendar.timegm(from_date.utctimetuple())
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.date,
                                        seconds)
    else:
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.head)

    if until_date != None:
        seconds = calendar.timegm(until_date.utctimetuple())
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.date,
                                      seconds)
    else:
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.number, 0)

    msg_size = int(rule_manager.get_property(None, module_name, 'msg_length'))
    # Loop over the directories and collect all the logs
    all_logs = []
    for directory_name in source_dirs:

        # Slurp all the logs from the server
        all_logs.extend(svn_client.log(os.path.join(repository_root, 
                                                    directory_name),
                                       revision_start = revision_start,
                                       revision_end = revision_end))
        
    # Dump the number of logs being processed
    if debug != 0:
        print >> sys.stderr, len(all_logs), 'logs being processed.'

    # Loop over all the log elements
    total_counter = 0
    mark_lines = len(all_logs) / 40 + 1
    for log_data in all_logs:

        # Count the logs to print the mark string on the screen
        total_counter += 1
        if total_counter % mark_lines == 0:
            print >> sys.stderr, '+',
            sys.stderr.flush()

        # Fetch the three important fields, author, date/time and msg
        anon_user_id = anonymize.find_or_encode_string(log_data['author'])
        dtime = datetime.datetime.fromtimestamp(log_data['date'])

        # Truncate the log message to msg_size characters. The truncation
        # is done after decoding so that the string is cut at a character
        # boundary (and not in the middle of a multi-byte utf-8 sequence).
        msg = unicode(log_data['message'], 'utf-8')
        msg = msg[:msg_size]

        if dtime < from_date or dtime > until_date:
            # Ignore event because it is outside the given window
            continue
        
        # If there is a filter function and returns None, skip this event
        if filter_function != None and \
                filter_function([log_data['author'],
                                 log_data['date'],
                                 log_data['message']]) == None:
            continue

        try:
            special_idx = svn_special_event_comment.index(msg)
            event_name = svn_special_event_names[special_idx]
        except ValueError, e:
            event_name = 'svn_commit'

        event = (event_name, dtime, anon_user_id,
                 [('program', 'svn'), 
                  ('repository', repository_name), 
                  ('comment', msg)])

        try:
            event_output.out(event)
        except Exception, e:
            print 'Exception while processing', module_name
            print str(e)
            sys.exit(1)
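# A minimal sketch of how a commit message is matched against the special
# event comments, falling back to 'svn_commit'. The two lists are globals
# configured elsewhere in the module; the values here are illustrative.
svn_special_event_comment = ['Initial import', 'Create branch']
svn_special_event_names = ['svn_import', 'svn_branch']

def classify(msg):
    try:
        return svn_special_event_names[svn_special_event_comment.index(msg)]
    except ValueError:
        return 'svn_commit'

print classify('Initial import')     # -> svn_import
print classify('Fix off-by-one')     # -> svn_commit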
Example #10
def execute(module_name):
    """
    Given a list of files with Apache logs, process all of them. 

    [('name', 'visit_url'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP)]
            
    """

    global clf_re
    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        data_in = codecs.open(filename, 'r', encoding = 'utf8', 
                              errors = 'replace')
        old = ''
        counter = 0
        for line in data_in:

            counter += 1
            total_counter += 1
            
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            line = line[:-1]
            
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty user field in log line: ' + line)

            # Translate date time of the event
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6], 
                                               '%d/%b/%Y:%H:%M:%S')

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue
            
            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            (method, url, protocol) = fields[4].split()

            event = ('visit_url', dtime,
                     anonymize.find_or_encode_string(fields[2]),
                     [('application', 'unknown'), 
                      ('url', url),
                      ('ip', fields[0])])
            
            event_output.out(event)

        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename, 
                                [new_last_event])

    print >> sys.stderr
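# Hedged sketch of the rules_common helper contract every rule above
# relies on. The real implementations live elsewhere; the shapes below are
# inferred from the call sites and the return values are placeholders.
import datetime

def window_dates(module_name):
    # (from_date, until_date): the datetime window of events to process
    return (datetime.datetime.min, datetime.datetime.max)

def files_to_process(module_name):
    # (files, total_lines, mark_lines), where each element of files is
    # (filename, [last_event_string]) and mark_lines controls how often a
    # '+' progress mark is printed to stderr
    return ([('/var/log/users/user01/gdb.log', ['2011-10-14 11:15:06'])],
            1, 1)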