Example #1
def initialize(module_name):
    """
    Initialize the output process for events.
    """

    global debug
    global output_format
    global exclude_users

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, "debug"))

    # Fetch format
    output_format = rule_manager.get_property(None, module_name, "format")

    # Create set of users to exclude
    exclude_users = map(
        lambda x: anonymize.find_or_encode_string(x),
        set(rule_manager.get_property(None, module_prefix, "exclude_users").split(",")),
    )

    # Make sure we initialize the anonymize features common to all methods
    anonymize.initialize()

    if output_format == "CSV":
        init_csv(module_name)
    elif output_format == "mongo":
        init_mongo_db(module_name)
    else:
        init_csv(module_name)
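For context, a minimal sketch of the configuration section this initializer reads, assuming rule_manager.load_config_file handles an INI-style file; the section name and the values are hypothetical, only the option names come from the calls above:

[event_output]
debug = 1
format = CSV
exclude_users = jdoe,asmith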
Example #2
def find_or_encode_string(value, synonyms = None):
    """
    Given a string, obtains its sha256 digest with the password stored in the
    module. The map anonymize_map is updated. The strings included in the
    synonyms are also added to the map with the same key.
    """

    global anonymize_map
    global module_prefix

    # Remove leading and trailing whitespace
    value = value.strip()

    # If anonymize is disabled, terminate
    if anonymize_map == None:
        return value

    # See if the value is already in the anonymize map
    digest = find_string(value)
    if digest != None:
        # Hit, return
        return digest

    passwd = rule_manager.get_property(None, module_prefix, 'passwd')
    min_length = int(rule_manager.get_property(None, module_prefix,
                                               'min_length'))

    # String not present. Encode, store and return
    digest = hashlib.sha256((value + passwd).encode('utf-8')).hexdigest()

    while anonymize_map.get(digest[0:min_length]):
        min_length += 1

    # Decide the final key
    digest = digest[0:min_length]

    # Look up the given values in LDAP and add to the collection of synonyms
    other_ids = set([value])
    ldap_dict = ldap_lookup.get(value)
    if ldap_dict != None:
        other_ids = other_ids.union(set(map(lambda x: x[0].decode('utf-8'),
                                            ldap_dict.values())))

    if synonyms != None:
        other_ids = other_ids.union(set(synonyms))

    # Other_ids has now all possible synonyms for the given value. See if any of
    # them is in the table.
    old_digest = next((anonymize_map.get(x) for x in other_ids \
                           if anonymize_map.get(x) != None), None)

    # If one synonym was found, take that as the digest.
    if old_digest != None:
        digest = old_digest

    # Propagate the digest to the rest of synonyms
    for other_id in other_ids:
        anonymize_map[other_id] = digest

    return digest
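A minimal usage sketch, assuming anonymize.initialize() has already been called so that anonymize_map, passwd and min_length are loaded; the identifiers are hypothetical and only illustrate that a value and its synonyms end up sharing one key:

# Hypothetical identifiers; both end up mapped to the same anonymized key
digest = anonymize.find_or_encode_string('jdoe', synonyms = ['john.doe@example.org'])
assert anonymize.anonymize_map['jdoe'] == digest
assert anonymize.anonymize_map['john.doe@example.org'] == digest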
Example #3
def init_csv(module_name):
    """
    Initialize the dump procedure in CSV format.
    """

    global config_params
    global output_file
    # global csv_hash
    global print_ordinal

    # Reset the set of hashes
    # csv_hash = set([])

    # Set the output_file
    if rule_manager.get_property(None, module_name, "output_file") == "":
        output_file = sys.stdout
    else:
        output_file = codecs.open(config_params["output_file"], "w", encoding="utf-8")

    # Create the header to print as first line
    header = ["datetime", "type", "user", "application", "invocation", "aux1", "aux2"]

    # See if the first column should include the ordinal
    print_ordinal = rule_manager.get_property(None, module_name, "print_ordinal") == "yes"
    if print_ordinal:
        header.insert(0, "n")

    # Print the first line of the CSV with the column names
    print >> output_file, ",".join(header)
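With print_ordinal set to "yes" in the configuration, the first line written to output_file is the ordinal column followed by the fixed column names:

n,datetime,type,user,application,invocation,aux1,aux2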
Example #4
def get(lookup_name):
    """
    Looks up the given name in the directory and returns synonyms
    """

    global module_prefix
    global ldap_obj
    
    # If the object has not been initialized, terminate
    if ldap_obj == None:
        return None

    base = rule_manager.get_property(None, module_prefix, 'base')
    attr_list = rule_manager.get_property(None, module_prefix, 
                                          'fields').split()

    expr = reduce(lambda x, y: '(|(' + y + '=' + lookup_name + ')' \
                      + x + ')', attr_list, '')

    try:
        l = ldap_obj.search_s(base, ldap.SCOPE_SUBTREE, expr, 
                              map(lambda x: str(x), attr_list))
    except:
        # Something went wrong, punt.
        return None

    # If empty result or more than one record, ignore
    if l == [] or len(l) != 1:
        return None

    # Return the dictionary with the attributes of the first element
    return l[0][1]
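To make the filter construction concrete, a small sketch of what the reduce produces; the attribute list and the looked-up name are hypothetical:

# Hypothetical inputs mirroring the 'fields' property and lookup_name above
attr_list = ['uid', 'cn']
lookup_name = 'jdoe'
expr = reduce(lambda x, y: '(|(' + y + '=' + lookup_name + ')' \
                  + x + ')', attr_list, '')
print expr    # -> (|(cn=jdoe)(|(uid=jdoe)))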
Example #5
def init_mongo_db(module_name):
    """
    Initialize the mongo db process
    """

    mongodb.connect(
        rule_manager.get_property(None, module_name, "db_host"),
        rule_manager.get_property(None, module_name, "db_user"),
        rule_manager.get_property(None, module_name, "db_passwd"),
        rule_manager.get_property(None, module_name, "db_name"),
    )
Example #6
def main():
    """
    Read a configuration file and perform the different event updates. A list of
    the modules to execute can be given.

    script configfile [module module ...]

    Example:

    script update_events.cfg moodle_log apache_log

    """

    global config_defaults

    #######################################################################
    #
    # OPTIONS
    #
    #######################################################################
    args = sys.argv[1:]

    # Check that there are additional arguments
    if len(args) < 1:
        print >> sys.stderr, 'Script needs at least one parameter'
        sys.exit(1)

    if not os.path.exists(args[0]):
        print >> sys.stderr, 'File', args[0], 'not found.'
        sys.exit(1)

    # Initial options included in the global dictionary at the top of the
    # module.
    rule_manager.options = rule_manager.initial_config(config_defaults)

    # Traverse the modules and load the default values
    load_defaults(rule_manager.options)

    # Load the rules in the given configuration file
    rules = rule_manager.load_config_file(None, args[0], {})[1]

    # Initialize the file modification cache mechanism
    detect_new_files.initialize(\
        rule_manager.get_property(None, 
                                  'anonymize', 
                                  'file_modification_cache'), 
        True)

    # Traverse the sections and run first the "initialize" function and then
    # the "execute" function.
    for module_name in rules:
        module_prefix = module_name.split('.')[0]
        getattr(sys.modules[module_prefix], 'initialize')(module_name)

    for module_name in rules:
        module_prefix = module_name.split('.')[0]
        print >> sys.stderr, '### Execute' , module_name
        getattr(sys.modules[module_prefix], 'execute')(module_name)

    return
Example #7
def files_to_process(module_name):
    """
    Given a module name, obtains from the global rule manager the value of the
    "files" variable, computes the total number of lines, and the number of
    lines between progress ticks printed to stdout. Returns:

    ([list of files], total_lines, mark_lines)
    
    """

    # Expand wildcards in file names
    files = sum([glob.glob(x) for x in \
                     rule_manager.get_property(None, module_name, 
                                               'files').split()], 
                [])
    # Fetch value to see if the cache for modified files is enabled
    file_modification_cache = \
        rule_manager.get_property(None, module_name,
                                  'file_modification_cache')
    # If modified files cache enabled, filter out those that were not modified
    if file_modification_cache != '':
        new_files = []
        for x in files:
            file_annotation = detect_new_files.needs_processing(None, 
                                                                module_name + 
                                                                '//' + x)
            if file_annotation == None:
                print >> sys.stderr, 'File', x, 'not modified. Skipping'
            else:
                new_files.append((x, file_annotation[1:]))
        files = new_files
    else:
        files = [(x, ['1970-01-01 00:00:00']) for x in files]

    # Count the total number of lines in the files
    total_lines = sum(map(lambda x: file_len(x[0]), files))
    mark_lines = total_lines / 40 + 1
  
    return (files, total_lines, mark_lines)
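A minimal consumption sketch, mirroring how the execute functions in the later examples use the returned triple; the module name is hypothetical:

(files, total_lines, mark_lines) = files_to_process('apache_log')
total_counter = 0
for (filename, annotation) in files:
    for line in codecs.open(filename, 'r', encoding = 'utf-8', errors = 'replace'):
        total_counter += 1
        # Print a progress tick every mark_lines lines
        if total_counter % mark_lines == 0:
            print >> sys.stderr, '+',
            sys.stderr.flush()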
Example #8
def initialize_filter(module_name):
    """
    Gets two options from the given dictionary, the filter file and the filter
    function. Imports the file and if a function with name "initialize_"
    followed by filter_function is found, it is executed. Modifies the
    dictionary so that filter_function points to the function instead of the
    name.
    """

    filter_file = rule_manager.get_property(None, module_name, 'filter_file')
    function = None
    if filter_file != '':
        filter_function = rule_manager.get_property(None, module_name,
                                                    'filter_function')

        (head, tail) = os.path.split(filter_file)

        # Add the source directory to the path to fetch python modules
        sys.path.insert(0, head)

        try:
            module = __import__(tail, fromlist=[])
        except ImportError, e:
            print >> sys.stderr, 'Unable to import file', tail
            print str(e)
            sys.exit(1)

        # If the file of the import is not what is expected, notify and
        # terminate.
        if not module.__file__.startswith(head):
            print >> sys.stderr, 'Collision when importing', filter_file
            sys.exit(1)
           
        # Fetch the initialization function, and if found, execute it
        function = None
        try:
            function = getattr(sys.modules[tail], 
                               'initialize_' + filter_function)
        except AttributeError, e:
            pass
Example #9
def window_dates(module_name):
    """
    Given a module name, it obtains from the global rule_manager object the
    value of the variables 'from_date' and 'until_date'. Translates them to
    datetime.datetime objects and returns the pair (from_date, until_date) as
    result.
    """

    # Translate the date from text to datetime
    from_date = rule_manager.get_property(None, module_name, 'from_date')
    if from_date == '':
        from_date = datetime.datetime.min
    else:
        from_date = datetime.datetime.strptime(from_date, '%Y/%m/%d %H:%M:%S')

    until_date = rule_manager.get_property(None, module_name, 'until_date')
    if until_date == '':
        until_date = datetime.datetime.max
    else:
        until_date = datetime.datetime.strptime(until_date, '%Y/%m/%d %H:%M:%S')

    return (from_date, until_date)
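Both properties are expected in the '%Y/%m/%d %H:%M:%S' format; a quick sketch with a hypothetical configuration value:

import datetime
# Hypothetical 'from_date' property value as it would appear in the config file
assert datetime.datetime.strptime('2012/02/01 00:00:00',
                                  '%Y/%m/%d %H:%M:%S') == datetime.datetime(2012, 2, 1, 0, 0)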
Example #10
def initialize(module_name):
    """
    Initialization function. Must be here always.
    """

    global filter_function
    global debug

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    filter_function = process_filters.initialize_filter(module_name)

    return
Example #11
def initialize(module_name = None):
    """
    Read an anonymize map from a file. Lines are comma separated pairs of name,
    sha256 key.
    """

    global anonymize_map
    global module_prefix
    global debug

    if module_name == None:
        module_name = module_prefix

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    # Get values from config
    map_file = rule_manager.get_property(None, module_name, 'file')
    passwd = rule_manager.get_property(None, module_name, 'passwd')
    min_length = int(rule_manager.get_property(None, module_name,
                                               'min_length'))

    # Load the content in the dictionary
    load_data(map_file)
Example #12
def initialize(module_name):
    """
    Initialize the ldap_obj.
    """

    global ldap_obj
    global debug

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    uri = rule_manager.get_property(None, module_name, 'uri')
    if uri == '':
        # Nothing to do
        return

    try:
        ldap_obj = ldap.initialize(uri)
    except:
        print >> sys.stderr, 'LDAP exception when initializing'
        sys.exit(1)


    print >> sys.stderr, 'LDAP object initialized successfully'
Example #13
def flush():
    """
    Make sure all the transactions have been executed
    """

    global module_prefix

    # Fetch format
    output_format = rule_manager.get_property(None, module_prefix, "format")

    if output_format == "CSV":
        flush_csv(event_list)
    elif output_format == "mongo":
        flush_mongo_db(event_list)
    else:
        flush_csv(event_list)
Example #14
def initialize(module_name):
    """
    Initialization function. Must be here always.

    """

    global svn_client
    global filter_function
    global debug
    
    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    filter_function = process_filters.initialize_filter(module_name)

    svn_client = pysvn.Client()
    svn_client.exception_style = 1

    return
Example #15
def execute(module_name):
    """
    Given a list of directories with vm logs, process all of them.
    """

    global svn_client

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    repository = rule_manager.get_property(None, module_name, 'repository')
    if repository == '':
        # No data available, no initialization done
        return

    repository_root = \
        svn_client.info2(repository, 
                         depth = pysvn.depth.empty)[0][1]['repos_root_URL']

    # Fetch all the files in the given repository
    dir_info = svn_client.list(repository, depth = pysvn.depth.immediates)

    # Select only those that are directories and match the given expression
    dir_info = [x[0]['repos_path'][1:] for x in dir_info \
                    if x[0]['kind'] == pysvn.node_kind.dir]
    source_dirs = fnmatch.filter(dir_info, 
                                 rule_manager.get_property(None, module_name, 
                                                           'files'))

    dst_dir = rule_manager.get_property(None, module_name, 'dst_dir')
    if dst_dir == '':
        print >> sys.stderr, 'VM_Logs: dst_dir is empty.'
        sys.exit(1)

    if not(os.path.exists(dst_dir)):
        os.makedirs(dst_dir)

    # Loop over all the directories
    for directory_name in source_dirs:
        # Calculate the dst full name
        (head, dst_tail) = os.path.split(directory_name)
        dst_full_name = os.path.join(dst_dir, dst_tail)

        # Fetch all the files in the given repository
        file_info = svn_client.list(os.path.join(repository_root, directory_name,
                                                 '.pladata'),
                                    depth = pysvn.depth.immediates)

        # Select only those that are directories and match the *.tgz pattern
        file_info = [x[0]['repos_path'][1:] for x in file_info \
                        if x[0]['kind'] == pysvn.node_kind.file]
        data_files = [x for x in file_info if re.search('[0-9]+_[0-9]+\.tgz$',
                                                       x)]

        if debug != 0:
            print >> sys.stderr, '  Dir', dst_tail, ':', len(data_files), 'files'

        # Loop over all the data files
        for data_file in data_files:
            # Separate file name from dir name
            (head_dir, file_name) = os.path.split(data_file)

            # Obtain the author that did the commit
            data_info = svn_client.info2(os.path.join(repository_root, 
                                                      data_file),
                                         depth = pysvn.depth.empty)
            author_id = data_info[0][1]['last_changed_author']

            # Create the path to the author dir and additional dirs if needed
            dst_author_dir = os.path.join(dst_full_name, author_id)
            if not os.path.exists(dst_author_dir):
                os.makedirs(dst_author_dir)
            dst_file = os.path.join(dst_author_dir, file_name)
            done_author_dir = os.path.join(dst_full_name, author_id, 'tgzs')
            if not os.path.exists(done_author_dir):
                os.makedirs(done_author_dir)

            # If the file has NOT been unpacked already, process it
            if os.path.exists(os.path.join(done_author_dir, file_name)):
                continue

            # Get a copy of the *.tgz from the repository with the export
            # command
            try:
                svn_client.export(os.path.join(repository_root, data_file),
                                  dst_file,
                                  recurse = False)
            except Exception, e:
                print >> sys.stderr, 'Error while exporting', data_file
                print >> sys.stderr, str(e)

            # Expand the data in the tar
            if unpack_tgz_file(dst_file, done_author_dir):
                print >> sys.stderr, 'Error while unpacking', data_file
                continue

            if debug != 0:
                print >> sys.stderr, '    ', dst_tail, 'expanded.'
Example #16
def execute(module_name):
    """
    Given a list of files with Apache logs, process every line that contains
    the mark word and produce the following events:

    [('name', 'embedded_question_correct'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]
            
    [('name', 'embedded_question_incorrect'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]
            
    [('name', 'embedded_question_blank'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]
            
    [('name', 'embedded_question_show'),
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', browser?),
     ('url', URL), 
     ('ip', IP),
     ('block_id', id)]
            
    """

    global clf_re
    global filter_function
    global remap_pairs

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the remap_pairs evaluated from the options
    remap_pairs = eval('[' + rule_manager.get_property(None, module_name,
                                                       'remap_pairs') + \
                           ']')
    remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs]

    # Fetch the word to detect an embeddedq to use it later
    mark_word = rule_manager.get_property(None, module_name, 'mark_word')

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        data_in = codecs.open(filename, 'r', encoding = 'utf8', 
                              errors = 'replace')
        old = ''
        counter = 0
        for line in data_in:

            counter += 1
            total_counter += 1
            
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            line = line[:-1]
            
            fields = clf_re.match(line).groups()

            if fields[2] == '':
                raise ValueError('Empty string' + line)

            # Translate date time of the event
            dtime = datetime.datetime.strptime(fields[3].strip()[:-6], 
                                               '%d/%b/%Y:%H:%M:%S')

            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue
            
            # Split the url to match and see if it has the mark word
            (method, url, protocol) = fields[4].split()

            # Only 404 that have the mark word substring are accepted
            if fields[5] != '404' or url.find(mark_word) == -1:
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # At this point we have an event of an embedded question.

            event_pairs = process_log_line(url, mark_word)
            
            for (event_suffix, question_id) in event_pairs:
                event = ('embedded_question_' + event_suffix, 
                         dtime,
                         anonymize.find_or_encode_string(fields[2]),
                         [('application', 'unknown'), 
                          ('url', url),
                          ('ip', fields[0]),
                          ('question_id', question_id)])
                
                event_output.out(event)

        data_in.close()
        detect_new_files.update(None, module_name + '//' + filename, 
                                [new_last_event])

    print >> sys.stderr
Example #17
def execute(module_name):
    """
    Given a list of files with gcc logs, process all of them. 

    Process the files containing the events. Return True if no error is
    detected. The log entries have the form:

    -BEGIN 2011-10-14 11:09:29 2011-10-14 11:15:06 /usr/bin/gcc prueba
    r
    where
    q
    -END

    and produce events of the form:

    [('name', 'compiler'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('application', 'gcc'),
     ('invocation', command),
     ('messages', message extract)]

    """

    global filter_function

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    message_lines = int(rule_manager.get_property(None, module_name, 
                                                  'message_lines'))

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Loop over all the files
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')
        new_last_event = last_event

        # Get the user id from the path to the file name
        user_id = filename.split('/')[-2]
        anon_user_id = anonymize.find_or_encode_string(user_id)

        data_in = codecs.open(filename, 'r', 'utf-8', errors = 'replace')

        line_number = 0
        messages = []
        begin_fields = []
        for line in data_in:
            line_number += 1
            total_counter += 1
            
            if total_counter % mark_lines == 0:
                print >> sys.stderr, '+',
                sys.stderr.flush()

            # Skip the empty lines
            if line == '\n':
                continue

            # See if the user id appears in the command, if so, anonymize
            if re.search(user_id, line):
                line = re.sub(user_id, anon_user_id, line)

            # Chop the command line
            fields = line[:-1].split()

            # Beginning of log. Catch command invocation and dates
            if re.match('^\-BEGIN .+$', line):
                begin_fields = line.split()
                try:
                    dtime = \
                        datetime.datetime.strptime(' '.join(begin_fields[4:6]),
                                                   '%Y-%m-%d %H:%M:%S')
                    
                except ValueError, e:
                    print >> sys.stderr, 'WARNING: In file', filename
                    print >> sys.stderr, 'Ignoring:', line

                command = ' '.join(begin_fields[6:])
                messages = []
                continue

            # If not the end of an event, concatenate the line and keep looping
            if not re.match('^\-END$', line):
                if len(messages) < message_lines:
                    messages.append(line[:-1])
                continue
                
            # At this point we have the complete information about the event
            
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue

            # If out of time window, ignore
            if dtime < from_date or dtime > until_date:
                continue
            
            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(begin_fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            event = ('gcc', dtime, anon_user_id,
                     [('program', 'gcc'),
                      ('command',  command),
                      ('messages',  '"' + '|||'.join(messages) + '"')])

            try:
                event_output.out(event)
            except Exception, e:
                print 'Exception while processing', filename, ':', line_number
                print str(e)
                sys.exit(1)
Example #18
def execute(module_name):
    """
    Given a list of files with svn apache logs, process all of them. The logs
    are produced with the following apache configuration command:

    CustomLog [destination log file]  "%t %u %{SVN-REPOS-NAME}e %{SVN-ACTION}e" env=SVN-ACTION

Sample:

    [03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args

    For each line in the file, the following event structure is produced

    [('name', 'svn_' + svn_action), # Svn action is update, diff, etc. 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('repository', repository name),
     ('directory', directory) (optional), 
     ('revision', r??? (optional)),
     ('comment', (max 256 chars)) # Only if commit and repository given]
    """

    global filter_function

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, "debug"))

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = rules_common.files_to_process(module_name)

    # Get the flag to see if the commits need to be processed
    process_commits = rule_manager.get_property(None, module_name, "process_commits") == ""

    # Loop over all the given args
    total_counter = 0
    for file_annotation in files:
        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], "%Y-%m-%d %H:%M:%S")
        new_last_event = last_event

        data_in = codecs.open(filename, "r", encoding="utf8", errors="replace")
        old = ""
        counter = 0
        for line in data_in:

            # Advance counters and print progress character if needed
            counter += 1
            total_counter += 1
            if total_counter % mark_lines == 0:
                print >>sys.stderr, "+",
                sys.stderr.flush()

            # Chop line into fields
            line = line[:-1]
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Erroneous log line:" + line)

            # Get the event type to quickly detect if we need to skip it
            event_type = fields[4]
            if (not process_commits) and event_type == "commit":
                continue

            # Translate date time of the event and check if within process
            # interval
            dtime = datetime.datetime.strptime(fields[0][1:], "%d/%b/%Y:%H:%M:%S")
            if dtime <= last_event:
                # Event is older than what has been recorded in the
                # detect_new_files. skip
                continue
            if dtime < from_date or dtime > until_date:
                # Ignore event because it is outside the given window
                continue

            # If there is a filter function and returns None, skip this event
            if filter_function != None and filter_function(fields) == None:
                continue

            # Record the event with the highest datetime.
            if dtime > new_last_event:
                new_last_event = dtime

            # Create the first three pairs of the event
            event = (
                "svn_" + event_type,
                dtime,
                anonymize.find_or_encode_string(fields[2]),
                [("repository", fields[3])],
            )

            # Structure of the different events
            #
            # checkout-or-export /path r62 depth=infinity
            # commit harry r100
            # diff /path r15:20 depth=infinity ignore-ancestry
            # get-dir /trunk r17 text
            # get-file /path r20 props
            # get-file-revs /path r12:15 include-merged-revisions
            # get-mergeinfo (/path1 /path2)
            # lock /path steal
            # log (/path1,/path2) r20:90 discover-changed-paths revprops=()
            # replay /path r19
            # change-rev-prop r50 propertyname
            # rev-proplist r34
            # status /path r62 depth=infinity
            # switch /pathA /pathB@50 depth=infinity
            # unlock /path break
            # update /path r17 send-copyfrom-args
            if event_type == "checkout-or-export":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            if event_type == "commit":
                event[3].append(("revision", fields[5]))
                # Fetch the log message if svn_client is not None
                if svn_client != None:
                    pass
            elif event_type == "diff":
                event[3].append(("location", fields[5] + " " + fields[6]))
            elif event_type == "get-dir" or event_type == "get-file" or event_type == "update":
                event[3].append(("revision", fields[6]))
                event[3].append(("location", fields[5]))
            elif event_type == "get-file-revs":
                event[3].append(("revision", "r" + fields[6].split(":")[1]))
                event[3].append(("location", fields[5]))
            elif event_type == "lock" or event_type == "unlock":
                event[3].append(("location", fields[5]))
            elif event_type == "log":
                event[3].append(("location", fields[5]))

            event_output.out(event)

        data_in.close()
        detect_new_files.update(None, module_name + "//" + filename, [new_last_event])

    print >>sys.stderr
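To make the field indices above concrete, here is how the sample line quoted in the docstring splits; the mapping is a sketch of the accesses performed in the loop:

line = '[03/Mar/2012:11:43:55 +0100] abel asteams-en update /Teams/Team_09 r3960 send-copyfrom-args'
fields = line.split()
# fields[0][1:] -> '03/Mar/2012:11:43:55'   (parsed with '%d/%b/%Y:%H:%M:%S')
# fields[2]     -> 'abel'                   (user, anonymized)
# fields[3]     -> 'asteams-en'             (repository)
# fields[4]     -> 'update'                 (svn action, becomes event 'svn_update')
# fields[5:]    -> ['/Teams/Team_09', 'r3960', 'send-copyfrom-args']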
Example #19
def execute(module_name):
    """
    Given a list of files with Moodle logs, process all of them. Some lines
    contain spurious 0d in the middle. They are removed.

    [('name', 'lms_' + eventtype), 
     ('datetime', datetime),
     ('user', anonymize(user)),
     ('application', 'moodle'),
     ('community', Community ID),
     ('ip', IP),
     ('resource', fields[5])]
            
    """

    global remap_pairs

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Get the files to process, lines and mark lines
    (files, total_lines, mark_lines) = \
        rules_common.files_to_process(module_name)

    # Get the type of file to process
    event_file_type = rule_manager.get_property(None, module_name, 
                                                'event_file_type')
    event_file_type = event_file_type.lower().strip()

    if event_file_type != 'csv' and event_file_type != 'html':
        print >> sys.stderr, 'Incorrect value for option event_file_type'
        print >> sys.stderr, 'Only "csv" or "html" allowed'
        sys.exit(2)
        
    # Get the remap_pairs evaluated from the options
    remap_pairs = eval('[' + rule_manager.get_property(None, module_name,
                                                       'remap_pairs') + \
                           ']')
    remap_pairs = [(re.compile(x), y) for (x, y) in remap_pairs]

    datetime_fmt = rule_manager.get_property(None, module_name, 
                                             'datetime_format')

    print >> sys.stderr, 'Processing', len(files), 'files'

    # Loop over all the given args
    total_counter = 0
    for file_annotation in sorted(files):

        # Get the file name and (if it exists, the date of the last event)
        filename = file_annotation[0]
        last_event = datetime.datetime.min
        if len(file_annotation[1]) != 0:
            last_event = datetime.datetime.strptime(file_annotation[1][0], 
                                                        '%Y-%m-%d %H:%M:%S')

        if event_file_type == 'csv':
            total_counter = process_csv_file(module_name, 
                                             filename, 
                                             mark_lines, 
                                             total_counter,
                                             last_event, 
                                             from_date, 
                                             until_date,
                                             datetime_fmt)
        else:
            total_counter = process_html_file(module_name, 
                                              filename, 
                                              mark_lines, 
                                              total_counter,
                                              last_event, 
                                              from_date, 
                                              until_date,
                                              datetime_fmt)
    print >> sys.stderr
Example #20
def execute(module_name):
    """
    Process the files contained in the given repository.

    [('name', 'svn_commit'), 
     ('datetime', dtime),
     ('user', anonymize(user)),
     ('program', 'svn'),
     ('repository', repository name),
     ('comment', (max 256 chars))]
    """

    global svn_client
    global filter_function
    global svn_special_event_comment
    global svn_special_event_names

    # Get the level of debug
    debug = int(rule_manager.get_property(None, module_name, 'debug'))

    repository = rule_manager.get_property(None, module_name, 'repository')
    if repository == '':
        # No data available, no initialization done
        return

    repository_root = \
        svn_client.info2(repository, 
                         depth = pysvn.depth.empty)[0][1]['repos_root_URL']

    repository_name = rule_manager.get_property(None, module_name, 
                                                'repository_name')

    # Fetch all the files in the given repository
    dir_info = svn_client.list(repository, depth = pysvn.depth.immediates)

    # Select only those that are directories and match the given expression
    dir_info = [x[0]['repos_path'][1:] for x in dir_info \
                    if x[0]['kind'] == pysvn.node_kind.dir]
    source_dirs = fnmatch.filter(dir_info, 
                                 rule_manager.get_property(None, module_name, 
                                                           'files'))

    # Dump the dirs being processed
    if debug != 0:
        print >> sys.stderr, repository_root, ':', len(source_dirs), 
        print >> sys.stderr, 'svndirs being processed.'

    # Get the window date to process events
    (from_date, until_date) = rules_common.window_dates(module_name)

    # Set the date/times to ask for the logs
    if from_date != None:
        seconds = calendar.timegm(from_date.utctimetuple())
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.date,
                                        seconds)
    else:
        revision_start = pysvn.Revision(pysvn.opt_revision_kind.head)

    if until_date != None:
        seconds = calendar.timegm(until_date.utctimetuple())
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.date,
                                      seconds)
    else:
        revision_end = pysvn.Revision(pysvn.opt_revision_kind.number, 0)

    msg_size = int(rule_manager.get_property(None, module_name, 'msg_length'))
    # Loop over the directories and collect all the logs
    all_logs = []
    for directory_name in source_dirs:

        # Slurp all the logs from the server
        all_logs.extend(svn_client.log(os.path.join(repository_root, 
                                                    directory_name),
                                       revision_start = revision_start,
                                       revision_end = revision_end))
        
    # Dump the number of logs being processed
    if debug != 0:
        print >> sys.stderr, len(all_logs), 'logs being processed.'

    # Loop over all the log elements
    total_counter = 0
    mark_lines = len(all_logs) / 40 + 1
    for log_data in all_logs:

        # Count the logs to print the mark string on the screen
        total_counter += 1
        if total_counter % mark_lines == 0:
            print >> sys.stderr, '+',
            sys.stderr.flush()

        # Fetch the three important fields, author, date/time and msg
        anon_user_id = anonymize.find_or_encode_string(log_data['author'])
        dtime = datetime.datetime.fromtimestamp(log_data['date'])

        # Decode the commit message and truncate it to msg_size characters
        msg = unicode(log_data['message'], 'utf-8')
        # The truncation is done after decoding to make sure the string is
        # broken at a safe location (and not in the middle of a utf-8
        # character).
        msg = msg[:msg_size]

        if dtime < from_date or dtime > until_date:
            # Ignore event because it is outside the given window
            continue
        
        # If there is a filter function and returns None, skip this event
        if filter_function != None and \
                filter_function([log_data['author'],
                                 log_data['date'],
                                 log_data['message']]) == None:
            continue

        try:
            special_idx = svn_special_event_comment.index(msg)
            event_name = svn_special_event_names[special_idx]
        except ValueError, e:
            event_name = 'svn_commit'

        event = (event_name, dtime, anon_user_id,
                 [('program', 'svn'), 
                  ('repository', repository_name), 
                  ('comment', msg)])

        try:
            event_output.out(event)
        except Exception, e:
            print 'Exception while processing', module_name
            print str(e)
            sys.exit(1)