def mapper(self, line):
        """
        Emit the location-relevant fields of a single event, keyed by its date.

        Yields one `(date_string, (timestamp, ip_address, course_id, username))`
        pair.  Nothing is yielded when the line cannot be turned into an event,
        or when the event has no username, no timestamp or no ip address.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = eventlog.get_event_username(event)
        if not username:
            return

        # The full timestamp (rather than the date string) is carried along so
        # that the latest ip address among events on the same day can be found.
        timestamp = eventlog.get_event_time_string(event)
        if not timestamp:
            return

        ip_address = event.get('ip')
        if not ip_address:
            log.warning("No ip_address found for user '%s' on '%s'.", username,
                        timestamp)
            return

        # The course_id is taken from the event if it happens to be there;
        # it is fine for it to be absent.
        #
        # (Open question: explicit and implicit events are not told apart
        # here.  It may be worth only collecting the course_id from explicit
        # events, or flagging explicit vs. implicit so that a later step can
        # prefer the latest explicit event for a course, then the latest
        # implicit one, then the latest event overall.)
        course_id = eventlog.get_course_id(event)

        # With multi-output, one file is generated per key value.  Keying on
        # the date alone gives a single file holding all of a date's events,
        # instead of one file per course per date.
        location_record = (timestamp, ip_address, course_id, username)
        yield date_string, location_record
    def mapper(self, line):
        """
        Emit one count for the user of an event, keyed by the ISO week of the event.

        Yields `((start_date, end_date, username), 1)`, where start_date is the
        Monday of the ISO week containing the event's date and end_date is the
        day after that week's Sunday, both in ISO format.  Nothing is yielded
        when the line cannot be turned into an event, or when the event has no
        username (which is logged and counted).
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = eventlog.get_event_username(event)
        if not username:
            log.error("Encountered event with no username: %s", event)
            self.incr_counter('Active Users last year', 'Discard Event Missing username', 1)
            return

        # date_string is split on '-' and its pieces are passed to
        # datetime.date() in the order they appear.
        event_date = datetime.date(*map(int, date_string.split('-')))
        iso_parts = event_date.isocalendar()
        iso_week = isoweek.Week(iso_parts[0], iso_parts[1])

        week_start = iso_week.monday()
        # One day past Sunday, i.e. the Monday that follows the week.
        week_end = iso_week.sunday() + datetime.timedelta(days=1)

        yield (week_start.isoformat(), week_end.isoformat(), username), 1
    def mapper(self, line):
        """
        Emit the location-relevant fields of a single event, keyed by its date.

        Yields one `(date_string, (timestamp, ip_address, course_id, username))`
        pair.  Nothing is yielded when the line cannot be turned into an event,
        or when the event has no username, no timestamp or no ip address.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        username = eventlog.get_event_username(event)
        if not username:
            return

        # The full timestamp (rather than the date string) is carried along so
        # that the latest ip address among events on the same day can be found.
        timestamp = eventlog.get_event_time_string(event)
        if not timestamp:
            return

        ip_address = event.get('ip')
        if not ip_address:
            log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
            return

        # The course_id is taken from the event if it happens to be there;
        # it is fine for it to be absent.
        #
        # (Open question: explicit and implicit events are not told apart
        # here.  It may be worth only collecting the course_id from explicit
        # events, or flagging explicit vs. implicit so that a later step can
        # prefer the latest explicit event for a course, then the latest
        # implicit one, then the latest event overall.)
        course_id = eventlog.get_course_id(event)

        # With multi-output, one file is generated per key value.  Keying on
        # the date alone gives a single file holding all of a date's events,
        # instead of one file per course per date.
        location_record = (timestamp, ip_address, course_id, username)
        yield date_string, location_record
 def test_event_username_with_trailing_whitespace(self):
     """A username followed by whitespace is returned without that whitespace."""
     # The fixture has to be 'bub' plus trailing whitespace: a masked
     # placeholder value could never compare equal to u'bub'.
     item = {"username": "bub   "}
     # assertEqual is used because assertEquals is a deprecated alias.
     self.assertEqual(eventlog.get_event_username(item), u'bub')
 def test_empty_event_username(self):
     """An empty username is reported as no username at all."""
     # The fixture has to be an empty username: a masked, non-empty
     # placeholder value would be returned as a real username, not None.
     item = {"username": ""}
     self.assertIsNone(eventlog.get_event_username(item))
 def test_missing_event_username(self):
     """An item that has no "username" key yields no username."""
     event_without_username = {"something else": "not an event"}
     found = eventlog.get_event_username(event_without_username)
     self.assertIsNone(found)
Example #7
0
    def _remap_user_info_in_event(self, event, event_data):
        """
        Remap the user-identifying values of an event in place, and report what was found.

        The top-level username, the user_id and username in the event's context,
        and the user_id and username-like fields in the payload are each replaced
        with their remapped value.  A username that cannot be remapped is logged
        and replaced with REDACTED_USERNAME.

        Arguments:
            event: the event dict; modified in place.
            event_data: the event payload; inspected and modified in place only
                when it is truthy.

        Returns a defaultdict of sets holding the user information gathered on
        the way: the 'user_id' values seen, plus whatever keys the user-info
        lookup supplies (documented by the original author as 'username',
        'user_id', and 'name').
        """
        log_context = self._get_log_string_for_event(event)

        # The same values may turn up in several parts of the event, so they
        # are collected in sets to keep each one only once.
        user_info = defaultdict(set)

        # Top-level username.  eventlog.get_event_username() already strips the
        # value and rejects zero-length results, so that is not repeated here.
        username = eventlog.get_event_username(event)
        if username is not None:
            username = username.decode('utf8')
            replacement = self._remap_username(username, user_info)
            if replacement is None:
                log.error(
                    "Redacting unrecognized username for '%s' field: '%s' %s",
                    'username', username, log_context)
                replacement = REDACTED_USERNAME
            event['username'] = replacement

        # user_id in the context: fetched as an int (or None), recorded, remapped.
        context_user_id = self._get_user_id_as_int(
            event.get('context', {}).get('user_id'))
        if context_user_id is not None:
            user_info['user_id'].add(context_user_id)
            known = self._get_user_info_for_user_id(context_user_id)
            if known is not None:
                for field, field_value in known.iteritems():
                    user_info[field].add(field_value)
                disagrees = (
                    username is not None
                    and 'username' in known
                    and username != known['username']
                )
                if disagrees:
                    log.error(
                        u"user_id ('%s'=>'%s') does not match username ('%s') %s",
                        context_user_id,
                        known['username'],
                        username,
                        log_context,
                    )
            event['context']['user_id'] = self.remap_id(context_user_id)

        # username in the context, if present.  (Removed in more recent events.)
        if 'context' in event:
            context = event['context']
            if 'username' in context:
                context_username = context['username'].strip()
                if len(context_username) > 0:
                    context_username = context_username.decode('utf8')
                    replacement = self._remap_username(context_username, user_info)
                    if replacement is None:
                        log.error(
                            "Redacting unrecognized username for '%s' field: '%s' %s",
                            'context.username', context_username, log_context)
                        replacement = REDACTED_USERNAME
                    context['username'] = replacement

        # The event payload.
        if event_data:
            # user_id in the payload: recorded and remapped like the context one.
            payload_user_id = self._get_user_id_as_int(event_data.get('user_id'))
            if payload_user_id is not None:
                user_info['user_id'].add(payload_user_id)
                known = self._get_user_info_for_user_id(payload_user_id)
                if known is not None:
                    for field, field_value in known.iteritems():
                        user_info[field].add(field_value)
                event_data['user_id'] = self.remap_id(payload_user_id)

            # Usernames in the payload may appear under several different keys.
            # TODO: confirm that these values are usernames, not user_id values. (User_id values will fail remapping.)
            for username_key in ('username', 'instructor', 'student', 'user'):
                if username_key not in event_data:
                    continue
                payload_username = event_data[username_key].strip()
                if len(payload_username) == 0:
                    continue
                payload_username = payload_username.decode('utf8')
                replacement = self._remap_username(payload_username, user_info)
                if replacement is None:
                    log.error(
                        "Redacting unrecognized username for 'event.%s' field: '%s' %s",
                        username_key, payload_username, log_context)
                    replacement = REDACTED_USERNAME
                event_data[username_key] = replacement

        return user_info
Example #8
0
    def get_userinfo_from_event(self, event, event_data):
        """
        Pick the user_info entry that identifies the user an event is about.

        The top-level username, the user_id in the event's context and the
        user_id in the payload are each looked up in self.user_info.  Lookups
        that find nothing, and entries that disagree with one another, are
        logged.  Nothing in the event is modified.

        Arguments:
            event: the event dict.
            event_data: the event payload; only consulted when it is a
                non-empty dict.

        Returns the entry found for the payload user_id if there is one,
        otherwise the entry for the context user_id, otherwise the entry for
        the username.  Returns None when no entry was found, which is always
        the case when self.user_info is None.
        """
        # Start simply, and just get obvious info.  What is wanted out of this
        # is a user_id and/or a username that can be used for cleaning the rest
        # of the event, together with the matching fullname -- hence returning
        # a whole user_info entry.  How well the different sources line up was
        # not analyzed beforehand, so the mismatches are reported from here.
        event_type = event.get('event_type')
        if isinstance(event_type, str):
            event_type = event_type.decode('utf8')
        debug_str = u" [event_type='{}']".format(event_type)

        username_entry = None
        username = eventlog.get_event_username(event)
        if username is not None:
            username = username.decode('utf8')
            if self.user_info is not None:
                username_entry = self.user_info.get(username)
                if username_entry is None:
                    log.error(u"username ('%s') is unknown to user_info %s",
                              username, debug_str)

        # Get the user_id either as an int or None
        userid_entry = None
        user_id = self.get_user_id_as_int(
            event.get('context', {}).get('user_id'))
        if user_id is not None:
            if self.user_info is not None:
                userid_entry = self.user_info.get(user_id)
                if userid_entry is None:
                    log.error(u"user_id ('%s') is unknown to user_info %s",
                              user_id, debug_str)
                elif username_entry and userid_entry != username_entry:
                    log.error(
                        u"user_id ('%s'='%s') does not match username ('%s'='%s') %s",
                        userid_entry.get('user_id'),
                        userid_entry.get('username'),
                        username_entry.get('username'),
                        username_entry.get('user_id'),
                        debug_str,
                    )

        event_userid_entry = None
        if event_data and isinstance(event_data, dict):
            event_user_id = self.get_user_id_as_int(event_data.get('user_id'))
            if event_user_id:
                if self.user_info is not None:
                    event_userid_entry = self.user_info.get(event_user_id)
                    if event_userid_entry is None:
                        log.error(
                            u"Event_user_id ('%s') is unknown to user_info %s",
                            event_user_id, debug_str)

                if user_id is None:
                    # This is way too common. In testing, every edx.course.enrollment.xxx had the user_id in the event but not
                    # in context.  Weird.
                    # log.warning(u"Found user_id ('%s') in event but nothing in context %s", event_user_id, debug_str)
                    pass
                elif event_userid_entry and userid_entry != event_userid_entry:
                    # This turns out to be somewhat expected for certain event types where one user is doing something on behalf
                    # of another user.  The actor is in context, and the object is in event payload.
                    if event_type not in EVENT_TYPES_WITH_DIFFERENT_USERIDS:
                        # userid_entry is None when the context user_id is not
                        # known to user_info.  Fall back on the raw context
                        # user_id in that case, so that reporting the mismatch
                        # does not fail with an AttributeError.
                        if userid_entry is not None:
                            context_entry = userid_entry
                        else:
                            context_entry = {'user_id': user_id}
                        # Both pairs are reported as user_id first, username
                        # second, as the ('id'='username') layout of the
                        # message announces.
                        log.error(
                            u"Context user_id ('%s'='%s') does not match event user_id ('%s'='%s') %s",
                            context_entry.get('user_id'),
                            context_entry.get('username'),
                            event_userid_entry.get('user_id'),
                            event_userid_entry.get('username'),
                            debug_str,
                        )
                elif event_user_id != user_id:
                    log.error(
                        u"Found user_id ('%s') in event that was different from context ('%s') %s",
                        event_user_id, user_id, debug_str)

        # We choose the event user_id over the context, and fall back on the username.
        if event_userid_entry is not None:
            return event_userid_entry
        elif userid_entry is not None:
            return userid_entry
        else:
            return username_entry
    def _remap_user_info_in_event(self, event, event_data):
        """
        Remap the user-identifying values of an event in place, and report what was found.

        The top-level username, the user_id and username in the event's context,
        and the user_id and username-like fields in the payload are each replaced
        with their remapped value.  A username that cannot be remapped is logged
        and replaced with REDACTED_USERNAME.

        Arguments:
            event: the event dict; modified in place.
            event_data: the event payload; inspected and modified in place only
                when it is truthy.

        Returns a defaultdict of sets holding the user information gathered on
        the way: the 'user_id' values seen, plus whatever keys the user-info
        lookup supplies (documented by the original author as 'username',
        'user_id', and 'name').
        """
        log_context = self._get_log_string_for_event(event)

        # The same values may turn up in several parts of the event, so they
        # are collected in sets to keep each one only once.
        user_info = defaultdict(set)

        # Top-level username.  eventlog.get_event_username() already strips the
        # value and rejects zero-length results, so that is not repeated here.
        username = eventlog.get_event_username(event)
        if username is not None:
            username = username.decode('utf8')
            replacement = self._remap_username(username, user_info)
            if replacement is None:
                log.error("Redacting unrecognized username for '%s' field: '%s' %s", 'username', username, log_context)
                replacement = REDACTED_USERNAME
            event['username'] = replacement

        # user_id in the context: fetched as an int (or None), recorded, remapped.
        context_user_id = self._get_user_id_as_int(event.get('context', {}).get('user_id'))
        if context_user_id is not None:
            user_info['user_id'].add(context_user_id)
            known = self._get_user_info_for_user_id(context_user_id)
            if known is not None:
                for field, field_value in known.iteritems():
                    user_info[field].add(field_value)
                disagrees = (
                    username is not None
                    and 'username' in known
                    and username != known['username']
                )
                if disagrees:
                    log.error(
                        u"user_id ('%s'=>'%s') does not match username ('%s') %s",
                        context_user_id, known['username'], username, log_context,
                    )
            event['context']['user_id'] = self.remap_id(context_user_id)

        # username in the context, if present.  (Removed in more recent events.)
        if 'context' in event:
            context = event['context']
            if 'username' in context:
                context_username = context['username'].strip()
                if len(context_username) > 0:
                    context_username = context_username.decode('utf8')
                    replacement = self._remap_username(context_username, user_info)
                    if replacement is None:
                        log.error("Redacting unrecognized username for '%s' field: '%s' %s", 'context.username',
                                  context_username, log_context)
                        replacement = REDACTED_USERNAME
                    context['username'] = replacement

        # The event payload.
        if event_data:
            # user_id in the payload: recorded and remapped like the context one.
            payload_user_id = self._get_user_id_as_int(event_data.get('user_id'))
            if payload_user_id is not None:
                user_info['user_id'].add(payload_user_id)
                known = self._get_user_info_for_user_id(payload_user_id)
                if known is not None:
                    for field, field_value in known.iteritems():
                        user_info[field].add(field_value)
                event_data['user_id'] = self.remap_id(payload_user_id)

            # Usernames in the payload may appear under several different keys.
            # TODO: confirm that these values are usernames, not user_id values. (User_id values will fail remapping.)
            for username_key in ('username', 'instructor', 'student', 'user'):
                if username_key not in event_data:
                    continue
                payload_username = event_data[username_key].strip()
                if len(payload_username) == 0:
                    continue
                payload_username = payload_username.decode('utf8')
                replacement = self._remap_username(payload_username, user_info)
                if replacement is None:
                    log.error("Redacting unrecognized username for 'event.%s' field: '%s' %s",
                              username_key, payload_username, log_context)
                    replacement = REDACTED_USERNAME
                event_data[username_key] = replacement

        return user_info
    def get_userinfo_from_event(self, event, event_data):
        """
        Pick the user_info entry that identifies the user an event is about.

        The top-level username, the user_id in the event's context and the
        user_id in the payload are each looked up in self.user_info.  Lookups
        that find nothing, and entries that disagree with one another, are
        logged.  Nothing in the event is modified.

        Arguments:
            event: the event dict.
            event_data: the event payload; only consulted when it is a
                non-empty dict.

        Returns the entry found for the payload user_id if there is one,
        otherwise the entry for the context user_id, otherwise the entry for
        the username.  Returns None when no entry was found, which is always
        the case when self.user_info is None.
        """
        # Start simply, and just get obvious info.  What is wanted out of this
        # is a user_id and/or a username that can be used for cleaning the rest
        # of the event, together with the matching fullname -- hence returning
        # a whole user_info entry.  How well the different sources line up was
        # not analyzed beforehand, so the mismatches are reported from here.
        event_type = event.get('event_type')
        if isinstance(event_type, str):
            event_type = event_type.decode('utf8')
        debug_str = u" [event_type='{}']".format(event_type)

        username_entry = None
        username = eventlog.get_event_username(event)
        if username is not None:
            username = username.decode('utf8')
            if self.user_info is not None:
                username_entry = self.user_info.get(username)
                if username_entry is None:
                    log.error(u"username ('%s') is unknown to user_info %s", username, debug_str)

        # Get the user_id either as an int or None
        userid_entry = None
        user_id = self.get_user_id_as_int(event.get('context', {}).get('user_id'))
        if user_id is not None:
            if self.user_info is not None:
                userid_entry = self.user_info.get(user_id)
                if userid_entry is None:
                    log.error(u"user_id ('%s') is unknown to user_info %s", user_id, debug_str)
                elif username_entry and userid_entry != username_entry:
                    log.error(
                        u"user_id ('%s'='%s') does not match username ('%s'='%s') %s",
                        userid_entry.get('user_id'), userid_entry.get('username'), username_entry.get('username'), username_entry.get('user_id'), debug_str,
                    )

        event_userid_entry = None
        if event_data and isinstance(event_data, dict):
            event_user_id = self.get_user_id_as_int(event_data.get('user_id'))
            if event_user_id:
                if self.user_info is not None:
                    event_userid_entry = self.user_info.get(event_user_id)
                    if event_userid_entry is None:
                        log.error(u"Event_user_id ('%s') is unknown to user_info %s", event_user_id, debug_str)

                if user_id is None:
                    # This is way too common. In testing, every edx.course.enrollment.xxx had the user_id in the event but not
                    # in context.  Weird.
                    # log.warning(u"Found user_id ('%s') in event but nothing in context %s", event_user_id, debug_str)
                    pass
                elif event_userid_entry and userid_entry != event_userid_entry:
                    # This turns out to be somewhat expected for certain event types where one user is doing something on behalf
                    # of another user.  The actor is in context, and the object is in event payload.
                    if event_type not in EVENT_TYPES_WITH_DIFFERENT_USERIDS:
                        # userid_entry is None when the context user_id is not
                        # known to user_info.  Fall back on the raw context
                        # user_id in that case, so that reporting the mismatch
                        # does not fail with an AttributeError.
                        if userid_entry is not None:
                            context_entry = userid_entry
                        else:
                            context_entry = {'user_id': user_id}
                        # Both pairs are reported as user_id first, username
                        # second, as the ('id'='username') layout of the
                        # message announces.
                        log.error(
                            u"Context user_id ('%s'='%s') does not match event user_id ('%s'='%s') %s",
                            context_entry.get('user_id'), context_entry.get('username'),
                            event_userid_entry.get('user_id'), event_userid_entry.get('username'),
                            debug_str,
                        )
                elif event_user_id != user_id:
                    log.error(u"Found user_id ('%s') in event that was different from context ('%s') %s", event_user_id, user_id, debug_str)

        # We choose the event user_id over the context, and fall back on the username.
        if event_userid_entry is not None:
            return event_userid_entry
        elif userid_entry is not None:
            return userid_entry
        else:
            return username_entry
 def test_event_username_with_trailing_whitespace(self):
     """A username followed by whitespace is returned without that whitespace."""
     # The fixture has to be 'bub' plus trailing whitespace: a masked
     # placeholder value could never compare equal to u'bub'.
     item = {"username": "bub   "}
     # assertEqual is used because assertEquals is a deprecated alias.
     self.assertEqual(eventlog.get_event_username(item), u'bub')
 def test_empty_event_username(self):
     """An empty username is reported as no username at all."""
     # The fixture has to be an empty username: a masked, non-empty
     # placeholder value would be returned as a real username, not None.
     item = {"username": ""}
     self.assertIsNone(eventlog.get_event_username(item))
 def test_missing_event_username(self):
     """An item that has no "username" key yields no username."""
     event_without_username = {"something else": "not an event"}
     found = eventlog.get_event_username(event_without_username)
     self.assertIsNone(found)