Esempio n. 1
0
    def mapper(self, line):
        """
        Emit ``username, (timestamp, ip_address)`` for one event-log line.

        Nothing is emitted when the line fails to parse, the event has no
        username, its time is unavailable or not earlier than
        ``self.end_datetime``, or it carries no 'ip' value.
        """
        parsed = eventlog.parse_json_event(line)
        if parsed is None:
            return

        raw_name = parsed.get('username')
        if not raw_name:
            return

        # Key on the whitespace-trimmed name, but record that trimming happened.
        username = raw_name.strip()
        if username != raw_name:
            log.error("User '%s' has extra whitespace, which is being stripped. Event: %s", raw_name, parsed)

        event_datetime = eventlog.get_event_time(parsed)
        if event_datetime is None or event_datetime >= self.end_datetime:
            return

        timestamp = eventlog.datetime_to_timestamp(event_datetime)

        ip_address = parsed.get('ip')
        if ip_address:
            yield username, (timestamp, ip_address)
        else:
            log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
 def get_raw_event(self, event_line):
     """
     Return the backslash-encoded, key-sorted JSON dump of an event line.

     When ``eventlog.get_event_data`` yields a payload, it replaces the
     event's 'event' entry before the event is dumped.
     """
     parsed = eventlog.parse_json_event(event_line)
     payload = eventlog.get_event_data(parsed)
     if payload is not None:
         parsed['event'] = payload
     return backslash_encode_value(json.dumps(parsed, sort_keys=True))
 def get_raw_event(self, event_line):
     """
     Parse an event line and return its sorted-key JSON dump, backslash-encoded.

     The 'event' entry is overwritten with the result of
     ``eventlog.get_event_data`` whenever that result is not None.
     """
     parsed = eventlog.parse_json_event(event_line)
     payload = eventlog.get_event_data(parsed)
     if payload is not None:
         parsed['event'] = payload
     serialized = json.dumps(parsed, sort_keys=True)
     return backslash_encode_value(serialized)
Esempio n. 4
0
    def mapper(self, line):
        """
        Emit the raw log line keyed by the date portion of its 'time' field.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, line): the UTF-8 encoded text preceding the "T" in
            the event's 'time' value, and the line with trailing CR/LF removed.

        Nothing is emitted when the line does not parse, has no 'time' value,
        or is rejected (None) by ``self._filter_event``.
        """
        event = eventlog.parse_json_event(line)
        if event is None:
            # An unparseable line comes back as None; skip it rather than
            # failing on the 'time' lookup below.
            return

        event_time = event.get('time')
        if not event_time:
            # Without a time there is no date to key the output on.
            return
        date_string = event_time.split("T")[0]

        if self._filter_event(event) is None:
            return

        yield date_string.encode('utf-8'), line.rstrip('\r\n')
    def mapper(self, line):
        """
        Emit the raw log line keyed by the date portion of its 'time' field.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, line): the UTF-8 encoded text preceding the "T" in
            the event's 'time' value, and the line with trailing CR/LF removed.

        Nothing is emitted when the line does not parse, has no 'time' value,
        or is rejected (None) by ``self._filter_event``.
        """
        event = eventlog.parse_json_event(line)
        if event is None:
            # An unparseable line comes back as None; skip it rather than
            # failing on the 'time' lookup below.
            return

        event_time = event.get('time')
        if not event_time:
            # Without a time there is no date to key the output on.
            return
        date_string = event_time.split("T")[0]

        if self._filter_event(event) is None:
            return

        yield date_string.encode('utf-8'), line.rstrip('\r\n')
Esempio n. 6
0
    def get_raw_events_from_log_file(self, input_file):
        """
        Collect every parseable, timestamped event from ``input_file``.

        Overrides the parent class to disable the event filter.  Each kept
        event has its event time stored under the 'timestamp' key; lines that
        do not parse, or whose event time is falsy, are dropped.
        """
        collected = []
        for raw_line in input_file:
            parsed = eventlog.parse_json_event(raw_line)
            # Only look up the time for lines that actually parsed.
            event_time = eventlog.get_event_time(parsed) if parsed else None
            if event_time:
                parsed['timestamp'] = event_time
                collected.append(parsed)
        return collected
    def mapper(self, line):
        """
        Emit the de-identified event keyed by the date portion of its 'time' field.

        Args:
            line: text line from a tracking event log.

        Yields:
            (date_string, encoded_event): the UTF-8 encoded text preceding the
            "T" in the event's 'time' value, and the ``cjson``-encoded result
            of ``self.deidentify_event``.

        Nothing is emitted when the line does not parse, has no 'time' value,
        or either ``self.filter_event`` or ``self.deidentify_event`` returns None.
        """
        event = eventlog.parse_json_event(line)
        if event is None:
            # An unparseable line comes back as None; skip it rather than
            # failing on the 'time' lookup below.
            return

        event_time = event.get('time')
        if not event_time:
            # Without a time there is no date to key the output on.
            return
        date_string = event_time.split("T")[0]

        filtered_event = self.filter_event(event)
        if filtered_event is None:
            return

        deidentified_event = self.deidentify_event(filtered_event)
        if deidentified_event is None:
            return

        yield date_string.encode('utf-8'), cjson.encode(deidentified_event)
    def get_event_and_date_string(self, line):
        """
        Parse a log line and return ``(event, date_string)`` when its date is in range.

        Returns None if the line does not parse, the event has no time, or the
        date is before ``self.lower_bound_date_string`` or not before
        ``self.upper_bound_date_string``.
        """
        parsed = eventlog.parse_json_event(line)
        if parsed is None:
            return None

        time_text = self.get_event_time(parsed)
        if not time_text:
            return None

        # strptime is extremely slow, so the date is never parsed.  The
        # timestamp is ISO8601 formatted (dates look like %Y-%m-%d, for
        # example 2014-05-20), so plain string comparison is relied on instead.
        day = time_text.split("T")[0]
        in_range = self.lower_bound_date_string <= day < self.upper_bound_date_string
        return (parsed, day) if in_range else None
Esempio n. 9
0
    def get_event_and_date_string(self, line):
        """
        Parse a log line and return ``(event, date_string)`` when its date is in range.

        Returns None if the line does not parse, the event has no time, or the
        date is before ``self.lower_bound_date_string`` or not before
        ``self.upper_bound_date_string``.
        """
        parsed = eventlog.parse_json_event(line)
        if parsed is None:
            return None

        time_text = self.get_event_time(parsed)
        if not time_text:
            return None

        # strptime is extremely slow, so the date is never parsed.  The
        # timestamp is ISO8601 formatted (dates look like %Y-%m-%d, for
        # example 2014-05-20), so plain string comparison is relied on instead.
        day = time_text.split("T")[0]
        in_range = self.lower_bound_date_string <= day < self.upper_bound_date_string
        return (parsed, day) if in_range else None
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value = 1 (enrolled) or -1 (unenrolled)
        and timestamp is in ISO format, with resolution to the millisecond.

      or None if there is no valid enrollment event on the line: it is not
      an enrollment event, it does not parse, or its event_type, time,
      event data, course_id or user_id is missing or invalid.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    NOTE(review): the example shows six fractional digits although the text
    above says millisecond resolution -- confirm against
    eventlog.datetime_to_timestamp.

    """
    # Before parsing, check that the line contains something that
    # suggests it's an enrollment event.
    if 'edx.course.enrollment' not in line:
        return None

    # try to parse the line into a dict:
    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    # get event type, and check that it exists:
    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # convert the type to a value:
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # not an enrollment event...
        return None

    # get the timestamp (the local is not called `datetime`, to avoid
    # shadowing the standard-library module name):
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)

    # Get the event data:
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  A missing course_id
    # is reported the same way as an invalid one, instead of raising KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit enrollment event with bogus course_id: %s", event)
        return None

    # Get the user_id from the data:
    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)
 def test_parse_json_event_with_nonascii(self):
     """A JSON-escaped non-ASCII character is decoded into the matching unicode character."""
     # Raw string: the \ufffd escape reaches the JSON parser as written, so
     # the parsed username is the value asserted below.
     line = r'{"username": "b\ufffdb"}'
     result = eventlog.parse_json_event(line)
     self.assertIsInstance(result, dict)
     self.assertEqual(result['username'], u'b\ufffdb')
 def test_parse_json_event_with_cruft(self):
     """Text before the JSON object and trailing whitespace must not prevent parsing."""
     line = 'leading cruft here {"username": "******"}  '
     result = eventlog.parse_json_event(line)
     # assertIsInstance reports the actual type on failure, unlike assertTrue.
     self.assertIsInstance(result, dict)
 def test_parse_json_event_truncated(self):
     """A line cut off in the middle of a string value parses to None."""
     truncated = '{"username": "unsuccessful'
     self.assertIsNone(eventlog.parse_json_event(truncated))
 def test_parse_valid_json_event(self):
     """A well-formed JSON object line parses into a dict."""
     line = '{"username": "******"}'
     result = eventlog.parse_json_event(line)
     # assertIsInstance reports the actual type on failure, unlike assertTrue.
     self.assertIsInstance(result, dict)
Esempio n. 15
0
    def obfuscate_event_entry(self, line):
        """
        Obfuscate the 'event' payload of one event-log line.

        Args:
            line: text line from a tracking event log.

        Returns:
            The original ``line`` unchanged when it fails to parse or has no
            course_id; None when the payload contains 'POST' and the
            'skip_post' parameter is set; otherwise the event re-serialized
            with ``json.dumps``, with its 'event' entry replaced whenever the
            obfuscator returned an updated payload.  An event with no 'event'
            payload is logged and re-serialized without obfuscation.
        """
        event = eventlog.parse_json_event(line)
        if event is None:
            # Unexpected here...
            log.error(u"Encountered event entry which failed to parse: %r", line)
            return line
        course_id = eventlog.get_course_id(event, from_url=True)
        if course_id is None:
            # Unexpected here...
            log.error(u"Encountered event entry with no course_id: %r", line)
            return line

        # We cannot use this method as-is, since we need to know what was done to the event, so
        # that it can be transformed back to its original form once cleaned.
        # NOT event_data = eventlog.get_event_data(event)
        event_json_decoded = False
        event_data = event.get('event')

        if event_data is None:
            log.error(u"Encountered event entry with no 'event' payload: %r", line)
        elif event_data == '':
            # Note that this happens with some browser events.  Instead of
            # failing to parse it as a JSON string, just leave as-is.
            pass
        elif isinstance(event_data, basestring):
            # Cjson produces str, while json produces unicode.  Hmm.
            if len(event_data) == 512 and 'POST' in event_data:
                # It's a truncated JSON string.  But we're going to throw it out anyway, so no worries.
                pass
            elif '{' not in event_data and '=' in event_data:
                # It's a key-value pair from a browser event.  Just process as-is, rather than parsing and reassembling.
                pass
            else:
                try:
                    event_data = eventlog.decode_json(event_data)
                    event_json_decoded = True
                except Exception:
                    log.error(u"Encountered event entry with unparseable 'event' payload: %r", line)

        # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
        # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
        # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
        # to get to strings that can be properly interpreted.
        updated_event_data = None
        if event_data is not None:
            # A missing payload has nothing to obfuscate, and the 'POST'
            # membership test below would raise TypeError on None.
            event_user_info = self.get_userinfo_from_event(event, event_data)

            if 'POST' in event_data and self.parameters['skip_post']:
                return None

            updated_event_data = self.obfuscator.obfuscate_structure(event_data, u"event", event_user_info)

        if updated_event_data is not None:
            event_source = event.get('event_source')
            event_type = event.get('event_type')
            log.info(u"Obfuscated %s event with event_type = '%s'", event_source, event_type)

            if event_json_decoded:
                # The payload was originally a JSON string, so put it back in that form.
                # TODO: should really use cjson, if that were originally used for decoding the json.
                updated_event_data = json.dumps(updated_event_data)

            event['event'] = updated_event_data

        # TODO: should really use cjson, if that were originally used for decoding the json.
        return json.dumps(event)
 def test_parse_json_event_with_cruft(self):
     """Text before the JSON object and trailing whitespace must not prevent parsing."""
     line = 'leading cruft here {"username": "******"}  '
     result = eventlog.parse_json_event(line)
     # assertIsInstance reports the actual type on failure, unlike assertTrue.
     self.assertIsInstance(result, dict)
Esempio n. 17
0
 def parse_event_from_entity(self, line):
     """Parse ``line`` with ``eventlog.parse_json_event`` and return the result."""
     parsed_event = eventlog.parse_json_event(line)
     return parsed_event
 def get_raw_event(self, event_line):
     """Return the backslash-encoded, key-sorted JSON dump of a parsed event line."""
     parsed = eventlog.parse_json_event(event_line)
     return backslash_encode_value(json.dumps(parsed, sort_keys=True))
 def test_parse_valid_json_event(self):
     """A well-formed JSON object line parses into a dict."""
     line = '{"username": "******"}'
     result = eventlog.parse_json_event(line)
     # assertIsInstance reports the actual type on failure, unlike assertTrue.
     self.assertIsInstance(result, dict)
 def obfuscate_event_line(self, line):
     """Parse an event line, obfuscate the event, and serialize it back to a stripped line."""
     obfuscated = self._obfuscate_event(eventlog.parse_json_event(line))
     encoded = eventlog.encode_json(obfuscated)
     return encoded.strip()
 def test_parse_json_event_with_nonascii(self):
     """A JSON-escaped non-ASCII character is decoded into the matching unicode character."""
     # Raw string: the \ufffd escape reaches the JSON parser as written, so
     # the parsed username is the value asserted below.
     line = r'{"username": "b\ufffdb"}'
     result = eventlog.parse_json_event(line)
     self.assertIsInstance(result, dict)
     self.assertEqual(result['username'], u'b\ufffdb')
Esempio n. 22
0
 def obfuscate_event_line(self, line):
     """Parse an event line, obfuscate the event, and serialize it back to a stripped line."""
     obfuscated = self._obfuscate_event(eventlog.parse_json_event(line))
     encoded = eventlog.encode_json(obfuscated)
     return encoded.strip()
 def get_raw_event(self, event_line):
     """Return the backslash-encoded, key-sorted JSON dump of a parsed event line."""
     parsed = eventlog.parse_json_event(event_line)
     return backslash_encode_value(json.dumps(parsed, sort_keys=True))
 def test_parse_json_event_truncated(self):
     """A line cut off in the middle of a string value parses to None."""
     truncated = '{"username": "unsuccessful'
     self.assertIsNone(eventlog.parse_json_event(truncated))
Esempio n. 25
0
    def obfuscate_event_entry(self, line):
        """
        Obfuscate the 'event' payload of one event-log line.

        Args:
            line: text line from a tracking event log.

        Returns:
            The original ``line`` unchanged when it fails to parse or has no
            course_id; None when the payload contains 'POST' and the
            'skip_post' parameter is set; otherwise the event re-serialized
            with ``json.dumps``, with its 'event' entry replaced whenever the
            obfuscator returned an updated payload.  An event with no 'event'
            payload is logged and re-serialized without obfuscation.
        """
        event = eventlog.parse_json_event(line)
        if event is None:
            # Unexpected here...
            log.error(u"Encountered event entry which failed to parse: %r",
                      line)
            return line
        course_id = eventlog.get_course_id(event, from_url=True)
        if course_id is None:
            # Unexpected here...
            log.error(u"Encountered event entry with no course_id: %r", line)
            return line

        # We cannot use this method as-is, since we need to know what was done to the event, so
        # that it can be transformed back to its original form once cleaned.
        # NOT event_data = eventlog.get_event_data(event)
        event_json_decoded = False
        event_data = event.get('event')

        if event_data is None:
            log.error(u"Encountered event entry with no 'event' payload: %r",
                      line)
        elif event_data == '':
            # Note that this happens with some browser events.  Instead of
            # failing to parse it as a JSON string, just leave as-is.
            pass
        elif isinstance(event_data, basestring):
            # Cjson produces str, while json produces unicode.  Hmm.
            if len(event_data) == 512 and 'POST' in event_data:
                # It's a truncated JSON string.  But we're going to throw it out anyway, so no worries.
                pass
            elif '{' not in event_data and '=' in event_data:
                # It's a key-value pair from a browser event.  Just process as-is, rather than parsing and reassembling.
                pass
            else:
                try:
                    event_data = eventlog.decode_json(event_data)
                    event_json_decoded = True
                except Exception:
                    log.error(
                        u"Encountered event entry with unparseable 'event' payload: %r",
                        line)

        # TODO: update the comment!  This is where we traverse the event in search of values that should be "cleansed".
        # Much along the model of what we already do for 'state' in CWSM.  Except that we need to be more
        # flexible in determining the level of backslash encoding -- decode and re-encode as many levels as needed
        # to get to strings that can be properly interpreted.
        updated_event_data = None
        if event_data is not None:
            # A missing payload has nothing to obfuscate, and the 'POST'
            # membership test below would raise TypeError on None.
            event_user_info = self.get_userinfo_from_event(event, event_data)

            if 'POST' in event_data and self.parameters['skip_post']:
                return None

            updated_event_data = self.obfuscator.obfuscate_structure(
                event_data, u"event", event_user_info)

        if updated_event_data is not None:
            event_source = event.get('event_source')
            event_type = event.get('event_type')
            log.info(u"Obfuscated %s event with event_type = '%s'",
                     event_source, event_type)

            if event_json_decoded:
                # The payload was originally a JSON string, so put it back in that form.
                # TODO: should really use cjson, if that were originally used for decoding the json.
                updated_event_data = json.dumps(updated_event_data)

            event['event'] = updated_event_data

        # TODO: should really use cjson, if that were originally used for decoding the json.
        return json.dumps(event)
Esempio n. 26
0
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value = 1 (enrolled) or -1 (unenrolled)
        and timestamp is in ISO format, with resolution to the millisecond.

      or None if there is no valid enrollment event on the line: it is not
      an enrollment event, it does not parse, or its event_type, time,
      event data, course_id or user_id is missing or invalid.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    NOTE(review): the example shows six fractional digits although the text
    above says millisecond resolution -- confirm against
    eventlog.datetime_to_timestamp.

    """
    # Before parsing, check that the line contains something that
    # suggests it's an enrollment event.
    if 'edx.course.enrollment' not in line:
        return None

    # try to parse the line into a dict:
    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    # get event type, and check that it exists:
    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # convert the type to a value:
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # not an enrollment event...
        return None

    # get the timestamp (the local is not called `datetime`, to avoid
    # shadowing the standard-library module name):
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)

    # Get the event data:
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  A missing course_id
    # is reported the same way as an invalid one, instead of raising KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error(
            "encountered explicit enrollment event with bogus course_id: %s",
            event)
        return None

    # Get the user_id from the data:
    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s",
                  event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)