def get_course_id(event, from_url=False):
    """
    Return the course_id associated with an event, or None.

    The explicit `course_id` in the event's `context` is used when present and
    valid; an invalid one is logged and rejected.  When the context carries no
    course_id and `from_url` is true, the id is recovered from a URL held in
    the event (`event_type` for server events, `page` for browser events).
    """
    context = event.get('context')
    if context is None:
        # Events without a context are assumed to be old; skip them silently.
        return None

    explicit_id = context.get('course_id', '')
    if explicit_id:
        if not opaque_key_util.is_valid_course_id(explicit_id):
            log.error("encountered event with bogus course_id: %s", event)
            return None
        return explicit_id

    if not from_url:
        return None

    # Pick the event field that holds a URL for this kind of event source.
    source = event.get('event_source')
    if source == 'server':
        url_field = 'event_type'
    elif source == 'browser':
        url_field = 'page'
    else:
        url_field = None
    url = '' if url_field is None else event.get(url_field, '')

    course_key = opaque_key_util.get_course_key_from_url(url)
    return unicode(course_key) if course_key else None
Esempio n. 2
0
def get_course_id(event, from_url=False):
    """
    Look up the course_id for an event.

    Returns the validated `course_id` from the event's `context`, or, when
    that is empty and `from_url` is set, a course_id derived from the URL in
    `event_type` (server events) or `page` (browser events).  Returns None
    when no context exists, the course_id is invalid, or nothing is found.
    """
    event_context = event.get('context')
    if event_context is None:
        # No context at all: assumed to be an old event, not worth logging.
        return None

    course_id = event_context.get('course_id', '')
    if course_id:
        is_valid = opaque_key_util.is_valid_course_id(course_id)
        if not is_valid:
            log.error("encountered event with bogus course_id: %s", event)
        return course_id if is_valid else None

    if from_url:
        # Implicit server events keep the URL in `event_type`; browser events
        # keep it in `page`.  Any other source has no URL to inspect.
        event_source = event.get('event_source')
        url = ''
        if event_source == 'server':
            url = event.get('event_type', '')
        elif event_source == 'browser':
            url = event.get('page', '')

        key_from_url = opaque_key_util.get_course_key_from_url(url)
        if key_from_url:
            return unicode(key_from_url)

    return None
    def mapper(self, line):
        """
        Emit one record per explicit enrollment activation or deactivation event.

        Yields ((course_id, user_id), (timestamp, event_type)).  Lines that do not
        produce an event, events of any other type, and events missing a usable
        timestamp, payload, course_id or user_id yield nothing.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, _unused_date_string = parsed

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return
        elif event_type not in (DEACTIVATED, ACTIVATED):
            # Some other kind of event; not an error, just not ours.
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            return

        course_id = payload.get('course_id')
        if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
            log.error("encountered explicit enrollment event with invalid course_id: %s", event)
            return

        user_id = payload.get('user_id')
        if user_id is None:
            log.error("encountered explicit enrollment event with no user_id: %s", event)
            return

        key = (course_id, user_id)
        yield key, (timestamp, event_type)
Esempio n. 4
0
    def mapper(self, line):
        """
        Emit one record per explicit enrollment event, keyed by course and user.

        Handles the DEACTIVATED, ACTIVATED, MODE_CHANGED and VALIDATED event types.
        Yields ((utf-8 encoded course_id, user_id),
                (timestamp, event_type, mode, validation_info)),
        where validation_info is None unless the payload contains 'dump_start'.
        Events missing any required field yield nothing.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, _unused_date_string = parsed

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return
        elif event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED):
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            return

        raw_course_id = payload.get('course_id')
        course_id = opaque_key_util.normalize_course_id(raw_course_id)
        if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
            log.error("encountered explicit enrollment event with invalid course_id: %s", event)
            return

        user_id = payload.get('user_id')
        if user_id is None:
            log.error("encountered explicit enrollment event with no user_id: %s", event)
            return

        mode = payload.get('mode')
        if mode is None:
            log.error("encountered explicit enrollment event with no mode: %s", event)
            return

        # Only synthetic enrollment validation events carry these extra properties;
        # the presence of 'dump_start' is what identifies them here.
        if 'dump_start' in payload:
            validation_info = {
                name: payload.get(name)
                for name in ('is_active', 'created', 'dump_start', 'dump_end')
            }
        else:
            validation_info = None

        # String parts of the key are encoded explicitly; user_id is an int and is left as-is.
        encoded_course_id = unicode(course_id).encode('utf-8')
        yield (encoded_course_id, user_id), (timestamp, event_type, mode, validation_info)
    def mapper(self, line):
        """
        Emit one record for each link-click event found on a tracking-log line.

        Yields ((course_id, date_string), is_external), where is_external is 1 when
        the target URL names a host different from the current page's host, and 0
        otherwise.  Lines that are not link-click events, or whose event lacks a
        payload, context, valid course_id, target_url or current_url, yield nothing.
        """
        # Cheap substring screen so unrelated lines are never parsed.
        if LINK_CLICKED not in line:
            return

        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        event_type = event.get('event_type')
        if not event_type:
            log.error("encountered event with no event_type: %s", event)
            return
        elif event_type != LINK_CLICKED:
            # The substring matched somewhere else in the line.
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            log.error("encountered explicit link_clicked event with no event data: %s", event)
            return

        context = event.get('context')
        if not context:
            log.error("encountered explicit link_clicked event with no context: %s", event)
            return

        course_id = context.get('course_id')
        if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
            log.error("encountered explicit link_clicked event with invalid course_id: %s", event)
            return

        target_url = payload.get('target_url')
        if not target_url:
            log.error("encountered explicit link_clicked event with no target_url: %s", event)
            return

        current_url = payload.get('current_url')
        if not current_url:
            log.error("encountered explicit link_clicked event with no current_url: %s", event)
            return

        # A link is "internal" when it stays on the current host.  Relative links
        # (those starting with /) have an empty host and therefore count as internal.
        current_host = urlparse(current_url).netloc
        target_host = urlparse(target_url).netloc
        is_external = 1 if (target_host != "" and current_host != target_host) else 0

        yield (course_id, date_string), is_external
    def mapper(self, line):
        """
        Emit one record per explicit enrollment event, keyed by course and user.

        Handles the DEACTIVATED, ACTIVATED, MODE_CHANGED and VALIDATED event types.
        Yields ((utf-8 encoded course_id, user_id),
                (timestamp, event_type, mode, validation_info)),
        where validation_info is None unless the payload contains 'dump_start'.
        Events missing any required field yield nothing.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, _unused_date_string = parsed

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return
        elif event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED):
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            return

        course_id = payload.get('course_id')
        if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
            log.error("encountered explicit enrollment event with invalid course_id: %s", event)
            return

        user_id = payload.get('user_id')
        if user_id is None:
            log.error("encountered explicit enrollment event with no user_id: %s", event)
            return

        mode = payload.get('mode')
        if mode is None:
            log.error("encountered explicit enrollment event with no mode: %s", event)
            return

        # Extra properties that only synthetic enrollment validation events provide;
        # 'dump_start' being present is what marks such an event here.
        validation_info = None
        if 'dump_start' in payload:
            validation_info = {
                name: payload.get(name)
                for name in ('is_active', 'created', 'dump_start', 'dump_end')
            }

        # String parts of the key are encoded explicitly; user_id is an int and is left as-is.
        encoded_course_id = unicode(course_id).encode('utf-8')
        yield (encoded_course_id, user_id), (timestamp, event_type, mode, validation_info)
    def mapper(self, line):
        """
        Emit one record per explicit enrollment event, keyed by the event's date.

        Handles the DEACTIVATED, ACTIVATED and MODE_CHANGED event types.
        Yields (date_string, (course_id, user_id, timestamp, event_type, mode)).
        Events missing a timestamp, payload, valid course_id, user_id or mode
        yield nothing.
        """
        parsed = self.get_event_and_date_string(line)
        if parsed is None:
            return
        event, date_string = parsed

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return
        elif event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED):
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        payload = eventlog.get_event_data(event)
        if payload is None:
            return

        raw_course_id = payload.get('course_id')
        course_id = opaque_key_util.normalize_course_id(raw_course_id)
        if not (course_id is not None and opaque_key_util.is_valid_course_id(course_id)):
            log.error("encountered explicit enrollment event with invalid course_id: %s", event)
            return

        user_id = payload.get('user_id')
        if user_id is None:
            log.error("encountered explicit enrollment event with no user_id: %s", event)
            return

        mode = payload.get('mode')
        if mode is None:
            log.error("encountered explicit enrollment event with no mode: %s", event)
            return

        record = (course_id, user_id, timestamp, event_type, mode)
        yield date_string, record
Esempio n. 8
0
    def run(self):
        """
        Convert the course list read from the input target into tab-separated rows.

        The input is parsed as JSON and its 'results' entry is treated as the list
        of courses.  Each course becomes one line holding id, org, number, run,
        start, end and name.  Absent values are written as a backslash-N marker
        (presumably the Hive NULL representation -- confirm against the consumer).
        """
        self.remove_output_on_overwrite()
        with self.input().open('r') as input_file:
            course_structure = json.load(input_file)
            with self.output().open('w') as output_file:
                courses_list = course_structure.get('results')
                if not courses_list:
                    # No courses, or no 'results' key in the JSON: leave the output empty.
                    return
                for course in courses_list:
                    # For robustness, anything in the response that is not a dictionary
                    # fails with AttributeError below and is skipped.
                    try:
                        raw_start = course.get('start')
                        raw_end = course.get('end')
                        start_value = '\N' if raw_start is None else ciso8601.parse_datetime(raw_start)
                        end_value = '\N' if raw_end is None else ciso8601.parse_datetime(raw_end)

                        course_id = course.get('id', '\N')
                        course_run = '\N'
                        if is_valid_course_id(course_id):
                            course_run = CourseKey.from_string(course_id).run

                        fields = [
                            course_id,
                            course.get('org', '\N'),
                            course.get('number', '\N'),
                            course_run,
                            coerce_timestamp_for_hive(start_value),
                            coerce_timestamp_for_hive(end_value),
                            course.get('name', '\N'),
                        ]
                        encoded_fields = [field.encode('utf-8') for field in fields]
                        output_file.write('\t'.join(encoded_fields))
                        output_file.write('\n')
                    except AttributeError:
                        # Not a dictionary (or a field could not be encoded); move on.
                        continue
def get_course_id(event):
    """
    Return the validated course_id from an event's context, or None.

    None is returned when the event has no context, when the context holds no
    course_id, or when the course_id fails validation (which is also logged).
    """
    context = event.get('context')
    if context is None:
        # Events without a context are assumed to be old; skip them silently.
        return None

    course_id = context.get('course_id', '')
    if course_id and opaque_key_util.is_valid_course_id(course_id):
        return course_id

    if course_id:
        # A course_id was supplied but did not validate.
        log.error("encountered event with bogus course_id: %s", event)
    return None
def get_course_id(event):
    """
    Extract the course_id recorded in an event's context.

    Returns the course_id when it is present and valid.  Returns None when the
    event has no context or no course_id, and logs an error before returning
    None when the course_id is present but invalid.
    """
    event_context = event.get('context')
    if event_context is None:
        # No context: assumed to be an old event, not worth logging.
        return None

    candidate = event_context.get('course_id', '')
    if not candidate:
        return None

    if opaque_key_util.is_valid_course_id(candidate):
        return candidate

    log.error("encountered event with bogus course_id: %s", event)
    return None
    def run(self):
        """
        Write one tab-separated row per course found in the JSON input.

        The 'results' entry of the parsed input is taken as the course list.  Each
        row contains id, org, number, run, start, end and name, with a backslash-N
        marker standing in for absent values (presumably the Hive NULL
        representation -- confirm against the consumer of this output).
        """
        self.remove_output_on_overwrite()
        with self.input().open('r') as input_file:
            course_structure = json.load(input_file)
            with self.output().open('w') as output_file:
                courses_list = course_structure.get('results')
                if not courses_list:
                    # Either there are no courses or 'results' is missing: output nothing.
                    return
                for course in courses_list:
                    # Non-dictionary entries raise AttributeError on .get() and are ignored,
                    # which keeps the task robust against stray data in the API response.
                    try:
                        start_string = course.get('start')
                        end_string = course.get('end')

                        cleaned_start = '\N'
                        if start_string is not None:
                            cleaned_start = ciso8601.parse_datetime(start_string)
                        cleaned_end = '\N'
                        if end_string is not None:
                            cleaned_end = ciso8601.parse_datetime(end_string)

                        course_id = course.get('id', '\N')
                        if not is_valid_course_id(course_id):
                            course_run = '\N'
                        else:
                            course_run = CourseKey.from_string(course_id).run

                        row = [
                            course_id,
                            course.get('org', '\N'),
                            course.get('number', '\N'),
                            course_run,
                            coerce_timestamp_for_hive(cleaned_start),
                            coerce_timestamp_for_hive(cleaned_end),
                            course.get('name', '\N'),
                        ]
                        output_file.write('\t'.join([value.encode('utf-8') for value in row]))
                        output_file.write('\n')
                    except AttributeError:
                        # Skip this entry and carry on with the next one.
                        continue
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value is ENROLLED for an activation event and
        UNENROLLED for a deactivation event, and timestamp is the value
        produced by eventlog.datetime_to_timestamp for the event's time.

      or None if there is no valid enrollment event on the line.  This
      includes events whose data carries no course_id or user_id, or an
      invalid course_id; those are logged rather than raised.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Before parsing, check that the line contains something that
    # suggests it's an enrollment event.
    if 'edx.course.enrollment' not in line:
        return None

    # try to parse the line into a dict:
    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    # get event type, and check that it exists:
    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # convert the type to a value:
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # not an enrollment event...
        return None

    # get the timestamp (named event_time so the stdlib `datetime`
    # module name is not shadowed):
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)

    # Get the event data:
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  Use .get() so that an
    # event lacking the field is logged and skipped instead of raising KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit enrollment event with bogus course_id: %s", event)
        return None

    # Get the user_id from the data:
    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)
Esempio n. 13
0
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

      line: text line from a tracking event log.

    Returns:

      (course_id, user_id), (timestamp, action_value)

        where action_value is ENROLLED for an activation event and
        UNENROLLED for a deactivation event, and timestamp is the value
        produced by eventlog.datetime_to_timestamp for the event's time.

      or None if there is no valid enrollment event on the line.  This
      includes events whose data carries no course_id or user_id, or an
      invalid course_id; those are logged rather than raised.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Before parsing, check that the line contains something that
    # suggests it's an enrollment event.
    if 'edx.course.enrollment' not in line:
        return None

    # try to parse the line into a dict:
    event = eventlog.parse_json_event(line)
    if event is None:
        # The line didn't parse.  For this specific purpose,
        # we can assume that all enrollment-related lines would parse,
        # and these non-parsing lines would get skipped anyway.
        return None

    # get event type, and check that it exists:
    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # convert the type to a value:
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # not an enrollment event...
        return None

    # get the timestamp (named event_time so the stdlib `datetime`
    # module name is not shadowed):
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)

    # Get the event data:
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  Use .get() so that an
    # event lacking the field is logged and skipped instead of raising KeyError.
    course_id = event_data.get('course_id')
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error(
            "encountered explicit enrollment event with bogus course_id: %s",
            event)
        return None

    # Get the user_id from the data:
    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s",
                  event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').

    return (course_id, user_id), (timestamp, action_value)
Esempio n. 14
0
 def test_just_newline_course_id(self):
     # A course_id consisting of nothing but a newline must be rejected.
     is_valid = opaque_key_util.is_valid_course_id('\n')
     self.assertFalse(is_valid)
Esempio n. 15
0
 def test_normal_legacy_course_id(self):
     # The VALID_LEGACY_COURSE_ID constant is expected to pass validation.
     is_valid = opaque_key_util.is_valid_course_id(VALID_LEGACY_COURSE_ID)
     self.assertTrue(is_valid)
Esempio n. 16
0
 def test_legacy_course_id_without_components(self):
     # The INVALID_LEGACY_COURSE_ID constant is expected to fail validation.
     is_valid = opaque_key_util.is_valid_course_id(INVALID_LEGACY_COURSE_ID)
     self.assertFalse(is_valid)
Esempio n. 17
0
def get_problem_check_event(line):
    """
    Generates output values for explicit problem_check events.
    Args:
        line: text line from a tracking event log.
    Returns:
        (course_id, problem_id, username), (timestamp, problem_check_info)
        where timestamp is taken from the event's 'timestamp' field
        and problem_check_info is a JSON-serialized dict containing
        the contents of the problem_check event's 'event' field,
        augmented with entries for 'timestamp', 'username', and
        'context' from the event.
        or None if there is no valid problem_check event on the line
        (including events with no context, no course_id, an invalid
        course_id, no problem_id, or no answers).
    Example:
            (edX/DemoX/Demo_Course, i4x://edX/DemoX/Demo_Course/problem/PS1_P1, dummy_username), (2013-09-10T00:01:05.123456, blah)
    """
    # Parse the line into a dict.
    event = eventlog.parse_json_server_event(line, 'problem_check')
    if event is None:
        return None

    # Get the "problem data".  This is the event data, the context, and anything else that would
    # be useful further downstream.  (We could just pass the entire event dict?)

    # Get the user from the username, not from the user_id in the
    # context.  While we are currently requiring context (as described
    # above), we might not in future.  Older events will not have
    # context information, so we can't rely on user_id from there.
    # And we don't expect problem_check events to occur without a
    # username, and don't expect them to occur with the wrong user
    # (i.e. one user acting on behalf of another, as in an instructor
    # acting on behalf of a student).
    augmented_data_fields = ['context', 'username', 'timestamp']
    problem_data = eventlog.get_augmented_event_data(event, augmented_data_fields)
    if problem_data is None:
        return None

    # Get the course_id from context.  We won't work with older events
    # that do not have context information, since they do not directly
    # provide course_id information.  (The problem_id/answer_id values
    # contain the org and course name, but not the run.)  Course_id
    # information could be found from other events, but it would
    # require expanding the events being selected.
    # A missing or None context is treated like a missing course_id, so such
    # events are logged and skipped instead of raising AttributeError.
    context = problem_data.get('context') or {}
    course_id = context.get('course_id')
    if course_id is None:
        log.error("encountered explicit problem_check event with missing course_id: %s", event)
        return None

    if not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit problem_check event with bogus course_id: %s", event)
        return None

    # Get the problem_id from the event data.
    problem_id = problem_data.get('problem_id')
    if problem_id is None:
        log.error("encountered explicit problem_check event with bogus problem_id: %s", event)
        return None

    # Events that carry no answers (empty or absent) are of no interest downstream.
    if not event.get('event', {}).get('answers', []):
        return None

    problem_data_json = json.dumps(problem_data)
    key = (course_id, problem_id, problem_data.get('username'))
    value = (problem_data.get('timestamp'), problem_data_json)

    return key, value
 def test_course_id_with_valid_nonascii(self):
     # The VALID_NONASCII_LEGACY_COURSE_ID constant is expected to pass validation.
     is_valid = opaque_key_util.is_valid_course_id(VALID_NONASCII_LEGACY_COURSE_ID)
     self.assertTrue(is_valid)
 def test_no_course_id(self):
     # None in place of a course_id must be reported as invalid.
     is_valid = opaque_key_util.is_valid_course_id(None)
     self.assertFalse(is_valid)
Esempio n. 20
0
 def test_empty_course_id(self):
     # The empty string must be reported as an invalid course_id.
     is_valid = opaque_key_util.is_valid_course_id('')
     self.assertFalse(is_valid)
 def test_invalid_course_id(self, course_id):
     # The course_id passed in is expected to fail validation.
     is_valid = opaque_key_util.is_valid_course_id(course_id)
     self.assertFalse(is_valid)
 def test_invalid_course_id(self, course_id):
     # Validation must reject the course_id handed to this test.
     validation_result = opaque_key_util.is_valid_course_id(course_id)
     self.assertFalse(validation_result)
Esempio n. 23
0
def get_problem_check_event(line_or_event):
    """
    Generates output values for explicit problem_check events.

    Args:

        line_or_event: pre-parsed event dict, or text line from a tracking event log

    Returns:

        (course_id, problem_id, username), (timestamp, problem_check_info)

        where timestamp is taken from the event's 'timestamp' field
        and problem_check_info is a JSON-serialized dict containing
        the contents of the problem_check event's 'event' field,
        augmented with entries for 'timestamp', 'username', and
        'context' from the event.

        or None if there is no valid problem_check event on the line
        (including events with no course_id, an invalid course_id, no
        problem_id, no answers, or answers that fail _check_answer_ids).

    Example:
            (edX/DemoX/Demo_Course, i4x://edX/DemoX/Demo_Course/problem/PS1_P1, dummy_username), (2013-09-10T00:01:05.123456, blah)

    """
    # Ensure the given event dict is a problem_check event
    if isinstance(line_or_event, dict):
        event = line_or_event
        if event.get('event_type') != 'problem_check':
            return None

    # Parse the line into an event dict, if not provided.
    else:
        event = eventlog.parse_json_server_event(line_or_event, 'problem_check')
        if event is None:
            return None

    # Get the "problem data".  This is the event data, the context, and anything else that would
    # be useful further downstream.  (We could just pass the entire event dict?)

    # Get the user from the username, not from the user_id in the
    # context.  While we are currently requiring context (as described
    # above), we might not in future.  Older events will not have
    # context information, so we can't rely on user_id from there.
    # And we don't expect problem_check events to occur without a
    # username, and don't expect them to occur with the wrong user
    # (i.e. one user acting on behalf of another, as in an instructor
    # acting on behalf of a student).
    augmented_data_fields = ['context', 'username', 'timestamp']
    problem_data = eventlog.get_augmented_event_data(event, augmented_data_fields)
    if problem_data is None:
        return None

    # Get the course_id from context.  We won't work with older events
    # that do not have context information, since they do not directly
    # provide course_id information.  (The problem_id/answer_id values
    # contain the org and course name, but not the run.)  Course_id
    # information could be found from other events, but it would
    # require expanding the events being selected.
    course_id = eventlog.get_course_id(event)
    if course_id is None:
        log.error("encountered explicit problem_check event with missing course_id: %s", event)
        return None

    if not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit problem_check event with bogus course_id: %s", event)
        return None

    # Get the problem_id from the event data.
    problem_id = problem_data.get('problem_id')
    if problem_id is None:
        log.error("encountered explicit problem_check event with bogus problem_id: %s", event)
        return None

    # Keep `event` bound to the whole event so the error below logs the full
    # event, like every other error message in this function.
    payload = event.get('event', {})
    answers = payload.get('answers', {})
    if not answers:
        # Events that carry no answers are of no interest downstream.
        return None

    try:
        _check_answer_ids(answers)
        _check_answer_ids(payload.get('submission', {}))
    except (TypeError, ValueError):
        log.error("encountered explicit problem_check event with invalid answers: %s", event)
        return None

    problem_data_json = json.dumps(problem_data)
    key = (course_id, problem_id, problem_data.get('username'))
    value = (problem_data.get('timestamp'), problem_data_json)

    return key, value
Esempio n. 24
0
 def test_course_id_with_invalid_nonascii(self):
     # The INVALID_NONASCII_LEGACY_COURSE_ID constant is expected to fail validation.
     is_valid = opaque_key_util.is_valid_course_id(INVALID_NONASCII_LEGACY_COURSE_ID)
     self.assertFalse(is_valid)
Esempio n. 25
0
 def test_no_course_id(self):
     # Passing None where a course_id is expected must not validate.
     validation_result = opaque_key_util.is_valid_course_id(None)
     self.assertFalse(validation_result)
 def test_valid_course_id(self, course_id):
     # The course_id passed in is expected to pass validation.
     is_valid = opaque_key_util.is_valid_course_id(course_id)
     self.assertTrue(is_valid)
 def test_normal_legacy_course_id(self):
     # Validation must accept the VALID_LEGACY_COURSE_ID constant.
     validation_result = opaque_key_util.is_valid_course_id(VALID_LEGACY_COURSE_ID)
     self.assertTrue(validation_result)
 def test_valid_course_id(self, course_id):
     # Validation must accept the course_id handed to this test.
     validation_result = opaque_key_util.is_valid_course_id(course_id)
     self.assertTrue(validation_result)
 def test_legacy_course_id_without_components(self):
     # Validation must reject the INVALID_LEGACY_COURSE_ID constant.
     validation_result = opaque_key_util.is_valid_course_id(INVALID_LEGACY_COURSE_ID)
     self.assertFalse(validation_result)
 def test_course_id_with_nonascii(self):
     # The NONASCII_LEGACY_COURSE_ID constant is expected to fail validation.
     is_valid = opaque_key_util.is_valid_course_id(NONASCII_LEGACY_COURSE_ID)
     self.assertFalse(is_valid)
Esempio n. 31
0
 def test_newline_terminated_course_id(self):
     # VALID_COURSE_ID with a newline appended must no longer validate.
     newline_terminated = VALID_COURSE_ID + '\n'
     is_valid = opaque_key_util.is_valid_course_id(newline_terminated)
     self.assertFalse(is_valid)