def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return

        if event_type not in (DEACTIVATED, ACTIVATED):
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        course_id = event_data.get('course_id')
        if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
            log.error("encountered explicit enrollment event with invalid course_id: %s", event)
            return

        user_id = event_data.get('user_id')
        if user_id is None:
            log.error("encountered explicit enrollment event with no user_id: %s", event)
            return

        yield (course_id, user_id), (timestamp, event_type)
Esempio n. 2
0
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return

        if event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED):
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        course_id = opaque_key_util.normalize_course_id(
            event_data.get('course_id'))
        if course_id is None or not opaque_key_util.is_valid_course_id(
                course_id):
            log.error(
                "encountered explicit enrollment event with invalid course_id: %s",
                event)
            return

        user_id = event_data.get('user_id')
        if user_id is None:
            log.error(
                "encountered explicit enrollment event with no user_id: %s",
                event)
            return

        mode = event_data.get('mode')
        if mode is None:
            log.error("encountered explicit enrollment event with no mode: %s",
                      event)
            return

        # Pull in extra properties provided only by synthetic enrollment validation events.
        validation_info = None
        if 'dump_start' in event_data:
            validation_info = {
                'is_active': event_data.get('is_active'),
                'created': event_data.get('created'),
                'dump_start': event_data.get('dump_start'),
                'dump_end': event_data.get('dump_end'),
            }

        # Make sure key values that are strings are properly encoded.
        # Note, however, that user_id is an int.
        key = (unicode(course_id).encode('utf-8'), user_id)
        yield key, (timestamp, event_type, mode, validation_info)
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return

        if event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED):
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        course_id = event_data.get('course_id')
        if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
            log.error("encountered explicit enrollment event with invalid course_id: %s", event)
            return

        user_id = event_data.get('user_id')
        if user_id is None:
            log.error("encountered explicit enrollment event with no user_id: %s", event)
            return

        mode = event_data.get('mode')
        if mode is None:
            log.error("encountered explicit enrollment event with no mode: %s", event)
            return

        # Pull in extra properties provided only by synthetic enrollment validation events.
        validation_info = None
        if 'dump_start' in event_data:
            validation_info = {
                'is_active': event_data.get('is_active'),
                'created': event_data.get('created'),
                'dump_start': event_data.get('dump_start'),
                'dump_end': event_data.get('dump_end'),
            }

        # Make sure key values that are strings are properly encoded.
        # Note, however, that user_id is an int.
        key = (unicode(course_id).encode('utf-8'), user_id)
        yield key, (timestamp, event_type, mode, validation_info)
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        user_id = event.get('context', {}).get('user_id')
        if not user_id:
            return

        try:
            user_id = int(user_id)
        except ValueError:
            self.incr_counter('User Location',
                              'Discard event with malformed user_id', 1)
            return

        # Get timestamp instead of date string, so we get the latest ip
        # address for events on the same day.
        timestamp = eventlog.get_event_time_string(event)
        if not timestamp:
            return

        ip_address = event.get('ip')
        if not ip_address:
            log.warning("No ip_address found for user '%s' on '%s'.", user_id,
                        timestamp)
            return

        # Get the course_id from context, if it happens to be present.
        # It's okay if it isn't.

        # (Not sure if there are particular types of course
        # interaction we care about, but we might want to only collect
        # the course_id off of explicit events, and ignore implicit
        # events as not being "real" interactions with course content.
        # Or maybe we add a flag indicating explicit vs. implicit, so
        # that this can be better teased apart.  For example, we could
        # use the latest explicit event for a course, but if there are
        # none, then use the latest implicit event for the course, and
        # if there are none, then use the latest overall event.)
        course_id = eventlog.get_course_id(event)

        # For multi-output, we will generate a single file for each key value.
        # When looking at location for user in a course, we don't want to have
        # an output file per course per date, so just use date as the key,
        # and have a single file representing all events on the date.
        yield date_string, (timestamp, ip_address, course_id, user_id)
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return

        if event_type not in (DEACTIVATED, ACTIVATED, MODE_CHANGED):
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            log.error("encountered event with bad timestamp: %s", event)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        course_id = opaque_key_util.normalize_course_id(
            event_data.get('course_id'))
        if course_id is None or not opaque_key_util.is_valid_course_id(
                course_id):
            log.error(
                "encountered explicit enrollment event with invalid course_id: %s",
                event)
            return

        user_id = event_data.get('user_id')
        if user_id is None:
            log.error(
                "encountered explicit enrollment event with no user_id: %s",
                event)
            return

        mode = event_data.get('mode')
        if mode is None:
            log.error("encountered explicit enrollment event with no mode: %s",
                      event)
            return

        yield date_string, (course_id, user_id, timestamp, event_type, mode)
    def mapper(self, line):
        """
        Args:
            line: text line from a tracking event log.

        Yields:  (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)

        """
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _ = value

        if event.get('event_type') != 'problem_check' or event.get(
                'event_source') != 'server':
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        org_id = opaque_key_util.get_org_id_for_course(course_id)

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        problem_id = event_data.get('problem_id')
        if not problem_id:
            return

        is_correct = event_data.get('success') == 'correct'

        saved_tags = event.get('context').get('asides', {}).get(
            'tagging_aside', {}).get('saved_tags', {})

        yield (course_id, org_id, problem_id), (timestamp, saved_tags,
                                                is_correct)
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        username = eventlog.get_event_username(event)
        if not username:
            return

        # Get timestamp instead of date string, so we get the latest ip
        # address for events on the same day.
        timestamp = eventlog.get_event_time_string(event)
        if not timestamp:
            return

        ip_address = event.get('ip')
        if not ip_address:
            log.warning("No ip_address found for user '%s' on '%s'.", username, timestamp)
            return

        # Get the course_id from context, if it happens to be present.
        # It's okay if it isn't.

        # (Not sure if there are particular types of course
        # interaction we care about, but we might want to only collect
        # the course_id off of explicit events, and ignore implicit
        # events as not being "real" interactions with course content.
        # Or maybe we add a flag indicating explicit vs. implicit, so
        # that this can be better teased apart.  For example, we could
        # use the latest explicit event for a course, but if there are
        # none, then use the latest implicit event for the course, and
        # if there are none, then use the latest overall event.)
        course_id = eventlog.get_course_id(event)

        # For multi-output, we will generate a single file for each key value.
        # When looking at location for user in a course, we don't want to have
        # an output file per course per date, so just use date as the key,
        # and have a single file representing all events on the date.
        yield date_string, (timestamp, ip_address, course_id, username)
    def mapper(self, line):
        """
        Args:
            line: text line from a tracking event log.

        Yields:  (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)

        """
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _ = value

        if event.get('event_type') != 'problem_check' or event.get('event_source') != 'server':
            return

        timestamp = eventlog.get_event_time_string(event)
        if timestamp is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        org_id = opaque_key_util.get_org_id_for_course(course_id)

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        problem_id = event_data.get('problem_id')
        if not problem_id:
            return

        is_correct = event_data.get('success') == 'correct'

        saved_tags = event.get('context').get('asides', {}).get('tagging_aside', {}).get('saved_tags', {})

        yield (course_id, org_id, problem_id), (timestamp, saved_tags, is_correct)
    def mapper(self, line):
        # Add a filter here to permit quicker rejection of unrelated events.
        if VIDEO_EVENT_MINIMUM_STRING not in line:
            # self.incr_counter(self.counter_category_name, 'Discard Missing Video String', 1)
            return

        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value
        # self.incr_counter(self.counter_category_name, 'Inputs with Dates', 1)

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            self.incr_counter(self.counter_category_name,
                              'Discard Missing Event Type', 1)
            return

        if event_type not in VIDEO_EVENT_TYPES:
            # self.incr_counter(self.counter_category_name, 'Discard Non-Video Event Type', 1)
            return

        # self.incr_counter(self.counter_category_name, 'Input Video Events', 1)

        # This has already been checked when getting the event, so just fetch the value.
        timestamp = eventlog.get_event_time_string(event)

        # Strip username to remove trailing newlines that mess up Luigi.
        username = event.get('username', '').strip()
        if not username:
            log.error("Video event without username: %s", event)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing username', 1)
            return

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.warn('Video event without valid course_id: {0}'.format(line))
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing course_id', 1)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            # This should already have been logged.
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Event Data', 1)
            return

        encoded_module_id = event_data.get(
            'id', '').strip()  # we have seen id values with leading newline
        if not encoded_module_id:
            log.warn(
                'Video event without valid encoded_module_id (id): {0}'.format(
                    line))
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            ## self.incr_counter(self.counter_category_name, 'Discard Video Missing encoded_module_id', 1)
            return

        video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION)
        if not video_duration:
            # events may have a 'duration' value of null, so use the same default for those as well.
            video_duration = VIDEO_UNKNOWN_DURATION

        # self.incr_counter(self.counter_category_name, 'Video Events Before Time Check', 1)

        current_time = None
        old_time = None
        youtube_id = None
        if event_type == VIDEO_PLAYED:
            code = event_data.get('code')
            if code not in VIDEO_CODES:
                youtube_id = code
            current_time = self._check_time_offset(
                event_data.get('currentTime'), line)
            if current_time is None:
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Play', 1)
                return
            ### self.incr_counter(self.counter_category_name, 'Subset Play', 1)
        elif event_type == VIDEO_PAUSED:
            # Pause events may have a missing currentTime value if video is paused at the beginning,
            # so provide a default of zero.
            current_time = self._check_time_offset(
                event_data.get('currentTime', 0), line)
            if current_time is None:
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Pause', 1)
                return
            ### self.incr_counter(self.counter_category_name, 'Subset Pause', 1)
        elif event_type == VIDEO_SEEK:
            current_time = self._check_time_offset(event_data.get('new_time'),
                                                   line)
            old_time = self._check_time_offset(event_data.get('old_time'),
                                               line)
            if current_time is None or old_time is None:
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Seek', 1)
                return
            ### self.incr_counter(self.counter_category_name, 'Subset Seek', 1)
        elif event_type == VIDEO_STOPPED:
            current_time = self._check_time_offset(
                event_data.get('currentTime'), line)
            if current_time is None:
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                ## self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Stop', 1)
                return
            ### self.incr_counter(self.counter_category_name, 'Subset Stop', 1)

        if youtube_id is not None:
            youtube_id = youtube_id.encode('utf8')

        # self.incr_counter(self.counter_category_name, 'Output Video Events from Mapper', 1)
        yield ((username.encode('utf8'), course_id.encode('utf8'),
                encoded_module_id.encode('utf8')),
               (timestamp, event_type, current_time, old_time, youtube_id,
                video_duration))
Esempio n. 10
0
    def mapper(self, line):
        # Add a filter here to permit quicker rejection of unrelated events.
        if VIDEO_EVENT_MINIMUM_STRING not in line:
            # self.incr_counter(self.counter_category_name, 'Discard Missing Video String', 1)
            return

        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value
        # self.incr_counter(self.counter_category_name, 'Inputs with Dates', 1)

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            self.incr_counter(self.counter_category_name, 'Discard Missing Event Type', 1)
            return

        if event_type not in VIDEO_EVENT_TYPES:
            # self.incr_counter(self.counter_category_name, 'Discard Non-Video Event Type', 1)
            return

        # self.incr_counter(self.counter_category_name, 'Input Video Events', 1)

        # This has already been checked when getting the event, so just fetch the value.
        timestamp = eventlog.get_event_time_string(event)

        user_id = event.get('context', {}).get('user_id')
        if not user_id:
            log.error("Video event without user_id in context: %s", event)
            return
        # Convert user_id to int if str
        if not isinstance(user_id, int):
            user_id = int(user_id)

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.warn('Video event without valid course_id: {0}'.format(line))
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing course_id', 1)
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            # This should already have been logged.
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Event Data', 1)
            return

        encoded_module_id = event_data.get('id', '').strip()  # we have seen id values with leading newline
        if not encoded_module_id:
            log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
            # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing encoded_module_id', 1)
            return

        video_duration = event_data.get('duration', VIDEO_UNKNOWN_DURATION)
        if not video_duration:
            # events may have a 'duration' value of null, so use the same default for those as well.
            video_duration = VIDEO_UNKNOWN_DURATION

        # self.incr_counter(self.counter_category_name, 'Video Events Before Time Check', 1)

        current_time = None
        old_time = None
        youtube_id = None
        if event_type == VIDEO_PLAYED:
            code = event_data.get('code')
            if code not in VIDEO_CODES:
                youtube_id = code
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Play', 1)
                return
            # Slow: self.incr_counter(self.counter_category_name, 'Subset Play', 1)
        elif event_type == VIDEO_PAUSED:
            # Pause events may have a missing currentTime value if video is paused at the beginning,
            # so provide a default of zero.
            current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
            if current_time is None:
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Pause', 1)
                return
            # Slow: self.incr_counter(self.counter_category_name, 'Subset Pause', 1)
        elif event_type == VIDEO_SEEK:
            current_time = self._check_time_offset(event_data.get('new_time'), line)
            old_time = self._check_time_offset(event_data.get('old_time'), line)
            if current_time is None or old_time is None:
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Seek', 1)
                return
            # Slow: self.incr_counter(self.counter_category_name, 'Subset Seek', 1)
        elif event_type == VIDEO_STOPPED:
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Something', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time', 1)
                # Slow: self.incr_counter(self.counter_category_name, 'Discard Video Missing Time From Stop', 1)
                return
            # Slow: self.incr_counter(self.counter_category_name, 'Subset Stop', 1)

        if youtube_id is not None:
            youtube_id = youtube_id.encode('utf8')

        # self.incr_counter(self.counter_category_name, 'Output Video Events from Mapper', 1)
        yield (
            (user_id, course_id.encode('utf8'), encoded_module_id.encode('utf8')),
            (timestamp, event_type, current_time, old_time, youtube_id, video_duration)
        )
Esempio n. 11
0
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        username = event.get('username', '').strip()
        if not username:
            return

        event_type = event.get('event_type')
        if event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        event_source = event.get('event_source')

        entity_id = ''
        info = {}
        forum_post_voted = None
        if event_type == 'problem_check':
            if event_source != 'server':
                return

            problem_id = event_data.get('problem_id')
            if not problem_id:
                return

            entity_id = problem_id
            if event_data.get('success', 'incorrect').lower() == 'correct':
                info['correct'] = True
        elif event_type == 'play_video':
            encoded_module_id = event_data.get('id')
            if not encoded_module_id:
                return

            entity_id = encoded_module_id
        elif event_type[:9] == '/courses/' and re.match(
                self.SUBSECTION_ACCESSED_PATTERN, event_type):
            timestamp = eventlog.get_event_time_string(event)
            if timestamp is None:
                return
            info['path'] = event_type
            info['timestamp'] = timestamp
            event_type = SUBSECTION_VIEWED_MARKER
        elif event_type.startswith('edx.forum'):
            forum_post_voted = re.match(
                r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
            if forum_post_voted:
                info['vote_value'] = event_data.get('vote_value')
                if info['vote_value'] not in ['up', 'down']:
                    return
                info['undo_vote'] = event_data.get('undo_vote', False)

        date_grouping_key = date_string

        if self.interval_type == 'weekly':
            last_complete_date = self.interval.date_b - datetime.timedelta(
                days=1)  # pylint: disable=no-member
            last_weekday = last_complete_date.isoweekday()

            split_date = date_string.split('-')
            event_date = datetime.date(int(split_date[0]), int(split_date[1]),
                                       int(split_date[2]))
            event_weekday = event_date.isoweekday()

            days_until_end = last_weekday - event_weekday
            if days_until_end < 0:
                days_until_end += 7

            end_of_week_date = event_date + datetime.timedelta(
                days=days_until_end)
            date_grouping_key = end_of_week_date.isoformat()

        elif self.interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            last_complete_date = self.interval.date_b - datetime.timedelta(
                days=1)  # pylint: disable=no-member
            date_grouping_key = last_complete_date.isoformat()

        yield ((date_grouping_key, course_id, username),
               (entity_id, event_type, json.dumps(info), date_string))

        if forum_post_voted:
            # We emit two events for each "voted" event - one for the voting user and one for the
            # user receiving the vote.
            username = event_data.get('target_username')
            if not username:
                return
            event_type = 'edx.forum.{}.vote_received'.format(
                forum_post_voted.group('post_type'))
            yield ((date_grouping_key, course_id, username),
                   (entity_id, event_type, json.dumps(info), date_string))
Esempio n. 12
0
    def mapper(self, line):
        # Add a filter here to permit quicker rejection of unrelated events.
        if VIDEO_EVENT_MINIMUM_STRING not in line:
            return

        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, _date_string = value

        event_type = event.get('event_type')
        if event_type is None:
            log.error("encountered event with no event_type: %s", event)
            return

        if event_type not in VIDEO_EVENT_TYPES:
            return

        # This has already been checked when getting the event, so just fetch the value.
        timestamp = eventlog.get_event_time_string(event)

        # Strip username to remove trailing newlines that mess up Luigi.
        username = event.get('username', '').strip()
        if not username:
            log.error("Video event without username: %s", event)
            return

        course_id = eventlog.get_course_id(event)
        if course_id is None:
            log.warn('Video event without valid course_id: {0}'.format(line))
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            # This should already have been logged.
            return

        encoded_module_id = event_data.get('id')
        if encoded_module_id is None:
            log.warn('Video event without valid encoded_module_id (id): {0}'.format(line))
            return

        current_time = None
        old_time = None
        youtube_id = None
        if event_type == VIDEO_PLAYED:
            code = event_data.get('code')
            if code not in ('html5', 'mobile'):
                youtube_id = code
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                return
        elif event_type == VIDEO_PAUSED:
            # Pause events may have a missing currentTime value if video is paused at the beginning,
            # so provide a default of zero.
            current_time = self._check_time_offset(event_data.get('currentTime', 0), line)
            if current_time is None:
                return
        elif event_type == VIDEO_SEEK:
            current_time = self._check_time_offset(event_data.get('new_time'), line)
            old_time = self._check_time_offset(event_data.get('old_time'), line)
            if current_time is None or old_time is None:
                return
        elif event_type == VIDEO_STOPPED:
            current_time = self._check_time_offset(event_data.get('currentTime'), line)
            if current_time is None:
                return

        if youtube_id is not None:
            youtube_id = youtube_id.encode('utf8')

        yield (
            (username.encode('utf8'), course_id.encode('utf8'), encoded_module_id.encode('utf8')),
            (timestamp, event_type, current_time, old_time, youtube_id)
        )
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        username = event.get('username', '').strip()
        if not username:
            return

        event_type = event.get('event_type')
        if event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        event_source = event.get('event_source')

        entity_id = ''
        info = {}
        if event_type == 'problem_check':
            if event_source != 'server':
                return

            problem_id = event_data.get('problem_id')
            if not problem_id:
                return

            entity_id = problem_id
            if event_data.get('success', 'incorrect').lower() == 'correct':
                info['correct'] = True
        elif event_type == 'play_video':
            encoded_module_id = event_data.get('id')
            if not encoded_module_id:
                return

            entity_id = encoded_module_id
        elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
            timestamp = eventlog.get_event_time_string(event)
            if timestamp is None:
                return
            info['path'] = event_type
            info['timestamp'] = timestamp
            event_type = SUBSECTION_VIEWED_MARKER

        date_grouping_key = date_string

        if self.interval_type == 'weekly':
            last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            last_weekday = last_complete_date.isoweekday()

            split_date = date_string.split('-')
            event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2]))
            event_weekday = event_date.isoweekday()

            days_until_end = last_weekday - event_weekday
            if days_until_end < 0:
                days_until_end += 7

            end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
            date_grouping_key = end_of_week_date.isoformat()

        elif self.interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            date_grouping_key = last_complete_date.isoformat()

        yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        username = event.get('username', '').strip()
        if not username:
            return

        event_type = event.get('event_type')
        if event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        event_source = event.get('event_source')

        entity_id = ''
        info = {}
        forum_post_voted = None
        if event_type == 'problem_check':
            if event_source != 'server':
                return

            problem_id = event_data.get('problem_id')
            if not problem_id:
                return

            entity_id = problem_id
            if event_data.get('success', 'incorrect').lower() == 'correct':
                info['correct'] = True
        elif event_type == 'play_video':
            encoded_module_id = event_data.get('id')
            if not encoded_module_id:
                return

            entity_id = encoded_module_id
        elif event_type[:9] == '/courses/' and re.match(self.SUBSECTION_ACCESSED_PATTERN, event_type):
            timestamp = eventlog.get_event_time_string(event)
            if timestamp is None:
                return
            info['path'] = event_type
            info['timestamp'] = timestamp
            event_type = SUBSECTION_VIEWED_MARKER
        elif event_type.startswith('edx.forum'):
            forum_post_voted = re.match(r'edx\.forum\.(?P<post_type>\w+)\.voted', event_type)
            if forum_post_voted:
                info['vote_value'] = event_data.get('vote_value')
                if info['vote_value'] not in ['up', 'down']:
                    return
                info['undo_vote'] = event_data.get('undo_vote', False)

        date_grouping_key = date_string

        if self.interval_type == 'weekly':
            last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            last_weekday = last_complete_date.isoweekday()

            split_date = date_string.split('-')
            event_date = datetime.date(int(split_date[0]), int(split_date[1]), int(split_date[2]))
            event_weekday = event_date.isoweekday()

            days_until_end = last_weekday - event_weekday
            if days_until_end < 0:
                days_until_end += 7

            end_of_week_date = event_date + datetime.timedelta(days=days_until_end)
            date_grouping_key = end_of_week_date.isoformat()

        elif self.interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            last_complete_date = self.interval.date_b - datetime.timedelta(days=1)  # pylint: disable=no-member
            date_grouping_key = last_complete_date.isoformat()

        yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))

        if forum_post_voted:
            # We emit two events for each "voted" event - one for the voting user and one for the
            # user receiving the vote.
            username = event_data.get('target_username')
            if not username:
                return
            event_type = 'edx.forum.{}.vote_received'.format(forum_post_voted.group('post_type'))
            yield ((date_grouping_key, course_id, username), (entity_id, event_type, json.dumps(info), date_string))
Esempio n. 15
0
    def mapper(self, line):
        value = self.get_event_and_date_string(line)
        if value is None:
            return
        event, date_string = value

        username = event.get('username', '').strip()
        if not username:
            return

        event_type = event.get('event_type')
        if event_type is None:
            return

        course_id = eventlog.get_course_id(event)
        if not course_id:
            return

        event_data = eventlog.get_event_data(event)
        if event_data is None:
            return

        event_source = event.get('event_source')

        entity_id = ''
        info = {}
        if event_type == 'problem_check':
            if event_source != 'server':
                return

            problem_id = event_data.get('problem_id')
            if not problem_id:
                return

            entity_id = problem_id
            if event_data.get('success', 'incorrect').lower() == 'correct':
                info['correct'] = True
        elif event_type == 'play_video':
            encoded_module_id = event_data.get('id')
            if not encoded_module_id:
                return

            entity_id = encoded_module_id
        elif event_type[:9] == '/courses/' and re.match(
                self.SUBSECTION_ACCESSED_PATTERN, event_type):
            timestamp = eventlog.get_event_time_string(event)
            if timestamp is None:
                return
            info['path'] = event_type
            info['timestamp'] = timestamp
            event_type = SUBSECTION_VIEWED_MARKER

        date_grouping_key = date_string

        if self.interval_type == 'weekly':
            last_complete_date = self.interval.date_b - datetime.timedelta(
                days=1)  # pylint: disable=no-member
            last_weekday = last_complete_date.isoweekday()

            split_date = date_string.split('-')
            event_date = datetime.date(int(split_date[0]), int(split_date[1]),
                                       int(split_date[2]))
            event_weekday = event_date.isoweekday()

            days_until_end = last_weekday - event_weekday
            if days_until_end < 0:
                days_until_end += 7

            end_of_week_date = event_date + datetime.timedelta(
                days=days_until_end)
            date_grouping_key = end_of_week_date.isoformat()

        elif self.interval_type == 'all':
            # If gathering all data for a given user, use the last complete day of the interval
            # for joining with enrollment.
            last_complete_date = self.interval.date_b - datetime.timedelta(
                days=1)  # pylint: disable=no-member
            date_grouping_key = last_complete_date.isoformat()

        yield ((date_grouping_key, course_id, username),
               (entity_id, event_type, json.dumps(info), date_string))