def get_course_id(event, from_url=False):
    """
    Return the course_id associated with a tracking event, or None.

    The id stored in the event's `context` is preferred.  When the context
    carries no (non-empty) course_id and `from_url` is true, the course key
    is parsed out of a URL taken from the event instead: `event_type` for
    server events, `page` for browser events.

    Args:
        event: parsed tracking-log event (dict-like; only `.get` is used).
        from_url: when true, fall back to URL parsing if the context has no
            course_id.

    Returns:
        The course_id, or None when the event has no context, the context's
        course_id fails validation, or no id could be determined.
    """
    context = event.get('context')
    if context is None:
        # Events without a context are assumed to be old; skip them quietly.
        return None

    # An id present in the context is authoritative: it is either returned
    # or rejected, and the URL fallback is never consulted for it.
    normalized_id = opaque_key_util.normalize_course_id(context.get('course_id', ''))
    if normalized_id:
        if not opaque_key_util.is_valid_course_id(normalized_id):
            log.error("encountered event with bogus course_id: %s", event)
            return None
        return normalized_id

    if not from_url:
        return None

    # Implicit server events keep the URL in `event_type`, browser events
    # keep it in `page`; any other source has no usable URL.
    origin = event.get('event_source')
    if origin == 'server':
        url = event.get('event_type', '')
    elif origin == 'browser':
        url = event.get('page', '')
    else:
        url = ''

    course_key = opaque_key_util.get_course_key_from_url(url)
    return unicode(course_key) if course_key else None
def mapper(self, line):
    """
    Emit a keyed record for an explicit enrollment event found on `line`.

    Lines that cannot be parsed, events of other types, and events missing
    a timestamp, payload, valid course_id, user_id or mode produce no
    output (most of these cases are logged as errors).

    Args:
        line: one raw line of tracking-log text.

    Yields:
        ((course_id, user_id), (timestamp, event_type, mode, validation_info))
        where course_id is UTF-8 encoded and validation_info is None unless
        the event payload contains `dump_start`.
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, _date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    handled_types = (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED)
    if event_type not in handled_types:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    course_id = opaque_key_util.normalize_course_id(payload.get('course_id'))
    course_id_ok = course_id is not None and opaque_key_util.is_valid_course_id(course_id)
    if not course_id_ok:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = payload.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    mode = payload.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    # Synthetic enrollment-validation events carry extra properties; their
    # presence is detected through the `dump_start` key.
    if 'dump_start' in payload:
        validation_info = dict(
            (field, payload.get(field))
            for field in ('is_active', 'created', 'dump_start', 'dump_end')
        )
    else:
        validation_info = None

    # String parts of the key must be encoded; user_id is an int and is
    # passed through untouched.
    encoded_course_id = unicode(course_id).encode('utf-8')
    yield (encoded_course_id, user_id), (timestamp, event_type, mode, validation_info)
def mapper(self, line):
    """
    Turn one tracking-log line into a (course, user)-keyed enrollment record.

    Nothing is yielded when the line does not parse, the event type is not
    one of DEACTIVATED / ACTIVATED / MODE_CHANGED / VALIDATED, or the event
    lacks a timestamp, payload, valid course_id, user_id or mode.

    Args:
        line: one raw line of tracking-log text.

    Yields:
        A single pair: key `(course_id, user_id)` with course_id encoded as
        UTF-8, and value `(timestamp, event_type, mode, validation_info)`.
    """
    result = self.get_event_and_date_string(line)
    if result is None:
        return
    event, _date_string = result

    kind = event.get('event_type')
    if kind is None:
        log.error("encountered event with no event_type: %s", event)
        return
    if kind not in (DEACTIVATED, ACTIVATED, MODE_CHANGED, VALIDATED):
        return

    event_time = eventlog.get_event_time_string(event)
    if event_time is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    data = eventlog.get_event_data(event)
    if data is None:
        return

    course_id = opaque_key_util.normalize_course_id(data.get('course_id'))
    if course_id is None:
        course_id_is_usable = False
    else:
        course_id_is_usable = opaque_key_util.is_valid_course_id(course_id)
    if not course_id_is_usable:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    enrollment_mode = data.get('mode')
    if enrollment_mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    # Extra properties are supplied only by synthetic enrollment validation
    # events, recognised here by the `dump_start` key in the payload.
    extras = None
    if 'dump_start' in data:
        extras = {
            name: data.get(name)
            for name in ('is_active', 'created', 'dump_start', 'dump_end')
        }

    # Only the string component of the key needs encoding; user_id is an int.
    record_key = (unicode(course_id).encode('utf-8'), user_id)
    record_value = (event_time, kind, enrollment_mode, extras)
    yield record_key, record_value
def mapper(self, line):
    """
    Emit a date-keyed record for an explicit enrollment event on `line`.

    Only DEACTIVATED, ACTIVATED and MODE_CHANGED events are considered.
    Events missing a timestamp, payload, valid course_id, user_id or mode
    are dropped (and, except for a missing payload, logged as errors).

    Args:
        line: one raw line of tracking-log text.

    Yields:
        (date_string, (course_id, user_id, timestamp, event_type, mode))
    """
    parsed = self.get_event_and_date_string(line)
    if parsed is None:
        return
    event, date_string = parsed

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return

    enrollment_types = (DEACTIVATED, ACTIVATED, MODE_CHANGED)
    if event_type not in enrollment_types:
        return

    timestamp = eventlog.get_event_time_string(event)
    if timestamp is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    payload = eventlog.get_event_data(event)
    if payload is None:
        return

    course_id = opaque_key_util.normalize_course_id(payload.get('course_id'))
    course_id_ok = course_id is not None and opaque_key_util.is_valid_course_id(course_id)
    if not course_id_ok:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = payload.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    mode = payload.get('mode')
    if mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    record = (course_id, user_id, timestamp, event_type, mode)
    yield date_string, record
def run(self):
    """
    Convert the course-list JSON from the task input into tab-separated rows.

    The input is read as JSON and its `results` entry is iterated.  For each
    course one line is written to the task output with the columns:
    course id, org, number, run, start, end, name.  Missing values are
    written as a backslash followed by "N".  If `results` is absent or
    empty, nothing is written.  Any entry that raises AttributeError while
    being processed (for example because it is not a dictionary) is skipped.
    """
    # Placeholder for missing values: backslash + "N" (presumably the NULL
    # marker expected by the downstream Hive table -- confirm).  Spelled
    # with an escaped backslash; the resulting two-character value is the
    # same as the plain '\N' byte-string literal under Python 2.
    missing = '\\N'

    self.remove_output_on_overwrite()
    with self.input().open('r') as input_file:
        course_structure = json.load(input_file)
        with self.output().open('w') as output_file:
            courses = course_structure.get('results')
            if not courses:
                # No courses, or no 'results' key in the JSON: output nothing.
                return

            for course in courses:
                # To stay robust, non-dictionary entries in the API response
                # are ignored: they fail with AttributeError on `.get`.
                try:
                    raw_start = course.get('start')
                    raw_end = course.get('end')
                    start_value = missing if raw_start is None else ciso8601.parse_datetime(raw_start)
                    end_value = missing if raw_end is None else ciso8601.parse_datetime(raw_end)

                    course_id = normalize_course_id(course.get('id', missing))
                    if is_valid_course_id(course_id):
                        course_run = CourseKey.from_string(course_id).run
                    else:
                        course_run = missing

                    columns = [
                        course_id,
                        course.get('org', missing),
                        course.get('number', missing),
                        course_run,
                        coerce_timestamp_for_hive(start_value),
                        coerce_timestamp_for_hive(end_value),
                        course.get('name', missing),
                    ]
                    encoded_columns = [column.encode('utf-8') for column in columns]
                    output_file.write('\t'.join(encoded_columns))
                    output_file.write('\n')
                except AttributeError:
                    # Not a dictionary (or otherwise unusable): next course.
                    continue
def mapper(self, line):
    """
    Map one tracking-log line to a date-keyed explicit enrollment record.

    Produces no output when the line does not parse, when the event type is
    not DEACTIVATED / ACTIVATED / MODE_CHANGED, or when the event lacks a
    timestamp, payload, valid course_id, user_id or mode.

    Args:
        line: one raw line of tracking-log text.

    Yields:
        A single pair `(date_string, (course_id, user_id, timestamp,
        event_type, mode))`.
    """
    result = self.get_event_and_date_string(line)
    if result is None:
        return
    event, date_string = result

    kind = event.get('event_type')
    if kind is None:
        log.error("encountered event with no event_type: %s", event)
        return
    if kind not in (DEACTIVATED, ACTIVATED, MODE_CHANGED):
        return

    event_time = eventlog.get_event_time_string(event)
    if event_time is None:
        log.error("encountered event with bad timestamp: %s", event)
        return

    data = eventlog.get_event_data(event)
    if data is None:
        return

    course_id = opaque_key_util.normalize_course_id(data.get('course_id'))
    if course_id is None:
        course_id_is_usable = False
    else:
        course_id_is_usable = opaque_key_util.is_valid_course_id(course_id)
    if not course_id_is_usable:
        log.error("encountered explicit enrollment event with invalid course_id: %s", event)
        return

    user_id = data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return

    enrollment_mode = data.get('mode')
    if enrollment_mode is None:
        log.error("encountered explicit enrollment event with no mode: %s", event)
        return

    yield date_string, (course_id, user_id, event_time, kind, enrollment_mode)
def get_course_id(event, from_url=False):
    """
    Look up the course_id for a tracking event.

    The course_id recorded in the event's `context` wins whenever it is
    non-empty: it is returned if valid, and otherwise an error is logged
    and None is returned.  If the context has no course_id and `from_url`
    is set, the id is derived from the event's URL (`event_type` for server
    events, `page` for browser events).

    Args:
        event: parsed tracking-log event (dict-like; only `.get` is used).
        from_url: enable the URL-based fallback.

    Returns:
        The course_id, or None if none could be established.
    """
    context = event.get('context')
    if context is None:
        # No context: assumed to be an old event, not worth logging.
        return None

    raw_id = context.get('course_id', '')
    course_id = opaque_key_util.normalize_course_id(raw_id)
    if course_id:
        is_valid = opaque_key_util.is_valid_course_id(course_id)
        if not is_valid:
            log.error("encountered event with bogus course_id: %s", event)
        return course_id if is_valid else None

    # Fallback: parse the course key out of the URL stored on the event.
    url_key = None
    if from_url:
        event_source = event.get('event_source')
        url = ''
        if event_source == 'server':
            url = event.get('event_type', '')
        elif event_source == 'browser':
            url = event.get('page', '')
        url_key = opaque_key_util.get_course_key_from_url(url)

    if url_key:
        return unicode(url_key)
    return None
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

        line: text line from a tracking event log.

    Returns:

        (course_id, user_id), (timestamp, action_value)

          where action_value is ENROLLED or UNENROLLED, and timestamp is
          the value produced by `eventlog.datetime_to_timestamp` for the
          event's time.

        or None if there is no valid enrollment event on the line.  This
        includes events whose data carries no `course_id`: they are logged
        and skipped rather than raising KeyError.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Before parsing, check that the line contains something that
    # suggests it's an enrollment event.
    if 'edx.course.enrollment' not in line:
        return None

    # Try to parse the line into a dict.  Lines that do not parse are
    # skipped: enrollment-related lines are assumed to always parse.
    event = eventlog.parse_json_event(line)
    if event is None:
        return None

    # Get the event type, and check that it exists.
    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # Convert the type to a value.
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        # Not an enrollment event...
        return None

    # Get the timestamp.  (The local is deliberately not called `datetime`
    # so it cannot be confused with the standard-library module.)
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # Use the `user_id` from the event `data` field, since the
    # `user_id` in the `context` field is the user who made the
    # request but not necessarily the one who got enrolled.  (The
    # `course_id` should be the same in `context` as in `data`.)
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Get the course_id from the data, and validate.  A missing course_id
    # is treated like an invalid one instead of raising KeyError.
    raw_course_id = event_data.get('course_id')
    if raw_course_id is None:
        course_id = None
    else:
        course_id = opaque_key_util.normalize_course_id(raw_course_id)
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit enrollment event with bogus course_id: %s", event)
        return None

    # Get the user_id from the data.
    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').
    return (course_id, user_id), (timestamp, action_value)
def get_explicit_enrollment_output(line):
    """
    Generates output values for explicit enrollment events.

    Args:

        line: text line from a tracking event log.

    Returns:

        (course_id, user_id), (timestamp, action_value)

          where action_value is ENROLLED or UNENROLLED, and timestamp is
          the value produced by `eventlog.datetime_to_timestamp` for the
          event's time.

        or None if there is no valid enrollment event on the line.  An
        event whose data has no `course_id` is logged and yields None; it
        does not raise KeyError.

    Example:
            (edX/DemoX/Demo_Course, dummy_userid), (2013-09-10T00:01:05.123456, 1)

    """
    # Cheap pre-filter: skip lines that cannot be enrollment events before
    # paying for JSON parsing.
    if 'edx.course.enrollment' not in line:
        return None

    # Lines that fail to parse are skipped; all enrollment-related lines
    # are assumed to parse.
    event = eventlog.parse_json_event(line)
    if event is None:
        return None

    event_type = event.get('event_type')
    if event_type is None:
        log.error("encountered event with no event_type: %s", event)
        return None

    # Map the event type onto the enrollment action; anything else is not
    # an enrollment event.
    if event_type == 'edx.course.enrollment.activated':
        action_value = ENROLLED
    elif event_type == 'edx.course.enrollment.deactivated':
        action_value = UNENROLLED
    else:
        return None

    # Named `event_time` rather than `datetime` to avoid shadowing the
    # standard-library module name.
    event_time = eventlog.get_event_time(event)
    if event_time is None:
        log.error("encountered event with bad datetime: %s", event)
        return None
    timestamp = eventlog.datetime_to_timestamp(event_time)

    # The `user_id` must come from the event `data` field: the one in
    # `context` is the user who made the request, not necessarily the one
    # who got enrolled.  (The `course_id` should be the same in both.)
    event_data = eventlog.get_event_data(event)
    if event_data is None:
        # Assume it's already logged (and with more specifics).
        return None

    # Validate the course_id, treating an absent one as invalid so that a
    # malformed event is reported and skipped instead of raising KeyError.
    raw_course_id = event_data.get('course_id')
    course_id = None
    if raw_course_id is not None:
        course_id = opaque_key_util.normalize_course_id(raw_course_id)
    if course_id is None or not opaque_key_util.is_valid_course_id(course_id):
        log.error("encountered explicit enrollment event with bogus course_id: %s", event)
        return None

    user_id = event_data.get('user_id')
    if user_id is None:
        log.error("encountered explicit enrollment event with no user_id: %s", event)
        return None

    # For now, ignore the enrollment 'mode' (e.g. 'honor').
    return (course_id, user_id), (timestamp, action_value)