class ValidateEnrollmentForEvents(object): """ Collects and validates events for a given user in a given course. """ def __init__(self, course_id, user_id, interval, events, **kwargs): self.course_id = course_id self.user_id = user_id self.interval = interval self.creation_timestamp = None self.tuple_output = kwargs.get('tuple_output') self.include_nonstate_changes = kwargs.get('include_nonstate_changes') self.generate_before = kwargs.get('generate_before') self.lower_bound_date_string = kwargs.get('lower_bound_date_string') self.earliest_timestamp = kwargs.get('earliest_timestamp') self.expected_validation = kwargs.get('expected_validation') if self.tuple_output: self.generate_output = self._create_tuple else: self.factory = SyntheticEventFactory( event_source='server', synthesizer='enrollment_validation', ) self.generate_output = self._synthetic_event # Create list of events in reverse order, as processing goes backwards # from validation states. self.sorted_events = [ EnrollmentEvent(timestamp, event_type, mode, validation_info) for timestamp, event_type, mode, validation_info in sorted(events, reverse=True) ] self._reorder_within_dumps() if self.expected_validation: self._check_for_missing_validation() # Add a marker event to signal the beginning of the interval. initial_state = EnrollmentEvent(None, SENTINEL, mode='honor', validation_info=None) self.sorted_events.append(initial_state) def _reorder_within_dumps(self): """ Fix the timestamp of a validation event if an enrollment event occurs during the dump. """ num_events = len(self.sorted_events) - 1 for index in range(num_events): event = self.sorted_events[index] prev_event = self.sorted_events[index + 1] is_nonvalidate_during_validate = ( event.event_type == VALIDATED and prev_event.event_type != VALIDATED and event.is_during_dump(prev_event.timestamp) ) if is_nonvalidate_during_validate: is_active_is_inconsistent = ( (event.is_active and prev_event.event_type == DEACTIVATED) or (not event.is_active and prev_event.event_type == ACTIVATED) ) mode_is_inconsistent = (event.mode != prev_event.mode and prev_event.event_type == MODE_CHANGED) if is_active_is_inconsistent or mode_is_inconsistent: # Change the timestamp of the validation event to precede # the other event, and swap them. event.timestamp = add_microseconds(prev_event.timestamp, -1) self.sorted_events[index] = prev_event self.sorted_events[index + 1] = event def _check_for_missing_validation(self): """Check last event to see if it is missing an expected validation event.""" if len(self.sorted_events) > 0 and self.sorted_events[0].timestamp < self.expected_validation: validation_info = { 'is_active': False, 'created': None, 'dump_start': None, 'dump_end': None, } missing_event = [EnrollmentEvent( self.expected_validation, MISSING, mode=None, validation_info=validation_info )] missing_event.extend(self.sorted_events) self.sorted_events = missing_event def missing_enrolled(self): """ A synthetic event is yielded for each transition in user's events for which a real event is missing. Yields: json-encoded string representing a synthetic event, or a tuple. """ # The last element of the list is a placeholder indicating the beginning of the interval. # Don't process it. num_events = len(self.sorted_events) - 1 self._initialize_state(self.sorted_events[0]) all_missing_events = [] for index in range(num_events): event = self.sorted_events[index + 1] missing_events = self._check_event(event) if missing_events: all_missing_events.extend(missing_events) return all_missing_events def _create_tuple(self, timestamp, event_type, mode, reason, after=None, before=None): """Returns a tuple representation of the output, for TSV-based debugging.""" datestamp = eventlog.timestamp_to_datestamp(timestamp) return datestamp, (self.course_id, self.user_id, timestamp, event_type, mode, reason, after, before) def _synthetic_event(self, timestamp, event_type, mode, reason, after=None, before=None): """Create a synthetic event.""" # data specific to course enrollment events: event_data = { 'course_id': self.course_id, 'user_id': self.user_id, 'mode': mode, } event_properties = { # main properties: 'time': timestamp, 'event_type': event_type, # stuff for context: 'user_id': self.user_id, 'course_id': self.course_id, 'org_id': opaque_key_util.get_org_id_for_course(self.course_id), # stuff for synthesized: 'reason': reason, } event = self.factory.create_event_dict(event_data, **event_properties) synthesized = event['synthesized'] if after: synthesized['after_time'] = after if before: synthesized['before_time'] = before datestamp = eventlog.timestamp_to_datestamp(timestamp) return datestamp, json.dumps(event) def _truncate_timestamp(self, timestamp): """Makes sure that timestamp is no earlier than limit specified, if any.""" if self.earliest_timestamp and timestamp < self.earliest_timestamp: return self.earliest_timestamp else: return timestamp def _get_fake_timestamp(self, after, before): """ Pick a time in an interval. Picks a microsecond after `after`, else a microsecond before `before`. Input and output values are ISO format strings. """ # Just pick the time at the beginning of the interval. if after: # Add a microsecond to 'after' return add_microseconds(after, 1) else: # Subtract a microsecond from 'before' return add_microseconds(before, -1) def _get_reason_string(self, prev_event, curr_event_string, curr_mode=None): """Provide a readable string giving the reason for generating the synthetic event.""" if curr_mode: return "{prev} => {curr} ({prev_mode}=>{curr_mode})".format( prev=prev_event.get_state_string(), curr=curr_event_string, prev_mode=prev_event.mode, curr_mode=curr_mode, ) else: return "{prev} => {curr}".format( prev=prev_event.get_state_string(), curr=curr_event_string, ) def _initialize_state(self, event): """Define initial values for validation state machine.""" self._update_state(event) # If the most-recent event is a mode-change event, then we don't # know from it what the activation is. if event.event_type == MODE_CHANGED: self.activation_type = None def _update_state(self, event): """Define current values for validation state machine.""" # Some events define an activation state. if event.event_type != MODE_CHANGED: self.activation_type = event.event_type self.activation_state = event.get_state_string() self.activation_timestamp = event.timestamp # Most events set mode. self.mode_changed = (event.event_type == MODE_CHANGED) if event.event_type != MISSING: self.current_mode = event.mode self.mode_type = event.event_type self.mode_state = event.get_state_string() self.mode_timestamp = event.timestamp else: self.current_mode = None # Only validation events define a created timestamp and activation state. if event.event_type == VALIDATED: self.currently_active = event.is_active if self.creation_timestamp: # compare with previously-viewed (i.e. later-in-time) validation: if event.created != self.creation_timestamp: log.error("Encountered validation with different creation timestamp: %s => %s", event.created, self.creation_timestamp) # Use the earliest validation: self.creation_timestamp = event.created def _check_for_mode_change(self, prev_event, last_timestamp): """Check if a mode-change event should be synthesized.""" # There are several reasons to suppress a mode-change. # The most obvious is if there is no change in mode. # Other cases include: # * No "current" mode (e.g. current event is "MISSING"). # * Current event is known to reset the mode, so no comparison # is needed (or possible). This includes MODE_CHANGED and ACTIVATE # events. # * No previous event (i.e. previous event is "SENTINEL"). suppress_mode_change = ( prev_event.mode == self.current_mode or not self.current_mode or self.mode_changed or self.mode_type == ACTIVATED or prev_event.event_type == SENTINEL ) if suppress_mode_change: return [] else: curr = self.mode_timestamp timestamp = self._get_fake_timestamp(last_timestamp, curr) reason = self._get_reason_string(prev_event, self.mode_state, self.current_mode) return [self.generate_output(timestamp, MODE_CHANGED, self.current_mode, reason, last_timestamp, curr)] def _check_on_activated(self, generate_output_for_event): """Check if a deactivation event should be synthesized after an activation event.""" if self.activation_type == ACTIVATED and self.include_nonstate_changes: # Duplicate activate event (a/a). return [generate_output_for_event(DEACTIVATED)] elif self.activation_type == DEACTIVATED: pass # normal case elif self.activation_type == VALIDATED and self.currently_active: pass # normal case elif self.activation_type == VALIDATED and not self.currently_active: # Missing deactivate event (a/vi) return [generate_output_for_event(DEACTIVATED)] elif self.activation_type == MISSING: # Missing deactivate event (a/m) return [generate_output_for_event(DEACTIVATED)] return [] def _check_on_deactivated(self, generate_output_for_event): """Check if an activation event should be synthesized after a deactivation event.""" if self.activation_type == ACTIVATED: pass # normal case elif self.activation_type == DEACTIVATED and self.include_nonstate_changes: # Duplicate deactivate event (d/d). return [generate_output_for_event(ACTIVATED)] elif self.activation_type == VALIDATED and not self.currently_active: pass # normal case elif self.activation_type == VALIDATED and self.currently_active: # Missing activate event (d/va) return [generate_output_for_event(ACTIVATED)] elif self.activation_type == MISSING and self.include_nonstate_changes: # Missing validation event (d/m) -- generate a second deactivate event # just to mark the inconsistency. return [generate_output_for_event(DEACTIVATED)] return [] def _check_on_validation(self, prev_event, generate_output_for_event): """Check if an event should be synthesized after a validation event.""" if self.activation_type == ACTIVATED: if prev_event.is_active and self.include_nonstate_changes: # Missing deactivate (va/a) return [generate_output_for_event(DEACTIVATED)] elif self.activation_type == DEACTIVATED: if not prev_event.is_active and self.include_nonstate_changes: # Missing activate (vi/d) return [generate_output_for_event(ACTIVATED)] elif self.activation_type == VALIDATED: if prev_event.is_active and not self.currently_active: # Missing deactivate (va/vi) return [generate_output_for_event(DEACTIVATED)] elif not prev_event.is_active and self.currently_active: # Missing activate (vi/va) return [generate_output_for_event(ACTIVATED)] return [] def _check_earliest_event(self, reason): """Check if an event should be synthesized before the first real event.""" curr = self.activation_timestamp if self.activation_type == ACTIVATED: pass # normal case elif self.activation_type == DEACTIVATED: # If we had a validation after the deactivation, # and it provided a creation_timestamp within the interval, # then there should be an activate within the interval. if self.creation_timestamp and ( self.generate_before or self.creation_timestamp >= self.lower_bound_date_string): timestamp = self._truncate_timestamp(self.creation_timestamp) return [(self.generate_output( timestamp, ACTIVATED, self.current_mode, reason, self.creation_timestamp, curr ))] elif self.generate_before: # For now, hack the timestamp by making it a little before the deactivate, # so that it at least has a value. timestamp = self._get_fake_timestamp(None, curr) return [(self.generate_output(timestamp, ACTIVATED, self.current_mode, reason, None, curr))] elif self.activation_type == VALIDATED: creation_timestamp = self._truncate_timestamp(self.creation_timestamp) if not self.generate_before and self.creation_timestamp < self.lower_bound_date_string: # If we are validating only within an interval and the create_timestamp # is outside this interval, we can't know whether the events are really # missing or just not included. pass elif self.currently_active: return [(self.generate_output( creation_timestamp, ACTIVATED, self.current_mode, reason, self.creation_timestamp, curr ))] elif self.include_nonstate_changes: # There may be missing Activate and Deactivate events, or there may # just be an inactive table row that was created as part of an enrollment # flow, but no enrollment was completed. event1 = self.generate_output( creation_timestamp, ACTIVATED, self.current_mode, reason, self.creation_timestamp, curr ) timestamp = self._get_fake_timestamp(creation_timestamp, curr) event2 = self.generate_output( timestamp, DEACTIVATED, self.current_mode, reason, self.creation_timestamp, curr ) return [event1, event2] return [] def _check_event(self, prev_event): """Compare a previous event with current state generated from later events. """ prev_timestamp_for_mode_change = prev_event.timestamp missing = [] if self.activation_type is not None: reason = self._get_reason_string(prev_event, self.activation_state) timestamp = self._get_fake_timestamp(prev_event.timestamp, self.activation_timestamp) def generate_output_for_event(event_type): """Wrapper to generate a synthetic event with common values.""" return self.generate_output( timestamp, event_type, prev_event.mode, reason, prev_event.timestamp, self.activation_timestamp ) prev_type = prev_event.event_type if prev_type == ACTIVATED: missing.extend(self._check_on_activated(generate_output_for_event)) elif prev_type == DEACTIVATED: missing.extend(self._check_on_deactivated(generate_output_for_event)) elif prev_type == VALIDATED: missing.extend(self._check_on_validation(prev_event, generate_output_for_event)) elif prev_type == SENTINEL: missing.extend(self._check_earliest_event(reason)) if missing: prev_timestamp_for_mode_change = timestamp # Check for mode change for all events: missing.extend(self._check_for_mode_change(prev_event, prev_timestamp_for_mode_change)) # Finally, set state for the next one. self._update_state(prev_event) return missing
class CourseEnrollmentValidationTaskMapTest(InitializeOpaqueKeysMixin, MapperTestMixin, unittest.TestCase): """ Tests to verify that event log parsing by mapper works correctly. """ def setUp(self): self.task_class = CourseEnrollmentValidationTask super(CourseEnrollmentValidationTaskMapTest, self).setUp() self.initialize_ids() self.user_id = 21 self.timestamp = "2013-12-17T15:38:32.805444" self.mode = 'honor' self.factory = SyntheticEventFactory( timestamp=self.timestamp, event_source='server', event_type=ACTIVATED, synthesizer='enrollment_from_db', reason='db entry', user_id=self.user_id, course_id=self.course_id, org_id=self.org_id, ) self.expected_key = (self.course_id, self.user_id) def _create_event_log_line(self, **kwargs): """Create an event log with test values, as a JSON string.""" return json.dumps(self._create_event_dict(**kwargs)) def _create_event_dict(self, **kwargs): """Create an event log with test values, as a dict.""" event_data = { 'course_id': self.course_id, 'user_id': self.user_id, 'mode': self.mode, } event = self.factory.create_event_dict(event_data, **kwargs) return event def test_non_enrollment_event(self): line = 'this is garbage' self.assert_no_map_output_for(line) def test_unparseable_enrollment_event(self): line = 'this is garbage but contains edx.course.enrollment' self.assert_no_map_output_for(line) def test_missing_event_type(self): event_dict = self._create_event_dict() event_dict['old_event_type'] = event_dict['event_type'] del event_dict['event_type'] line = json.dumps(event_dict) self.assert_no_map_output_for(line) def test_nonenroll_event_type(self): line = self._create_event_log_line( event_type='edx.course.enrollment.unknown') self.assert_no_map_output_for(line) def test_bad_datetime(self): line = self._create_event_log_line(time='this is a bogus time') self.assert_no_map_output_for(line) def test_bad_event_data(self): line = self._create_event_log_line(event=["not an event"]) self.assert_no_map_output_for(line) def test_illegal_course_id(self): line = self._create_event_log_line(event={ "course_id": ";;;;bad/id/val", "user_id": self.user_id }) self.assert_no_map_output_for(line) def test_missing_user_id(self): line = self._create_event_log_line(event={"course_id": self.course_id}) self.assert_no_map_output_for(line) def test_good_enroll_event(self): line = self._create_event_log_line() expected_value = (self.timestamp, ACTIVATED, self.mode, None) self.assert_single_map_output(line, self.expected_key, expected_value) def test_good_unenroll_event(self): line = self._create_event_log_line(event_type=DEACTIVATED) expected_value = (self.timestamp, DEACTIVATED, self.mode, None) self.assert_single_map_output(line, self.expected_key, expected_value) def test_good_mode_change_event(self): line = self._create_event_log_line(event_type=MODE_CHANGED) expected_value = (self.timestamp, MODE_CHANGED, self.mode, None) self.assert_single_map_output(line, self.expected_key, expected_value) def test_good_validation_event(self): validation_info = { 'is_active': True, 'created': '2012-07-24T12:37:32.000000', 'dump_start': '2014-10-08T04:52:48.154228', 'dump_end': '2014-10-08T04:57:38.145282', } event_info = { "course_id": self.course_id, "user_id": self.user_id, "mode": self.mode, } event_info.update(validation_info) line = self._create_event_log_line(event_type=VALIDATED, event=event_info) expected_value = (self.timestamp, VALIDATED, self.mode, validation_info) self.assert_single_map_output(line, self.expected_key, expected_value)
class CourseEnrollmentValidationTaskMapTest(InitializeOpaqueKeysMixin, MapperTestMixin, unittest.TestCase): """ Tests to verify that event log parsing by mapper works correctly. """ def setUp(self): self.task_class = CourseEnrollmentValidationTask super(CourseEnrollmentValidationTaskMapTest, self).setUp() self.initialize_ids() self.user_id = 21 self.timestamp = "2013-12-17T15:38:32.805444" self.mode = 'honor' self.factory = SyntheticEventFactory( timestamp=self.timestamp, event_source='server', event_type=ACTIVATED, synthesizer='enrollment_from_db', reason='db entry', user_id=self.user_id, course_id=self.course_id, org_id=self.org_id, ) self.expected_key = (self.course_id, self.user_id) def _create_event_log_line(self, **kwargs): """Create an event log with test values, as a JSON string.""" return json.dumps(self._create_event_dict(**kwargs)) def _create_event_dict(self, **kwargs): """Create an event log with test values, as a dict.""" event_data = { 'course_id': self.course_id, 'user_id': self.user_id, 'mode': self.mode, } event = self.factory.create_event_dict(event_data, **kwargs) return event def test_non_enrollment_event(self): line = 'this is garbage' self.assert_no_map_output_for(line) def test_unparseable_enrollment_event(self): line = 'this is garbage but contains edx.course.enrollment' self.assert_no_map_output_for(line) def test_missing_event_type(self): event_dict = self._create_event_dict() event_dict['old_event_type'] = event_dict['event_type'] del event_dict['event_type'] line = json.dumps(event_dict) self.assert_no_map_output_for(line) def test_nonenroll_event_type(self): line = self._create_event_log_line(event_type='edx.course.enrollment.unknown') self.assert_no_map_output_for(line) def test_bad_datetime(self): line = self._create_event_log_line(time='this is a bogus time') self.assert_no_map_output_for(line) def test_bad_event_data(self): line = self._create_event_log_line(event=["not an event"]) self.assert_no_map_output_for(line) def test_illegal_course_id(self): line = self._create_event_log_line(event={"course_id": ";;;;bad/id/val", "user_id": self.user_id}) self.assert_no_map_output_for(line) def test_missing_user_id(self): line = self._create_event_log_line(event={"course_id": self.course_id}) self.assert_no_map_output_for(line) def test_good_enroll_event(self): line = self._create_event_log_line() expected_value = (self.timestamp, ACTIVATED, self.mode, None) self.assert_single_map_output(line, self.expected_key, expected_value) def test_good_unenroll_event(self): line = self._create_event_log_line(event_type=DEACTIVATED) expected_value = (self.timestamp, DEACTIVATED, self.mode, None) self.assert_single_map_output(line, self.expected_key, expected_value) def test_good_mode_change_event(self): line = self._create_event_log_line(event_type=MODE_CHANGED) expected_value = (self.timestamp, MODE_CHANGED, self.mode, None) self.assert_single_map_output(line, self.expected_key, expected_value) def test_good_validation_event(self): validation_info = { 'is_active': True, 'created': '2012-07-24T12:37:32.000000', 'dump_start': '2014-10-08T04:52:48.154228', 'dump_end': '2014-10-08T04:57:38.145282', } event_info = { "course_id": self.course_id, "user_id": self.user_id, "mode": self.mode, } event_info.update(validation_info) line = self._create_event_log_line(event_type=VALIDATED, event=event_info) expected_value = (self.timestamp, VALIDATED, self.mode, validation_info) self.assert_single_map_output(line, self.expected_key, expected_value)