def init_local(self):
        super(CreateEnrollmentValidationEventsTask, self).init_local()

        # need to determine the date of the input, by reading the appropriate
        # metadata file.  File looks like this:
        # {"start_time": "2014-10-08T04:52:48.154228", "end_time": "2014-10-08T04:55:18.269070"}

        metadata_target = self._get_metadata_target()
        with metadata_target.open('r') as metadata_file:
            metadata = json.load(metadata_file)
            self.dump_start_time = metadata["start_time"]
            self.dump_end_time = metadata["end_time"]
            log.debug("Found self.dump_start_time = %s  end_time = %s", self.dump_start_time, self.dump_end_time)
            self.dump_date = ''.join((self.dump_start_time.split('T')[0]).split('-'))

        # Set the timestamp of all events to be the dump's end time.
        # The events that are actually dumped are not within a transaction,
        # so the actual event time may be earlier, anywhere up to the dump's start time.
        self.factory = SyntheticEventFactory(
            timestamp=self.dump_end_time,
            event_source='server',
            event_type=VALIDATED,
            synthesizer='enrollment_from_db',
            reason='db entry'
        )
    def __init__(self, course_id, user_id, interval, events, **kwargs):
        self.course_id = course_id
        self.user_id = user_id
        self.interval = interval
        self.creation_timestamp = None
        self.tuple_output = kwargs.get("tuple_output")
        self.include_nonstate_changes = kwargs.get("include_nonstate_changes")
        self.generate_before = kwargs.get("generate_before")
        self.lower_bound_date_string = kwargs.get("lower_bound_date_string")
        self.earliest_timestamp = kwargs.get("earliest_timestamp")
        self.expected_validation = kwargs.get("expected_validation")

        if self.tuple_output:
            self.generate_output = self._create_tuple
        else:
            self.factory = SyntheticEventFactory(event_source="server", synthesizer="enrollment_validation")
            self.generate_output = self._synthetic_event

        # Create list of events in reverse order, as processing goes backwards
        # from validation states.
        self.sorted_events = [
            EnrollmentEvent(timestamp, event_type, mode, validation_info)
            for timestamp, event_type, mode, validation_info in sorted(events, reverse=True)
        ]

        self._reorder_within_dumps()

        if self.expected_validation:
            self._check_for_missing_validation()

        # Add a marker event to signal the beginning of the interval.
        initial_state = EnrollmentEvent(None, SENTINEL, mode="honor", validation_info=None)
        self.sorted_events.append(initial_state)
Example #3
0
    def __init__(self, course_id, user_id, interval, events, **kwargs):
        self.course_id = course_id
        self.user_id = user_id
        self.interval = interval
        self.creation_timestamp = None
        self.tuple_output = kwargs.get('tuple_output')
        self.include_nonstate_changes = kwargs.get('include_nonstate_changes')
        self.generate_before = kwargs.get('generate_before')
        self.lower_bound_date_string = kwargs.get('lower_bound_date_string')
        self.earliest_timestamp = kwargs.get('earliest_timestamp')
        self.expected_validation = kwargs.get('expected_validation')

        if self.tuple_output:
            self.generate_output = self._create_tuple
        else:
            self.factory = SyntheticEventFactory(
                event_source='server',
                synthesizer='enrollment_validation',
            )
            self.generate_output = self._synthetic_event

        # Create list of events in reverse order, as processing goes backwards
        # from validation states.
        self.sorted_events = [
            EnrollmentEvent(timestamp, event_type, mode, validation_info)
            for timestamp, event_type, mode, validation_info in sorted(
                events, reverse=True)
        ]

        self._reorder_within_dumps()

        if self.expected_validation:
            self._check_for_missing_validation()

        # Add a marker event to signal the beginning of the interval.
        initial_state = EnrollmentEvent(None,
                                        SENTINEL,
                                        mode='honor',
                                        validation_info=None)
        self.sorted_events.append(initial_state)
Example #4
0
    def setUp(self):
        self.task_class = CourseEnrollmentValidationTask
        super(CourseEnrollmentValidationTaskMapTest, self).setUp()

        self.initialize_ids()

        self.user_id = 21
        self.timestamp = "2013-12-17T15:38:32.805444"
        self.mode = 'honor'

        self.factory = SyntheticEventFactory(
            timestamp=self.timestamp,
            event_source='server',
            event_type=ACTIVATED,
            synthesizer='enrollment_from_db',
            reason='db entry',
            user_id=self.user_id,
            course_id=self.course_id,
            org_id=self.org_id,
        )

        self.expected_key = (self.course_id, self.user_id)
    def setUp(self):
        self.initialize_ids()

        fake_param = luigi.DateIntervalParameter()
        self.task = CourseEnrollmentValidationTask(
            interval=fake_param.parse('2013-12-17'),
            output_root='/fake/output')
        self.task.init_local()

        self.user_id = 21
        self.timestamp = "2013-12-17T15:38:32.805444"
        self.mode = 'honor'

        self.factory = SyntheticEventFactory(
            timestamp=self.timestamp,
            event_source='server',
            event_type=ACTIVATED,
            synthesizer='enrollment_from_db',
            reason='db entry',
            user_id=self.user_id,
            course_id=self.course_id,
            org_id=self.org_id,
        )
    def setUp(self):
        self.task_class = CourseEnrollmentValidationTask
        super(CourseEnrollmentValidationTaskMapTest, self).setUp()

        self.initialize_ids()

        self.user_id = 21
        self.timestamp = "2013-12-17T15:38:32.805444"
        self.mode = 'honor'

        self.factory = SyntheticEventFactory(
            timestamp=self.timestamp,
            event_source='server',
            event_type=ACTIVATED,
            synthesizer='enrollment_from_db',
            reason='db entry',
            user_id=self.user_id,
            course_id=self.course_id,
            org_id=self.org_id,
        )

        self.expected_key = (self.course_id, self.user_id)
    def setUp(self):
        self.initialize_ids()

        fake_param = luigi.DateIntervalParameter()
        self.task = CourseEnrollmentValidationTask(
            interval=fake_param.parse('2013-12-17'),
            output_root='/fake/output'
        )
        self.task.init_local()

        self.user_id = 21
        self.timestamp = "2013-12-17T15:38:32.805444"
        self.mode = 'honor'

        self.factory = SyntheticEventFactory(
            timestamp=self.timestamp,
            event_source='server',
            event_type=ACTIVATED,
            synthesizer='enrollment_from_db',
            reason='db entry',
            user_id=self.user_id,
            course_id=self.course_id,
            org_id=self.org_id,
        )
Example #8
0
class CourseEnrollmentValidationTaskMapTest(InitializeOpaqueKeysMixin,
                                            MapperTestMixin,
                                            unittest.TestCase):
    """
    Tests to verify that event log parsing by mapper works correctly.
    """
    def setUp(self):
        self.task_class = CourseEnrollmentValidationTask
        super(CourseEnrollmentValidationTaskMapTest, self).setUp()

        self.initialize_ids()

        self.user_id = 21
        self.timestamp = "2013-12-17T15:38:32.805444"
        self.mode = 'honor'

        self.factory = SyntheticEventFactory(
            timestamp=self.timestamp,
            event_source='server',
            event_type=ACTIVATED,
            synthesizer='enrollment_from_db',
            reason='db entry',
            user_id=self.user_id,
            course_id=self.course_id,
            org_id=self.org_id,
        )

        self.expected_key = (self.course_id, self.user_id)

    def _create_event_log_line(self, **kwargs):
        """Create an event log with test values, as a JSON string."""
        return json.dumps(self._create_event_dict(**kwargs))

    def _create_event_dict(self, **kwargs):
        """Create an event log with test values, as a dict."""
        event_data = {
            'course_id': self.course_id,
            'user_id': self.user_id,
            'mode': self.mode,
        }
        event = self.factory.create_event_dict(event_data, **kwargs)
        return event

    def test_non_enrollment_event(self):
        line = 'this is garbage'
        self.assert_no_map_output_for(line)

    def test_unparseable_enrollment_event(self):
        line = 'this is garbage but contains edx.course.enrollment'
        self.assert_no_map_output_for(line)

    def test_missing_event_type(self):
        event_dict = self._create_event_dict()
        event_dict['old_event_type'] = event_dict['event_type']
        del event_dict['event_type']
        line = json.dumps(event_dict)
        self.assert_no_map_output_for(line)

    def test_nonenroll_event_type(self):
        line = self._create_event_log_line(
            event_type='edx.course.enrollment.unknown')
        self.assert_no_map_output_for(line)

    def test_bad_datetime(self):
        line = self._create_event_log_line(time='this is a bogus time')
        self.assert_no_map_output_for(line)

    def test_bad_event_data(self):
        line = self._create_event_log_line(event=["not an event"])
        self.assert_no_map_output_for(line)

    def test_illegal_course_id(self):
        line = self._create_event_log_line(event={
            "course_id": ";;;;bad/id/val",
            "user_id": self.user_id
        })
        self.assert_no_map_output_for(line)

    def test_missing_user_id(self):
        line = self._create_event_log_line(event={"course_id": self.course_id})
        self.assert_no_map_output_for(line)

    def test_good_enroll_event(self):
        line = self._create_event_log_line()
        expected_value = (self.timestamp, ACTIVATED, self.mode, None)
        self.assert_single_map_output(line, self.expected_key, expected_value)

    def test_good_unenroll_event(self):
        line = self._create_event_log_line(event_type=DEACTIVATED)
        expected_value = (self.timestamp, DEACTIVATED, self.mode, None)
        self.assert_single_map_output(line, self.expected_key, expected_value)

    def test_good_mode_change_event(self):
        line = self._create_event_log_line(event_type=MODE_CHANGED)
        expected_value = (self.timestamp, MODE_CHANGED, self.mode, None)
        self.assert_single_map_output(line, self.expected_key, expected_value)

    def test_good_validation_event(self):
        validation_info = {
            'is_active': True,
            'created': '2012-07-24T12:37:32.000000',
            'dump_start': '2014-10-08T04:52:48.154228',
            'dump_end': '2014-10-08T04:57:38.145282',
        }
        event_info = {
            "course_id": self.course_id,
            "user_id": self.user_id,
            "mode": self.mode,
        }
        event_info.update(validation_info)
        line = self._create_event_log_line(event_type=VALIDATED,
                                           event=event_info)
        expected_value = (self.timestamp, VALIDATED, self.mode,
                          validation_info)
        self.assert_single_map_output(line, self.expected_key, expected_value)
class CreateEnrollmentValidationEventsTask(MultiOutputMapReduceJobTask):
    """
    Convert a database dump of course enrollment into log files of validation events.

    Read from a directory location that points to a Sqoop dump of student_courseenrollment
    table.  Use map reduce simply because it allows the multiple file output to be read
    uniformly.  But it allows us to also separate the enrollment results into separate
    courses so that validation runs can be more fine-grained.

    The date for the synthesized events is the end time of the Sqoop dump.  This
    is when the particular enrollment states were observed.

    Parameters:

        source_dir: the URL of the location of the desired database dump.  This should
            include the 'dt=<date>' partition specification.

    Other parameters are defined by MultiOutputMapReduceJobTask.

    """
    # Note: we could just read the corresponding validation data into
    # the reducer.  So this would just need to produce reducer input
    # instead of mapper input.  Problem with that is that if there
    # were courses for which there were database entries but no
    # events, they wouldn't get validated.  So we put the events into
    # the mapper to make sure all courses get processed.

    # This defines the directory (with the dt=<date> partition) that contains
    # the desired database dump.
    source_dir = luigi.Parameter()

    def requires_hadoop(self):
        # Check first if running locally with Sqoop output.
        target = get_target_from_url(self.source_dir)
        if isinstance(target, luigi.LocalTarget) and os.path.isdir(self.source_dir):
            files = [f for f in os.listdir(self.source_dir) if f.startswith("part")]
            for filename in files:
                yield ExternalURL(url_path_join(self.source_dir, filename))
        else:
            yield ExternalURL(self.source_dir)

    def init_local(self):
        super(CreateEnrollmentValidationEventsTask, self).init_local()

        # need to determine the date of the input, by reading the appropriate
        # metadata file.  File looks like this:
        # {"start_time": "2014-10-08T04:52:48.154228", "end_time": "2014-10-08T04:55:18.269070"}

        metadata_target = self._get_metadata_target()
        with metadata_target.open('r') as metadata_file:
            metadata = json.load(metadata_file)
            self.dump_start_time = metadata["start_time"]
            self.dump_end_time = metadata["end_time"]
            log.debug("Found self.dump_start_time = %s  end_time = %s", self.dump_start_time, self.dump_end_time)
            self.dump_date = ''.join((self.dump_start_time.split('T')[0]).split('-'))

        # Set the timestamp of all events to be the dump's end time.
        # The events that are actually dumped are not within a transaction,
        # so the actual event time may be earlier, anywhere up to the dump's start time.
        self.factory = SyntheticEventFactory(
            timestamp=self.dump_end_time,
            event_source='server',
            event_type=VALIDATED,
            synthesizer='enrollment_from_db',
            reason='db entry'
        )

    def _get_metadata_target(self):
        """Returns target for metadata file from the given dump."""
        # find the .metadata file in the source directory.
        metadata_path = url_path_join(self.source_dir, ".metadata")
        return get_target_from_url(metadata_path)

    def mapper(self, line):
        fields = line.split('\x01')
        if len(fields) != 6:
            log.error("Encountered bad input: %s", line)
            return

        (_db_id, user_id_string, encoded_course_id, mysql_created, mysql_is_active, mode) = fields

        # `created` is of the form '2012-07-25 12:26:22.0', coming out of
        # mysql.  Convert it to isoformat.
        created = mysql_datetime_to_isoformat(mysql_created)
        # `is_active` should be a boolean and `user_id` is an int.
        is_active = (mysql_is_active == "true")
        user_id = int(user_id_string)

        # Note that we do not have several standard properties that we
        # might expect in such an event.  These include a username,
        # host, session_id, agent.  These values will be stubbed by
        # the factory as empty strings.

        course_id = encoded_course_id.decode('utf-8')
        # data for the particular type of event:
        event_data = {
            'course_id': course_id,
            'user_id': user_id,
            'mode': mode,
            'is_active': is_active,
            'created': created,
            'dump_start': self.dump_start_time,
            'dump_end': self.dump_end_time,
        }

        # stuff for context:
        event_properties = {
            'user_id': user_id,
            'course_id': course_id,
            'org_id': opaque_key_util.get_org_id_for_course(course_id),
        }

        event = self.factory.create_event(event_data, **event_properties)

        # Use the original utf-8 version of the course_id as the key.
        # (Note that if we want everything zipped into a single file,
        # then we can just pass a single dummy value for the key instead of
        # breaking the output out by course_id.)
        yield encoded_course_id, event

    def multi_output_reducer(self, _key, values, output_file):
        with gzip.GzipFile(mode='wb', fileobj=output_file) as outfile:
            for value in values:
                outfile.write(value)
                outfile.write('\n')

    def output_path_for_key(self, course_id):
        filename_safe_course_id = opaque_key_util.get_filename_safe_course_id(course_id, '_')
        filename = u'{course_id}_enroll_validated_{dumpdate}.log.gz'.format(
            course_id=filename_safe_course_id,
            dumpdate=self.dump_date,
        )
        return url_path_join(self.output_root, filename)
class ValidateEnrollmentForEvents(object):
    """
    Collects and validates events for a given user in a given course.
    """
    def __init__(self, course_id, user_id, interval, events, **kwargs):
        self.course_id = course_id
        self.user_id = user_id
        self.interval = interval
        self.creation_timestamp = None
        self.tuple_output = kwargs.get('tuple_output')
        self.include_nonstate_changes = kwargs.get('include_nonstate_changes')
        self.generate_before = kwargs.get('generate_before')
        self.lower_bound_date_string = kwargs.get('lower_bound_date_string')
        self.earliest_timestamp = kwargs.get('earliest_timestamp')
        self.expected_validation = kwargs.get('expected_validation')

        if self.tuple_output:
            self.generate_output = self._create_tuple
        else:
            self.factory = SyntheticEventFactory(
                event_source='server',
                synthesizer='enrollment_validation',
            )
            self.generate_output = self._synthetic_event

        # Create list of events in reverse order, as processing goes backwards
        # from validation states.
        self.sorted_events = [
            EnrollmentEvent(timestamp, event_type, mode, validation_info)
            for timestamp, event_type, mode, validation_info in sorted(events, reverse=True)
        ]

        self._reorder_within_dumps()

        if self.expected_validation:
            self._check_for_missing_validation()

        # Add a marker event to signal the beginning of the interval.
        initial_state = EnrollmentEvent(None, SENTINEL, mode='honor', validation_info=None)
        self.sorted_events.append(initial_state)

    def _reorder_within_dumps(self):
        """
        Fix the timestamp of a validation event if an enrollment event occurs during the dump.
        """
        num_events = len(self.sorted_events) - 1
        for index in range(num_events):
            event = self.sorted_events[index]
            prev_event = self.sorted_events[index + 1]
            is_nonvalidate_during_validate = (
                event.event_type == VALIDATED and
                prev_event.event_type != VALIDATED and
                event.is_during_dump(prev_event.timestamp)
            )
            if is_nonvalidate_during_validate:
                is_active_is_inconsistent = (
                    (event.is_active and prev_event.event_type == DEACTIVATED) or
                    (not event.is_active and prev_event.event_type == ACTIVATED)
                )
                mode_is_inconsistent = (event.mode != prev_event.mode and prev_event.event_type == MODE_CHANGED)

                if is_active_is_inconsistent or mode_is_inconsistent:
                    # Change the timestamp of the validation event to precede
                    # the other event, and swap them.
                    event.timestamp = add_microseconds(prev_event.timestamp, -1)
                    self.sorted_events[index] = prev_event
                    self.sorted_events[index + 1] = event

    def _check_for_missing_validation(self):
        """Check last event to see if it is missing an expected validation event."""
        if len(self.sorted_events) > 0 and self.sorted_events[0].timestamp < self.expected_validation:
            validation_info = {
                'is_active': False,
                'created': None,
                'dump_start': None,
                'dump_end': None,
            }
            missing_event = [EnrollmentEvent(
                self.expected_validation, MISSING, mode=None, validation_info=validation_info
            )]
            missing_event.extend(self.sorted_events)
            self.sorted_events = missing_event

    def missing_enrolled(self):
        """
        A synthetic event is yielded for each transition in user's events for which a real event is missing.

        Yields:
            json-encoded string representing a synthetic event, or a tuple.
        """
        # The last element of the list is a placeholder indicating the beginning of the interval.
        # Don't process it.
        num_events = len(self.sorted_events) - 1

        self._initialize_state(self.sorted_events[0])
        all_missing_events = []
        for index in range(num_events):
            event = self.sorted_events[index + 1]
            missing_events = self._check_event(event)
            if missing_events:
                all_missing_events.extend(missing_events)

        return all_missing_events

    def _create_tuple(self, timestamp, event_type, mode, reason, after=None, before=None):
        """Returns a tuple representation of the output, for TSV-based debugging."""
        datestamp = eventlog.timestamp_to_datestamp(timestamp)
        return datestamp, (self.course_id, self.user_id, timestamp, event_type, mode, reason, after, before)

    def _synthetic_event(self, timestamp, event_type, mode, reason, after=None, before=None):
        """Create a synthetic event."""
        # data specific to course enrollment events:
        event_data = {
            'course_id': self.course_id,
            'user_id': self.user_id,
            'mode': mode,
        }

        event_properties = {
            # main properties:
            'time': timestamp,
            'event_type': event_type,
            # stuff for context:
            'user_id': self.user_id,
            'course_id': self.course_id,
            'org_id': opaque_key_util.get_org_id_for_course(self.course_id),
            # stuff for synthesized:
            'reason': reason,
        }

        event = self.factory.create_event_dict(event_data, **event_properties)
        synthesized = event['synthesized']
        if after:
            synthesized['after_time'] = after
        if before:
            synthesized['before_time'] = before

        datestamp = eventlog.timestamp_to_datestamp(timestamp)
        return datestamp, json.dumps(event)

    def _truncate_timestamp(self, timestamp):
        """Makes sure that timestamp is no earlier than limit specified, if any."""
        if self.earliest_timestamp and timestamp < self.earliest_timestamp:
            return self.earliest_timestamp
        else:
            return timestamp

    def _get_fake_timestamp(self, after, before):
        """
        Pick a time in an interval.

        Picks a microsecond after `after`, else a microsecond before `before`.

        Input and output values are ISO format strings.
        """
        # Just pick the time at the beginning of the interval.
        if after:
            # Add a microsecond to 'after'
            return add_microseconds(after, 1)
        else:
            # Subtract a microsecond from 'before'
            return add_microseconds(before, -1)

    def _get_reason_string(self, prev_event, curr_event_string, curr_mode=None):
        """Provide a readable string giving the reason for generating the synthetic event."""
        if curr_mode:
            return "{prev} => {curr} ({prev_mode}=>{curr_mode})".format(
                prev=prev_event.get_state_string(),
                curr=curr_event_string,
                prev_mode=prev_event.mode,
                curr_mode=curr_mode,
            )
        else:
            return "{prev} => {curr}".format(
                prev=prev_event.get_state_string(),
                curr=curr_event_string,
            )

    def _initialize_state(self, event):
        """Define initial values for validation state machine."""
        self._update_state(event)
        # If the most-recent event is a mode-change event, then we don't
        # know from it what the activation is.
        if event.event_type == MODE_CHANGED:
            self.activation_type = None

    def _update_state(self, event):
        """Define current values for validation state machine."""
        # Some events define an activation state.
        if event.event_type != MODE_CHANGED:
            self.activation_type = event.event_type
            self.activation_state = event.get_state_string()
            self.activation_timestamp = event.timestamp

        # Most events set mode.
        self.mode_changed = (event.event_type == MODE_CHANGED)
        if event.event_type != MISSING:
            self.current_mode = event.mode
            self.mode_type = event.event_type
            self.mode_state = event.get_state_string()
            self.mode_timestamp = event.timestamp
        else:
            self.current_mode = None

        # Only validation events define a created timestamp and activation state.
        if event.event_type == VALIDATED:
            self.currently_active = event.is_active

            if self.creation_timestamp:
                # compare with previously-viewed (i.e. later-in-time) validation:
                if event.created != self.creation_timestamp:
                    log.error("Encountered validation with different creation timestamp: %s => %s",
                              event.created, self.creation_timestamp)
            # Use the earliest validation:
            self.creation_timestamp = event.created

    def _check_for_mode_change(self, prev_event, last_timestamp):
        """Check if a mode-change event should be synthesized."""
        # There are several reasons to suppress a mode-change.
        # The most obvious is if there is no change in mode.
        # Other cases include:
        #  * No "current" mode (e.g. current event is "MISSING").
        #  * Current event is known to reset the mode, so no comparison
        #    is needed (or possible).  This includes MODE_CHANGED and ACTIVATE
        #    events.
        #  * No previous event (i.e. previous event is "SENTINEL").
        suppress_mode_change = (
            prev_event.mode == self.current_mode or
            not self.current_mode or
            self.mode_changed or
            self.mode_type == ACTIVATED or
            prev_event.event_type == SENTINEL
        )

        if suppress_mode_change:
            return []
        else:
            curr = self.mode_timestamp
            timestamp = self._get_fake_timestamp(last_timestamp, curr)
            reason = self._get_reason_string(prev_event, self.mode_state, self.current_mode)
            return [self.generate_output(timestamp, MODE_CHANGED, self.current_mode, reason, last_timestamp, curr)]

    def _check_on_activated(self, generate_output_for_event):
        """Check if a deactivation event should be synthesized after an activation event."""
        if self.activation_type == ACTIVATED and self.include_nonstate_changes:
            # Duplicate activate event (a/a).
            return [generate_output_for_event(DEACTIVATED)]
        elif self.activation_type == DEACTIVATED:
            pass  # normal case
        elif self.activation_type == VALIDATED and self.currently_active:
            pass  # normal case
        elif self.activation_type == VALIDATED and not self.currently_active:
            # Missing deactivate event (a/vi)
            return [generate_output_for_event(DEACTIVATED)]
        elif self.activation_type == MISSING:
            # Missing deactivate event (a/m)
            return [generate_output_for_event(DEACTIVATED)]
        return []

    def _check_on_deactivated(self, generate_output_for_event):
        """Check if an activation event should be synthesized after a deactivation event."""
        if self.activation_type == ACTIVATED:
            pass  # normal case
        elif self.activation_type == DEACTIVATED and self.include_nonstate_changes:
            # Duplicate deactivate event (d/d).
            return [generate_output_for_event(ACTIVATED)]
        elif self.activation_type == VALIDATED and not self.currently_active:
            pass  # normal case
        elif self.activation_type == VALIDATED and self.currently_active:
            # Missing activate event (d/va)
            return [generate_output_for_event(ACTIVATED)]
        elif self.activation_type == MISSING and self.include_nonstate_changes:
            # Missing validation event (d/m) -- generate a second deactivate event
            # just to mark the inconsistency.
            return [generate_output_for_event(DEACTIVATED)]
        return []

    def _check_on_validation(self, prev_event, generate_output_for_event):
        """Check if an event should be synthesized after a validation event."""
        if self.activation_type == ACTIVATED:
            if prev_event.is_active and self.include_nonstate_changes:
                # Missing deactivate (va/a)
                return [generate_output_for_event(DEACTIVATED)]
        elif self.activation_type == DEACTIVATED:
            if not prev_event.is_active and self.include_nonstate_changes:
                # Missing activate (vi/d)
                return [generate_output_for_event(ACTIVATED)]
        elif self.activation_type == VALIDATED:
            if prev_event.is_active and not self.currently_active:
                # Missing deactivate (va/vi)
                return [generate_output_for_event(DEACTIVATED)]
            elif not prev_event.is_active and self.currently_active:
                # Missing activate (vi/va)
                return [generate_output_for_event(ACTIVATED)]
        return []

    def _check_earliest_event(self, reason):
        """Check if an event should be synthesized before the first real event."""
        curr = self.activation_timestamp

        if self.activation_type == ACTIVATED:
            pass  # normal case
        elif self.activation_type == DEACTIVATED:
            # If we had a validation after the deactivation,
            # and it provided a creation_timestamp within the interval,
            # then there should be an activate within the interval.
            if self.creation_timestamp and (
                    self.generate_before or
                    self.creation_timestamp >= self.lower_bound_date_string):
                timestamp = self._truncate_timestamp(self.creation_timestamp)
                return [(self.generate_output(
                    timestamp, ACTIVATED, self.current_mode, reason, self.creation_timestamp, curr
                ))]
            elif self.generate_before:
                # For now, hack the timestamp by making it a little before the deactivate,
                # so that it at least has a value.
                timestamp = self._get_fake_timestamp(None, curr)
                return [(self.generate_output(timestamp, ACTIVATED, self.current_mode, reason, None, curr))]

        elif self.activation_type == VALIDATED:
            creation_timestamp = self._truncate_timestamp(self.creation_timestamp)

            if not self.generate_before and self.creation_timestamp < self.lower_bound_date_string:
                # If we are validating only within an interval and the create_timestamp
                # is outside this interval, we can't know whether the events are really
                # missing or just not included.
                pass
            elif self.currently_active:
                return [(self.generate_output(
                    creation_timestamp, ACTIVATED, self.current_mode, reason, self.creation_timestamp, curr
                ))]
            elif self.include_nonstate_changes:
                # There may be missing Activate and Deactivate events, or there may
                # just be an inactive table row that was created as part of an enrollment
                # flow, but no enrollment was completed.
                event1 = self.generate_output(
                    creation_timestamp, ACTIVATED, self.current_mode, reason, self.creation_timestamp, curr
                )
                timestamp = self._get_fake_timestamp(creation_timestamp, curr)
                event2 = self.generate_output(
                    timestamp, DEACTIVATED, self.current_mode, reason, self.creation_timestamp, curr
                )
                return [event1, event2]
        return []

    def _check_event(self, prev_event):
        """Compare a previous event with current state generated from later events. """
        prev_timestamp_for_mode_change = prev_event.timestamp

        missing = []
        if self.activation_type is not None:
            reason = self._get_reason_string(prev_event, self.activation_state)
            timestamp = self._get_fake_timestamp(prev_event.timestamp, self.activation_timestamp)

            def generate_output_for_event(event_type):
                """Wrapper to generate a synthetic event with common values."""
                return self.generate_output(
                    timestamp, event_type, prev_event.mode, reason, prev_event.timestamp, self.activation_timestamp
                )

            prev_type = prev_event.event_type
            if prev_type == ACTIVATED:
                missing.extend(self._check_on_activated(generate_output_for_event))
            elif prev_type == DEACTIVATED:
                missing.extend(self._check_on_deactivated(generate_output_for_event))
            elif prev_type == VALIDATED:
                missing.extend(self._check_on_validation(prev_event, generate_output_for_event))
            elif prev_type == SENTINEL:
                missing.extend(self._check_earliest_event(reason))

            if missing:
                prev_timestamp_for_mode_change = timestamp

        # Check for mode change for all events:
        missing.extend(self._check_for_mode_change(prev_event, prev_timestamp_for_mode_change))

        # Finally, set state for the next one.
        self._update_state(prev_event)

        return missing
class CourseEnrollmentValidationTaskMapTest(InitializeOpaqueKeysMixin, MapperTestMixin, unittest.TestCase):
    """
    Tests to verify that event log parsing by mapper works correctly.
    """
    def setUp(self):
        self.task_class = CourseEnrollmentValidationTask
        super(CourseEnrollmentValidationTaskMapTest, self).setUp()

        self.initialize_ids()

        self.user_id = 21
        self.timestamp = "2013-12-17T15:38:32.805444"
        self.mode = 'honor'

        self.factory = SyntheticEventFactory(
            timestamp=self.timestamp,
            event_source='server',
            event_type=ACTIVATED,
            synthesizer='enrollment_from_db',
            reason='db entry',
            user_id=self.user_id,
            course_id=self.course_id,
            org_id=self.org_id,
        )

        self.expected_key = (self.course_id, self.user_id)

    def _create_event_log_line(self, **kwargs):
        """Create an event log with test values, as a JSON string."""
        return json.dumps(self._create_event_dict(**kwargs))

    def _create_event_dict(self, **kwargs):
        """Create an event log with test values, as a dict."""
        event_data = {
            'course_id': self.course_id,
            'user_id': self.user_id,
            'mode': self.mode,
        }
        event = self.factory.create_event_dict(event_data, **kwargs)
        return event

    def test_non_enrollment_event(self):
        line = 'this is garbage'
        self.assert_no_map_output_for(line)

    def test_unparseable_enrollment_event(self):
        line = 'this is garbage but contains edx.course.enrollment'
        self.assert_no_map_output_for(line)

    def test_missing_event_type(self):
        event_dict = self._create_event_dict()
        event_dict['old_event_type'] = event_dict['event_type']
        del event_dict['event_type']
        line = json.dumps(event_dict)
        self.assert_no_map_output_for(line)

    def test_nonenroll_event_type(self):
        line = self._create_event_log_line(event_type='edx.course.enrollment.unknown')
        self.assert_no_map_output_for(line)

    def test_bad_datetime(self):
        line = self._create_event_log_line(time='this is a bogus time')
        self.assert_no_map_output_for(line)

    def test_bad_event_data(self):
        line = self._create_event_log_line(event=["not an event"])
        self.assert_no_map_output_for(line)

    def test_illegal_course_id(self):
        line = self._create_event_log_line(event={"course_id": ";;;;bad/id/val", "user_id": self.user_id})
        self.assert_no_map_output_for(line)

    def test_missing_user_id(self):
        line = self._create_event_log_line(event={"course_id": self.course_id})
        self.assert_no_map_output_for(line)

    def test_good_enroll_event(self):
        line = self._create_event_log_line()
        expected_value = (self.timestamp, ACTIVATED, self.mode, None)
        self.assert_single_map_output(line, self.expected_key, expected_value)

    def test_good_unenroll_event(self):
        line = self._create_event_log_line(event_type=DEACTIVATED)
        expected_value = (self.timestamp, DEACTIVATED, self.mode, None)
        self.assert_single_map_output(line, self.expected_key, expected_value)

    def test_good_mode_change_event(self):
        line = self._create_event_log_line(event_type=MODE_CHANGED)
        expected_value = (self.timestamp, MODE_CHANGED, self.mode, None)
        self.assert_single_map_output(line, self.expected_key, expected_value)

    def test_good_validation_event(self):
        validation_info = {
            'is_active': True,
            'created': '2012-07-24T12:37:32.000000',
            'dump_start': '2014-10-08T04:52:48.154228',
            'dump_end': '2014-10-08T04:57:38.145282',
        }
        event_info = {
            "course_id": self.course_id,
            "user_id": self.user_id,
            "mode": self.mode,
        }
        event_info.update(validation_info)
        line = self._create_event_log_line(event_type=VALIDATED, event=event_info)
        expected_value = (self.timestamp, VALIDATED, self.mode, validation_info)
        self.assert_single_map_output(line, self.expected_key, expected_value)