def setup(self, db):
    """Create the users, courses and metrics rows shared by these tests.

    Everything is anchored on ``self.date_for`` (2018-10-01). The three users
    and three course overviews are dated sixty days earlier. One
    CourseDailyMetrics row is built per entry in ``CDM_INPUT_TEST_DATA`` and
    one SiteDailyMetrics row is built for the day before ``date_for``.

    ``db`` is not used directly in the body; presumably it is the
    pytest-django database fixture - TODO confirm.
    """
    self.date_for = datetime.date(2018, 10, 1)
    self.site = Site.objects.first()

    sixty_days_earlier = as_datetime(
        self.date_for - datetime.timedelta(days=60))
    self.users = [
        UserFactory(date_joined=sixty_days_earlier) for _ in range(3)
    ]
    self.course_overviews = [
        CourseOverviewFactory(created=sixty_days_earlier) for _ in range(3)
    ]
    self.cdm_recs = [
        CourseDailyMetricsFactory(
            site=self.site, date_for=self.date_for, **cdm_fields)
        for cdm_fields in CDM_INPUT_TEST_DATA
    ]
    self.prev_day_sdm = SiteDailyMetricsFactory(
        site=self.site,
        date_for=prev_day(self.date_for),
        **SDM_DATA[1])

    if is_multisite():
        # Tie the site's courses (and, when supported, its users) to a
        # single organization
        self.organization = OrganizationFactory(sites=[self.site])
        for course_overview in self.course_overviews:
            OrganizationCourseFactory(
                organization=self.organization,
                course_id=str(course_overview.id))
        if organizations_support_sites():
            for member in self.users:
                UserOrganizationMappingFactory(
                    user=member, organization=self.organization)
def extract(self, site, date_for=None, **kwargs):  # pylint: disable=unused-argument
    """Collect the daily metrics for ``site`` as of ``date_for``.

    The user count is taken from the site's users rather than from
    enrollments, since there can be registered users who have not enrolled.

    When ``date_for`` is falsy it defaults to the day before the current UTC
    date. Users and courses are counted when their ``date_joined`` /
    ``created`` value is earlier than ``as_datetime(next_day(date_for))``.

    Returns a dict with the keys ``todays_active_user_count``,
    ``cumulative_active_user_count``, ``total_user_count``, ``course_count``,
    ``total_enrollment_count`` and ``mau``.

    TODO: Exclude non-students from the user count
    """
    if not date_for:
        utc_now = datetime.datetime.utcnow().replace(tzinfo=utc)
        date_for = prev_day(utc_now.date())

    # Exclusive upper bound shared by the user and course counts
    cutoff = as_datetime(next_day(date_for))

    total_users = get_users_for_site(site).filter(
        date_joined__lt=cutoff).count()
    total_courses = get_courses_for_site(site).filter(
        created__lt=cutoff).count()
    active_today = get_site_active_users_for_date(site, date_for).count()
    monthly_active = site_mau_1g_for_month_as_of_day(site, date_for)
    cumulative_active = get_previous_cumulative_active_user_count(
        site, date_for) + active_today

    return {
        'todays_active_user_count': active_today,
        'cumulative_active_user_count': cumulative_active,
        'total_user_count': total_users,
        'course_count': total_courses,
        'total_enrollment_count': get_total_enrollment_count(site, date_for),
        'mau': monthly_active.count(),
    }
def calc_from_user_model():
    """Count the distinct site users who joined within the period.

    ``site``, ``start_date`` and ``end_date`` are not parameters; they are
    read from the enclosing scope. A user is counted when ``date_joined`` is
    later than ``as_datetime(prev_day(start_date))`` and earlier than
    ``as_datetime(next_day(end_date))``.
    """
    joined_after = as_datetime(prev_day(start_date))
    joined_before = as_datetime(next_day(end_date))
    site_users = figures.sites.get_users_for_site(site)
    joined_in_period = site_users.filter(
        date_joined__gt=joined_after,
        date_joined__lt=joined_before,
    )
    return joined_in_period.values('id').distinct().count()
def seed_course_overviews(data=None):
    """Create or update a CourseOverview record for each entry in ``data``.

    When ``data`` is falsy, the canned ``cans.COURSE_OVERVIEW_DATA`` records
    are used, extended with 20 generated course overviews (org ``'FOO'``) so
    that pagination can be exercised.

    Each record is expected to provide the keys ``id``, ``display_name``,
    ``org``, ``number``, ``created``, ``enrollment_start`` and
    ``enrollment_end``.
    """
    if not data:
        # Build a new list instead of using ``+=``: augmented assignment
        # extends the list in place, which would grow the shared
        # ``cans.COURSE_OVERVIEW_DATA`` constant by 20 records on every call.
        # ``range`` replaces the Python 2 only ``xrange``.
        generated = [
            generate_course_overview(i, org='FOO') for i in range(20)
        ]
        data = list(cans.COURSE_OVERVIEW_DATA) + generated

    for rec in data:
        course_id = rec['id']
        defaults = dict(
            display_name=rec['display_name'],
            org=rec['org'],
            display_org_with_default=rec['org'],
            number=rec['number'],
            created=as_datetime(rec['created']).replace(tzinfo=utc),
            # The course start/end are seeded from the enrollment window
            start=as_datetime(rec['enrollment_start']).replace(tzinfo=utc),
            end=as_datetime(rec['enrollment_end']).replace(tzinfo=utc),
            enrollment_start=as_datetime(
                rec['enrollment_start']).replace(tzinfo=utc),
            enrollment_end=as_datetime(
                rec['enrollment_end']).replace(tzinfo=utc),
        )
        # The 'version' value is only set when not on the ginkgo release line
        if RELEASE_LINE != 'ginkgo':
            defaults['version'] = CourseOverview.VERSION
        CourseOverview.objects.update_or_create(
            id=as_course_key(course_id),
            defaults=defaults,
        )
def extract(self, date_for=None, **kwargs):
    """Collect the daily metrics across all users and courses.

    The user count is taken from the User model rather than from
    enrollments, since there can be registered users who have not enrolled.

    When ``date_for`` is falsy it defaults to the day before the current UTC
    date. Users and courses are counted when their ``date_joined`` /
    ``created`` value is earlier than ``as_datetime(next_day(date_for))``.

    Returns a dict with the keys ``todays_active_user_count``,
    ``cumulative_active_user_count``, ``total_user_count``, ``course_count``
    and ``total_enrollment_count``.

    TODO: Exclude non-students from the user count
    """
    if not date_for:
        utc_now = datetime.datetime.utcnow().replace(tzinfo=utc)
        date_for = prev_day(utc_now.date())

    # Exclusive upper bound shared by the user and course counts
    cutoff = as_datetime(next_day(date_for))

    total_users = get_user_model().objects.filter(
        date_joined__lt=cutoff).count()
    total_courses = CourseOverview.objects.filter(
        created__lt=cutoff).count()
    active_today = get_active_user_count_for_date(date_for)
    cumulative_active = get_previous_cumulative_active_user_count(
        date_for) + active_today

    return {
        'todays_active_user_count': active_today,
        'cumulative_active_user_count': cumulative_active,
        'total_user_count': total_users,
        'course_count': total_courses,
        'total_enrollment_count': get_total_enrollment_count(date_for),
    }
def get_active_users_for_time_period(site, start_date, end_date, course_ids=None):
    """Return the number of the site's users active in the time period.

    Activity is determined by counting the distinct student ids of
    StudentModule records whose ``modified`` value is later than
    ``as_datetime(prev_day(start_date))`` and earlier than
    ``as_datetime(next_day(end_date))``, restricted to the users returned by
    ``figures.sites.get_user_ids_for_site(site)``.

    If ``course_ids`` is given and non-empty, only records for those courses
    are considered.

    A ``modified__range=(as_date(start_date), as_date(end_date))`` filter is
    not used because it raises timezone warnings.
    """
    # Get list of learners for the site
    user_ids = figures.sites.get_user_ids_for_site(site)
    filter_args = dict(
        modified__gt=as_datetime(prev_day(start_date)),
        modified__lt=as_datetime(next_day(end_date)),
        student_id__in=user_ids,
    )
    if course_ids:
        # The StudentModule field is ``course_id``; the previous
        # ``course_ids__in`` key does not name a field that the other
        # StudentModule queries use, so it would fail to resolve
        filter_args['course_id__in'] = course_ids

    return StudentModule.objects.filter(
        **filter_args).values('student__id').distinct().count()
def setup(self, db, settings):
    """Create two courses, three enrollments and one StudentModule record.

    Two enrollments belong to the first course ("our" course) and one to the
    second course. The StudentModule record is built from the first
    enrollment and dated ``self.date_for``.

    ``db`` is not used directly in the body; presumably it is the
    pytest-django database fixture - TODO confirm.
    """
    # Data that is the same for standalone and multisite modes
    self.date_for = utc_yesterday()
    self.site = Site.objects.first()
    self.courses = [CourseOverviewFactory() for _ in range(2)]
    self.enrollments = [
        CourseEnrollmentFactory(course_id=self.courses[course_index].id)
        for course_index in (0, 0, 1)
    ]
    date_for_as_datetime = as_datetime(self.date_for)
    self.ce0_sm = StudentModuleFactory.from_course_enrollment(
        self.enrollments[0],
        created=date_for_as_datetime,
        modified=date_for_as_datetime)

    # Site mode specifics
    if not organizations_support_sites():
        self.org = OrganizationFactory()
    else:
        settings.FEATURES['FIGURES_IS_MULTISITE'] = True
        self.org = OrganizationFactory(sites=[self.site])
        for course in self.courses:
            OrganizationCourseFactory(organization=self.org,
                                      course_id=str(course.id))
        map_users_to_org(
            self.org, [enrollment.user for enrollment in self.enrollments])
        # These tests focus on a single enrollment, so no other site data
        # is stood up. If that turns out to be needed, this is the place
        # to add it.
def seed_student_modules_fixed(data=None):
    """Seed StudentModule records from a fixed data set.

    When ``data`` is falsy, ``STUDENT_MODULE_DATA`` is used. Each record is
    expected to provide the keys ``username``, ``course_id``, ``created``
    and ``modified``. The user is looked up by username, so it must already
    exist.
    """
    if not data:
        data = STUDENT_MODULE_DATA
    for rec in data:
        StudentModule.objects.update_or_create(
            student=get_user_model().objects.get(username=rec['username']),
            course_id=as_course_key(rec['course_id']),
            # The field is ``created``; the previous ``create=`` keyword was
            # a typo that does not name a StudentModule field
            created=as_datetime(rec['created']),
            modified=as_datetime(rec['modified']),
        )
def setup(self, db):
    """Create the users, courses and metrics rows shared by these tests.

    Everything is anchored on ``self.date_for`` (2018-10-01). The three users
    and three course overviews are dated sixty days earlier. One
    CourseDailyMetrics row is built per entry in ``CDM_INPUT_TEST_DATA`` and
    one SiteDailyMetrics row is built for the day before ``date_for``.

    ``db`` is not used directly in the body; presumably it is the
    pytest-django database fixture - TODO confirm.
    """
    self.date_for = datetime.date(2018, 10, 1)

    sixty_days_earlier = as_datetime(
        self.date_for - datetime.timedelta(days=60))
    self.users = [
        UserFactory(date_joined=sixty_days_earlier) for _ in range(3)
    ]
    self.course_overviews = [
        CourseOverviewFactory(created=sixty_days_earlier) for _ in range(3)
    ]
    self.cdm_recs = [
        CourseDailyMetricsFactory(date_for=self.date_for, **cdm_fields)
        for cdm_fields in CDM_INPUT_TEST_DATA
    ]
    self.prev_day_sdm = SiteDailyMetricsFactory(
        date_for=prev_day(self.date_for),
        **SDM_PREV_DAY[1])
def seed_users(data=None):
    """Create a user (and optionally a profile) for each record in ``data``.

    When ``data`` is falsy, ``cans.USER_DATA`` is used. Each record must
    provide ``username``, ``password`` and ``email``; ``is_staff``,
    ``is_superuser`` and ``profile`` are optional. Every user gets a
    ``date_joined`` picked by ``FAKE.date_between`` from the window ending
    at ``LAST_DAY``.

    A record that raises ``IntegrityError`` is reported with ``print`` and
    skipped.

    Returns the list of users that were saved.
    """
    if not data:
        data = cans.USER_DATA

    earliest_join_date = days_from(LAST_DAY, DAYS_BACK * -1)
    seeded_users = []
    for rec in data:
        profile_rec = rec.get('profile', None)
        try:
            new_user = get_user_model().objects.create_user(
                username=rec['username'],
                password=rec['password'],
                email=rec['email'],
            )
            new_user.is_staff = rec.get('is_staff', False)
            new_user.is_superuser = rec.get('is_superuser', False)
            joined_on = FAKE.date_between(earliest_join_date, LAST_DAY)
            new_user.date_joined = as_datetime(joined_on).replace(tzinfo=utc)
            new_user.save()
            seeded_users.append(new_user)
            if profile_rec:
                UserProfile.objects.create(
                    user=new_user,
                    name=profile_rec['fullname'],
                    gender=profile_rec.get('gender', None),
                    country=profile_rec.get('country', None),
                )
        except IntegrityError as e:
            print('skipping duplicate user email {}'.format(e))
    return seeded_users
def load(self, date_for=None, force_update=False, **_kwargs):
    """Return the course daily metrics record for ``date_for``.

    When ``date_for`` is falsy it defaults to the day before the current UTC
    date; otherwise it is converted with ``as_datetime`` and given the UTC
    tzinfo.

    If a record already exists and ``force_update`` is False, that record is
    returned as ``(record, False)`` without extracting data again, which
    saves an unnecessary call to extract data. Otherwise the data is
    collected with ``self.get_data`` and the result of
    ``self.save_metrics`` is returned.

    Per the original notes on this method, a ValidationError is raised if
    invalid data is attempted to be saved to the course daily metrics model
    instance.

    TODO: clean up how we do this. We want to be able to call the loader
    with an existing data set (not having to call the extractor), but we
    need to make sure that the metrics row 'date_for' is the same as
    provided in the data.
    """
    if date_for:
        date_for = as_datetime(date_for).replace(tzinfo=utc)
    else:
        utc_now = datetime.datetime.utcnow().replace(tzinfo=utc)
        date_for = prev_day(utc_now.date())

    try:
        existing = CourseDailyMetrics.objects.get(course_id=self.course_id,
                                                  date_for=date_for)
    except CourseDailyMetrics.DoesNotExist:
        # Record not found, move on to creating it
        pass
    else:
        # Record found, only update if the force update flag is True
        if not force_update:
            return (existing, False)

    data = self.get_data(date_for=date_for)
    return self.save_metrics(date_for=date_for, data=data)
def test_extract(self, monkeypatch):
    """Check the values returned by ``SiteDailyMetricsExtractor.extract``.

    ``pipeline_sdm.get_student_modules_for_site`` is replaced with a stand-in
    that creates two users with two StudentModule records each, all modified
    on ``self.date_for``, so that two users are active "today".
    """
    expected_results = dict(
        cumulative_active_user_count=52,  # previous cumulative is 50
        todays_active_user_count=2,
        total_user_count=len(self.users),
        course_count=len(CDM_INPUT_TEST_DATA),
        total_enrollment_count=150,
    )
    assert not StudentModule.objects.count()
    modified = as_datetime(self.date_for)

    def mock_student_modules_for_site(site):
        users = [UserFactory() for i in range(2)]
        for user in users:
            StudentModuleFactory(student=user, modified=modified)
            StudentModuleFactory(student=user, modified=modified)
        return StudentModule.objects.filter(student__in=users)

    monkeypatch.setattr(pipeline_sdm, 'get_student_modules_for_site',
                        mock_student_modules_for_site)

    # Sanity check the fixture: courses and users must predate date_for
    for course in figures.sites.get_courses_for_site(self.site):
        assert course.created.date() < self.date_for
    for user in figures.sites.get_users_for_site(self.site):
        assert user.date_joined.date() < self.date_for

    actual = pipeline_sdm.SiteDailyMetricsExtractor().extract(
        site=self.site, date_for=self.date_for)

    # ``items()`` works on both Python 2 and 3; ``iteritems()`` is
    # Python 2 only
    for key, value in expected_results.items():
        assert actual[key] == value, 'failed on key: "{}"'.format(key)
def get_active_users_for_time_period(start_date, end_date, site=None, course_ids=None):
    """Return the number of users active in the time period.

    Activity is determined by counting the distinct student ids of
    StudentModule records whose ``modified`` value is later than
    ``as_datetime(prev_day(start_date))`` and earlier than
    ``as_datetime(next_day(end_date))``.

    If ``course_ids`` is given and non-empty, only records for those courses
    are considered. ``site`` is accepted but not used by this function.
    """
    filter_args = dict(
        # Both bounds apply to ``modified``. The lower bound previously
        # filtered on ``created``, which dropped users whose records were
        # created before the period but modified within it.
        modified__gt=as_datetime(prev_day(start_date)),
        modified__lt=as_datetime(next_day(end_date)),
    )
    if course_ids:
        # The StudentModule field is ``course_id``; the previous
        # ``course_ids__in`` key does not name a field that the other
        # StudentModule queries use, so it would fail to resolve
        filter_args['course_id__in'] = course_ids

    return StudentModule.objects.filter(
        **filter_args).values('student__id').distinct().count()
def get_course_enrollments(course_id, date_for):
    """Convenience method to get a filtered queryset of CourseEnrollment objects

    The queryset holds the enrollments for the given course whose
    ``created`` value is earlier than ``as_datetime(next_day(date_for))``.
    """
    course_key = as_course_key(course_id)
    created_before = as_datetime(next_day(date_for))
    return CourseEnrollment.objects.filter(course_id=course_key,
                                           created__lt=created_before)
def test_get_course_enrollments_for_course(self):
    """``get_course_enrollments`` returns the same enrollment ids as an
    equivalent direct CourseEnrollment query for the first course.
    """
    course_id = self.course_overviews[0].id

    direct_query = CourseEnrollment.objects.filter(
        course_id=course_id,
        created__lt=as_datetime(next_day(self.today)))
    expected_ids = direct_query.values_list('id', flat=True)

    found = pipeline_cdm.get_course_enrollments(course_id=course_id,
                                                date_for=self.today)
    found_ids = found.values_list('id', flat=True)

    assert set(found_ids) == set(expected_ids)
def get_active_learner_ids_today(course_id, date_for):
    """Get unique user ids for learners who are active today for the given
    course and date

    A learner counts as active when they have a StudentModule record for the
    course whose ``modified`` value falls on the calendar day of
    ``date_for``.

    Returns a queryset of distinct ``student__id`` values.
    """
    date_for_as_datetime = as_datetime(date_for)
    # Match on the calendar day. The previous
    # ``modified=as_datetime(date_for)`` was an exact-instant comparison, so
    # any record modified at another time on that day was missed.
    return StudentModule.objects.filter(
        course_id=as_course_key(course_id),
        modified__year=date_for_as_datetime.year,
        modified__month=date_for_as_datetime.month,
        modified__day=date_for_as_datetime.day,
    ).values_list('student__id', flat=True).distinct()
def seed_course_overviews(data=None):
    """Create or update a CourseOverview record for each entry in ``data``.

    When ``data`` is falsy, ``cans.COURSE_OVERVIEW_DATA`` is used. Each
    record is expected to provide the keys ``id``, ``display_name``,
    ``org``, ``number``, ``created``, ``enrollment_start`` and
    ``enrollment_end``.
    """
    records = data if data else cans.COURSE_OVERVIEW_DATA

    for rec in records:
        overview_fields = {
            'display_name': rec['display_name'],
            'org': rec['org'],
            'display_org_with_default': rec['org'],
            'number': rec['number'],
            'created': as_datetime(rec['created']).replace(tzinfo=utc),
            'enrollment_start': as_datetime(
                rec['enrollment_start']).replace(tzinfo=utc),
            'enrollment_end': as_datetime(
                rec['enrollment_end']).replace(tzinfo=utc),
        }
        CourseOverview.objects.update_or_create(
            id=as_course_key(rec['id']),
            defaults=overview_fields,
        )
def get_num_learners_completed(course_id, date_for):
    """Get the total number of certificates generated for the course up to
    the 'date_for' date

    Certificates are counted when ``created_date`` is earlier than
    ``as_datetime(next_day(date_for))``.

    We will need to relabel this to "certificates".
    We may want to get the number of certificates granted in the given day.
    """
    created_before = as_datetime(next_day(date_for))
    return GeneratedCertificate.objects.filter(
        course_id=as_course_key(course_id),
        created_date__lt=created_before,
    ).count()
def setup(self, db):
    """Create one course with four enrollments plus related records.

    Also created: one course access role per entry in ``self.COURSE_ROLES``,
    StudentModule records for yesterday and today, and three generated
    certificates dated 10, 20 and 30 days after the matching enrollment's
    ``created`` value.

    ``db`` is not used directly in the body; presumably it is the
    pytest-django database fixture - TODO confirm.
    """
    self.today = datetime.date(2018, 6, 1)
    self.course_overview = CourseOverviewFactory()

    # The enrollment factory is called with a different keyword on Ginkgo
    if OPENEDX_RELEASE == GINKGO:
        self.course_enrollments = [
            CourseEnrollmentFactory(course_id=self.course_overview.id)
            for _ in range(4)
        ]
    else:
        self.course_enrollments = [
            CourseEnrollmentFactory(course=self.course_overview)
            for _ in range(4)
        ]

    if organizations_support_sites():
        self.my_site = SiteFactory(domain='my-site.test')
        self.my_site_org = OrganizationFactory(sites=[self.my_site])
        OrganizationCourseFactory(organization=self.my_site_org,
                                  course_id=str(self.course_overview.id))
        for enrollment in self.course_enrollments:
            UserOrganizationMappingFactory(user=enrollment.user,
                                           organization=self.my_site_org)

    # The i-th role goes to the i-th enrollment's user
    self.course_access_roles = []
    for index, role in enumerate(self.COURSE_ROLES):
        enrollment = self.course_enrollments[index]
        self.course_access_roles.append(CourseAccessRoleFactory(
            user=enrollment.user,
            course_id=enrollment.course_id,
            role=role,
        ))

    # Create student modules for yesterday and today. Note that
    # self.student_modules is reassigned on each pass, so it ends up
    # holding only the records for today.
    for day in (prev_day(self.today), self.today):
        self.student_modules = [
            StudentModuleFactory(course_id=enrollment.course_id,
                                 student=enrollment.user,
                                 created=enrollment.created,
                                 modified=as_datetime(day))
            for enrollment in self.course_enrollments
        ]

    self.cert_days_to_complete = [10, 20, 30]
    self.expected_avg_cert_days_to_complete = 20
    self.generated_certificates = []
    for index, days in enumerate(self.cert_days_to_complete):
        enrollment = self.course_enrollments[index]
        self.generated_certificates.append(GeneratedCertificateFactory(
            user=enrollment.user,
            course_id=enrollment.course_id,
            created_date=enrollment.created + datetime.timedelta(days=days),
        ))
def get_days_to_complete(course_id, date_for):
    """Return a dict with a list of days to complete and errors

    For each certificate generated for the course with a ``created_date`` at
    or before ``as_datetime(date_for)``, the number of whole days between
    the learner's enrollment ``created`` value and the certificate
    ``created_date`` is collected.

    Returns ``dict(days=[...], errors=[...])``. Each error is a dict with
    the keys ``msg``, ``course_id`` and ``user_id``. An error is recorded
    when a learner has more than one enrollment for the course (the first
    one is still used) and when no enrollment is found (no days value is
    recorded).

    NOTE: This is a work in progress, as it has issues to resolve:
    * It returns the delta in days, so it works in ints
    * This means that if a learner starts at midnight and finishes just
      before midnight, then 0 days will be given

    NOTE: This has limited scaling. We ought to test it with 1k, 10k and
    100k cert records

    TODO: Change to use start_date, end_date with defaults such that
    start_date is open and end_date is today
    TODO: Consider collecting the total seconds rather than days. This will
    improve accuracy, but may actually not be that important
    TODO: Analyze the error based on the number of completions

    When we have to support scale, we can look into optimization techniques.
    """
    course_key = as_course_key(course_id)
    # NOTE(review): if as_datetime(date_for) is midnight at the start of
    # date_for, certificates created later on that day are excluded -
    # confirm that this is intended
    certificates = GeneratedCertificate.objects.filter(
        course_id=course_key,
        created_date__lte=as_datetime(date_for))

    days = []
    errors = []
    for cert in certificates:
        enrollments = CourseEnrollment.objects.filter(course_id=course_key,
                                                      user=cert.user)
        # How do we want to handle multiples?
        if enrollments.count() > 1:
            errors.append({
                'msg': 'Multiple CE records',
                'course_id': course_id,
                'user_id': cert.user.id,
            })
        try:
            first_enrollment = enrollments[0]
        except IndexError:
            # Sometimes a course enrollment is deleted after the cert is
            # generated. In that case just leave out that data.
            errors.append({
                'msg': 'No CourseEnrollment matching user course certificate',
                'course_id': course_id,
                'user_id': cert.user.id,
            })
        else:
            elapsed = cert.created_date - first_enrollment.created
            days.append(elapsed.days)
    return {'days': days, 'errors': errors}
def get_active_learner_ids_today(course_id, date_for):
    """Get unique user ids for learners who are active today for the given
    course and date

    A learner counts as active when they have a StudentModule record for the
    course whose ``modified`` year, month and day match ``date_for``.

    Note: When Figures no longer has to support Django 1.8, we can simplify
    this date check:
    https://docs.djangoproject.com/en/1.9/ref/models/querysets/#date
    """
    day = as_datetime(date_for)
    same_day_modules = StudentModule.objects.filter(
        course_id=as_course_key(course_id),
        modified__year=day.year,
        modified__month=day.month,
        modified__day=day.day,
    )
    return same_day_modules.values_list('student__id', flat=True).distinct()
def test_get_now_from_date(self):
    """``as_datetime`` on a date returns midnight of that date in UTC"""
    a_date = self.now.date()
    # datetime.time() is 00:00:00, so this is the very start of the day
    midnight = datetime.datetime.combine(a_date, datetime.time())
    expected = midnight.replace(tzinfo=utc)
    assert as_datetime(a_date) == expected
def seed_course_enrollments_for_course(course_id, users, max_days_back):
    """Seed a CourseEnrollment for each of ``users`` in the given course.

    Each enrollment gets a ``created`` date picked at random from 1 to
    ``abs(max_days_back)`` days before ``LAST_DAY``.
    """

    def enroll_date(max_days_back):
        days_back = random.randint(1, abs(max_days_back))
        return days_from(LAST_DAY, days_back * -1)

    for user in users:
        if VERBOSE:
            print('seeding course enrollment for user {}'.format(user.username))

        CourseEnrollment.objects.update_or_create(
            course_id=course_id,
            user=user,
            # The random date goes in ``defaults`` rather than in the lookup
            # arguments. As a lookup it never matches an existing
            # enrollment, so re-seeding would attempt to insert a duplicate
            # for the same user and course.
            defaults=dict(
                created=as_datetime(
                    enroll_date(max_days_back)).replace(tzinfo=utc),
            ),
        )
def test_get_active_user_count_for_date(self, monkeypatch):
    """Every user with a StudentModule record modified on ``date_for`` is
    returned by ``get_site_active_users_for_date``.

    ``pipeline_sdm.get_student_modules_for_site`` is replaced with a stand-in
    that creates two users with two StudentModule records each.
    """
    assert not get_user_model().objects.count()
    assert not StudentModule.objects.count()
    modified = as_datetime(self.date_for)

    def mock_student_modules_for_site(site):
        new_users = [UserFactory() for _ in range(2)]
        for new_user in new_users:
            # Two records per user, so distinct users != record count
            for _ in range(2):
                StudentModuleFactory(student=new_user, modified=modified)
        return StudentModule.objects.all()

    monkeypatch.setattr(pipeline_sdm,
                        'get_student_modules_for_site',
                        mock_student_modules_for_site)

    active_users = pipeline_sdm.get_site_active_users_for_date(
        site=self.site, date_for=self.date_for)
    assert active_users.count() == get_user_model().objects.count()
def load(self, site, date_for=None, force_update=False, **_kwargs):
    """Return the SiteDailyMetrics record for ``site`` and ``date_for``.

    When ``date_for`` is falsy it defaults to the day before the current UTC
    date; otherwise it is converted with ``as_datetime`` and given the UTC
    tzinfo.

    If ``force_update`` is False and a record already exists, it is returned
    as ``(record, False)`` without extracting data. Otherwise the data is
    extracted with ``self.extractor`` and written with ``update_or_create``,
    and ``(record, created)`` is returned.

    Architectural note: Initially, we're going to be explicit, requiring
    callers to specify the site model instance to be associated with the
    site specific metrics model(s) we are populating.

    TODOs: Add filtering for
    * Multi-tenancy
    * Course access groups
    """
    if date_for:
        date_for = as_datetime(date_for).replace(tzinfo=utc)
    else:
        utc_now = datetime.datetime.utcnow().replace(tzinfo=utc)
        date_for = prev_day(utc_now.date())

    # If we already have a record for date_for and force_update is False,
    # then skip getting the data
    if not force_update:
        try:
            existing = SiteDailyMetrics.objects.get(site=site,
                                                    date_for=date_for)
        except SiteDailyMetrics.DoesNotExist:
            # Proceed normally
            pass
        else:
            return (existing, False)

    data = self.extractor.extract(site=site, date_for=date_for)
    metric_fields = (
        'cumulative_active_user_count',
        'todays_active_user_count',
        'total_user_count',
        'course_count',
        'total_enrollment_count',
        'mau',
    )
    site_metrics, created = SiteDailyMetrics.objects.update_or_create(
        date_for=date_for,
        site=site,
        defaults=dict((field, data[field]) for field in metric_fields))
    return site_metrics, created
def test_mau_1g_for_month_as_of_day_first_day_next_month(db):
    """Test getting live MAU 1G values from StudentModule for the given day

    Quick-n-dirty data setup: we want to make sure we get the right records
    when the query happens on the first day of the next month. So we add
    StudentModule records for

    * the month before the one we want to capture
    * the month we want to capture
    * the month after the one we want to capture

    This sets up the scenario where the daily pipeline runs to capture MAU
    "as of" yesterday (the last day of the previous month). It should
    capture records for the previous month only: nothing from before it and
    nothing from the "current month".
    """
    date_for = as_date('2020-03-31')
    before_month = [as_datetime('2020-02-02'), as_datetime('2020-02-29')]
    after_month = [as_datetime('2020-04-01'),
                   as_datetime('2020-04-01 12:00')]
    in_month = [
        as_datetime('2020-03-01'),
        as_datetime('2020-03-15'),
        as_datetime('2020-03-31'),
        as_datetime('2020-03-31 12:00'),
    ]

    # Records outside the month must not show up in the results
    for outside_dt in before_month:
        StudentModuleFactory(created=outside_dt, modified=outside_dt)
    for outside_dt in after_month:
        StudentModuleFactory(created=outside_dt, modified=outside_dt)

    # Records inside the month are the ones we expect to get back
    modules_in_month = [
        StudentModuleFactory(created=inside_dt, modified=inside_dt)
        for inside_dt in in_month
    ]
    expected_user_ids = [module.student_id for module in modules_in_month]

    user_ids = mau_1g_for_month_as_of_day(
        sm_queryset=StudentModule.objects.all(), date_for=date_for)

    assert len(user_ids) == len(in_month)
    found_user_ids = {rec['student__id'] for rec in user_ids}
    assert found_user_ids == set(expected_user_ids)
def seed_course_completions():
    """Seed generated certificates for a random quarter of each course's
    enrollments.

    Each certificate gets a ``created_date`` picked by ``FAKE.date_between``
    from the enrollment's ``created`` value up to ``LAST_DAY``.
    """
    for course in CourseOverview.objects.all():
        enrollments = CourseEnrollment.objects.filter(course_id=course.id)

        # We just want a few of the enrollments to have completed.
        # First cut: have 25% of learners complete the course.
        completer_count = int(enrollments.count() * 0.25)

        # Note there is a performance hit for using '?'
        for enrollment in enrollments.order_by('?')[:completer_count]:
            completed_on = FAKE.date_between(enrollment.created, LAST_DAY)
            GeneratedCertificate.objects.create(
                user=enrollment.user,
                course_id=course.id,
                created_date=as_datetime(completed_on).replace(tzinfo=utc),
            )
def seed_student_modules():
    """Seed between one and five StudentModule records per course enrollment.

    We're assuming active students here. An improvement would be to skip a
    few and make others more active, following a normal distribution.
    """
    for enrollment in CourseEnrollment.objects.all():
        # Assert added to check whether it traps for this error:
        # ValueError: empty range for randrange() (1554681600,1554595201, -86399)
        assert enrollment.created <= LAST_DAY, "ce.created={}, LAST_DAY={}".format(
            enrollment.created, LAST_DAY)

        module_count = random.randint(1, 5)
        for _ in range(module_count):
            modified_on = FAKE.date_between(enrollment.created, LAST_DAY)
            StudentModule.objects.update_or_create(
                student=enrollment.user,
                course_id=enrollment.course_id,
                created=enrollment.created,
                modified=as_datetime(modified_on),
            )
def get_num_enrolled_in_exclude_admins(course_id, date_for):
    """Count the active enrollments for a course, leaving out staff,
    instructors and CCX coaches.

    Copied over from the
    CourseEnrollmentManager.num_enrolled_in_exclude_admins method and
    modified to only count enrollments whose ``created`` value is earlier
    than ``as_datetime(next_day(date_for))``.
    """
    # The roles are looked up on the course locator when the given key has
    # a truthy 'ccx' attribute
    if getattr(course_id, 'ccx', None):
        course_locator = course_id.to_course_locator()
    else:
        course_locator = course_id

    excluded_user_groups = [
        role_class(course_locator).users_with_role()
        for role_class in (CourseStaffRole,
                           CourseInstructorRole,
                           CourseCcxCoachRole)
    ]

    enrollments = CourseEnrollment.objects.filter(
        course_id=course_id,
        is_active=1,
        created__lt=as_datetime(next_day(date_for)),
    )
    for role_users in excluded_user_groups:
        enrollments = enrollments.exclude(user__in=role_users)
    return enrollments.count()
def missing_course_daily_metrics(site, date_for):
    """Return the set of course ids for the site's courses that have no
    CourseDailyMetrics record for the given date

    Only courses whose ``created`` value is earlier than
    ``as_datetime(next_day(date_for))`` are considered. Per the original
    notes, the ids returned are of type CourseLocator.

    We use this to make sure that we are not missing course data when we
    populate the SiteDailyMetrics instance for the given date.
    """
    recorded_metrics = CourseDailyMetrics.objects.filter(site=site,
                                                         date_for=date_for)
    recorded_course_keys = [
        as_course_key(metrics.course_id) for metrics in recorded_metrics
    ]

    courses_as_of_date = figures.sites.get_courses_for_site(site).filter(
        created__lt=as_datetime(next_day(date_for)))
    courses_without_metrics = courses_as_of_date.exclude(
        id__in=recorded_course_keys)
    return set(courses_without_metrics.values_list('id', flat=True))