def create_course_problem_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the course_problem table for a course, one row per problem.

    Each row carries the mean and standard deviation of the raw score, the
    mean percent score, the maximum raw score possible, and the number of
    distinct users who attempted the problem.  The table is computed from
    person_item joined with course_item.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    If building the table fails, the SQL is printed and the original
    exception is re-raised.
    '''
    table_name = "course_problem"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    source_tables = ["%s.course_item" % dataset, "%s.person_item" % dataset]

    # inner query: one row per (problem, user); outer query: aggregate over users
    sql_template = """
                # compute course_problem table for {course_id}
                SELECT problem_nid, problem_id, problem_short_id,
                  avg(problem_grade) as avg_problem_raw_score,
                  stddev(problem_grade) as sdv_problem_raw_score,
                  # max(problem_grade) as max_problem_raw_score,
                  max(possible_raw_score) as max_possible_raw_score,
                  avg(problem_grade / possible_raw_score * 100) as avg_problem_pct_score,
                  count(unique(user_id)) as n_unique_users_attempted,
                  problem_name,
                  is_split,
                  split_name,
                FROM
                (
                    SELECT problem_nid, problem_id, problem_short_id, sum(item_grade) as problem_grade, user_id,
                        sum(CI.item_points_possible) as possible_raw_score, problem_name, is_split, split_name,
                    FROM [{dataset}.person_item] PI
                    JOIN [{dataset}.course_item] CI
                    on PI.item_nid = CI.item_nid
                    group by problem_nid, problem_short_id, problem_id, user_id, problem_name, is_split, split_name
                )
                group by problem_nid, problem_id, problem_short_id, problem_name, is_split, split_name
                # order by problem_short_id
                order by avg_problem_pct_score desc
              """
    query = sql_template.format(dataset=dataset, course_id=course_id)

    try:
        result = bqutil.get_bq_table(dataset, table_name, query,
                                     depends_on=source_tables,
                                     force_query=force_recompute,
                                     startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[make_course_problem_table] ERR! failed in creating %s.%s using this sql:" % (dataset, table_name))
        print(query)
        raise

    # only ask for the row count when the build returned something
    row_count = bqutil.get_bq_table_size_rows(dataset, table_name) if result else 0
    print("--> Done with %s for %s, %d entries found" % (table_name, course_id, row_count))
    sys.stdout.flush()
def createPersonCourseVideo(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the per-user video-watching summary table for a course.

    Each output row gives, for one user, the number of distinct videos
    watched and the fraction of all videos in video_axis that this
    represents, together with the user's viewed / certified / verified
    flags.  Only users whose role is 'Student' or missing are included.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    Returns whatever bqutil.get_bq_table returns for the table.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    output_table = TABLE_PERSON_COURSE_VIDEO_WATCHED

    # inner query yields one row per (user, video); the outer query counts them per user
    query = """
                  SELECT user_id,
                      "{course_id}" as course_id,
                      count(*) n_unique_videos_watched,
                      count(*) / n_total_videos as fract_total_videos_watched,
                      viewed, certified, verified
                  FROM
                  (
                      SELECT PC.user_id as user_id, UV.username as username,
                          video_id,
                          n_views,
                          NV.n_total_videos as n_total_videos,
                          certified,
                          viewed,
                          (mode=="verified") as verified,
                      FROM
                      (
                          SELECT username, video_id, count(*) as n_views
                          FROM [{dataset}.video_stats_day]
                          GROUP BY username, video_id
                      ) UV
                      JOIN [{dataset}.person_course] PC
                      on UV.username = PC.username
                      CROSS JOIN
                      (
                          SELECT count(*) as n_total_videos
                          FROM [{dataset}.video_axis]
                      ) NV
                      WHERE ((PC.roles = 'Student') OR (PC.roles is NULL)) # accommodate case when roles.csv is missing
                      # WHERE PC.roles = 'Student'
                  )
                  GROUP BY user_id, certified, viewed, verified, n_total_videos
                  order by user_id
              """.format(course_id=course_id, dataset=dataset)

    # NOTE(review): newer_than presumably forces a rebuild of stored tables
    # older than this timestamp -- confirm against bqutil.get_bq_table
    rebuild_cutoff = datetime.datetime(2017, 2, 6, 18, 30)
    upstream = ["%s.%s" % (dataset, TABLE_VIDEO_STATS)]

    result = bqutil.get_bq_table(dataset, output_table, query,
                                 force_query=force_recompute,
                                 depends_on=upstream,
                                 newer_than=rebuild_cutoff,
                                 startIndex=-2)

    row_count = bqutil.get_bq_table_size_rows(dataset, output_table) if result else 0
    print("--> Done with %s for %s, %d entries found" % (output_table, course_id, row_count))
    sys.stdout.flush()

    return result
def create_person_problem_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the person_problem table, one row per (user_id, course_id, problem_nid).

    For every user and problem the item grades from person_item are summed
    into a raw score and a percent score (raw score over the summed
    item_points_possible from course_item); the maximum grade, maximum
    number of attempts and latest date are kept as well.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    If building the table fails, the SQL is printed and the original
    exception is re-raised.
    '''
    table_name = "person_problem"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    source_tables = ["%s.course_item" % dataset, "%s.person_item" % dataset]

    # NOTE(review): this query reads course_id and PI.grade; it assumes
    # person_item exposes both columns -- confirm against the person_item builder
    sql_template = """
                # compute person-problem table for {course_id}
                SELECT user_id,
                       course_id,
                       CI.problem_nid as problem_nid,
                       sum(item_grade) as problem_raw_score,
                       sum(item_grade) / sum(CI.item_points_possible) * 100 as problem_pct_score,
                       max(PI.grade) as grade,
                       max(n_attempts) as n_attempts,
                       max(date) as date,
                FROM [{dataset}.person_item] PI
                JOIN [{dataset}.course_item] CI
                on PI.item_nid = CI.item_nid
                group by user_id, course_id, problem_nid
                order by user_id, course_id, problem_nid
              """
    query = sql_template.format(dataset=dataset, course_id=course_id)

    try:
        result = bqutil.get_bq_table(dataset, table_name, query,
                                     depends_on=source_tables,
                                     force_query=force_recompute,
                                     startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[make_person_problem_table] ERR! failed in creating %s.%s using this sql:" % (dataset, table_name))
        print(query)
        raise

    row_count = bqutil.get_bq_table_size_rows(dataset, table_name) if result else 0
    print("--> Done with %s for %s, %d entries found" % (table_name, course_id, row_count))
    sys.stdout.flush()
def create_person_item_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the person_item table, one row per (user_id, item).

    Rows come from problem_analysis joined with course_item on item_id.
    Each row gives the item grade (1 when the item was answered correctly,
    otherwise 0), the number of attempts, and the latest creation date of
    the matching problem_analysis records.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    If building the table fails, the SQL is printed and the original
    exception is re-raised.
    '''
    table_name = "person_item"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    source_tables = ["%s.course_item" % dataset, "%s.problem_analysis" % dataset]

    sql_template = """
                # compute person-item table
                SELECT user_id,
                    # PA.item_id as item_id,
                    CI.item_short_id as item_short_id,
                    CI.item_nid as item_nid,
                    item_grade,
                    n_attempts,
                    date
                FROM
                (
                    SELECT user_id,
                        item.answer_id as item_id,
                        if(item.correct_bool, 1, 0) as item_grade,
                        attempts as n_attempts,
                        max(created) as date,
                    FROM [{dataset}.problem_analysis]
                    group by user_id, item_id, item_grade, n_attempts # force (user_id, item_id) to be unique (it should always be, even w/o this)
                ) PA
                JOIN [{dataset}.course_item] CI
                on PA.item_id = CI.item_id
                order by user_id, CI.content_index, CI.item_number
              """
    # course_id is supplied for symmetry with the sibling builders; the
    # template above has no {course_id} placeholder
    query = sql_template.format(dataset=dataset, course_id=course_id)

    try:
        result = bqutil.get_bq_table(dataset, table_name, query,
                                     depends_on=source_tables,
                                     force_query=force_recompute,
                                     startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[make_person_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, table_name))
        print(query)
        raise

    row_count = bqutil.get_bq_table_size_rows(dataset, table_name) if result else 0
    print("--> Done with %s for %s, %d entries found" % (table_name, course_id, row_count))
    sys.stdout.flush()
def create_person_problem_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the person_problem table, one row per (user_id, course_id, problem_nid).

    For every user and problem the item grades from person_item are summed
    into a raw score and a percent score (raw score over the summed
    item_points_possible from course_item); the maximum grade, maximum
    number of attempts and latest date are kept as well.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    If building the table fails, the SQL is printed and the original
    exception is re-raised.
    '''
    table_name = "person_problem"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    source_tables = ["%s.course_item" % dataset, "%s.person_item" % dataset]

    # NOTE(review): this query reads course_id and PI.grade; it assumes
    # person_item exposes both columns -- confirm against the person_item builder
    sql_template = """
                # compute person-problem table for {course_id}
                SELECT user_id,
                       course_id,
                       CI.problem_nid as problem_nid,
                       sum(item_grade) as problem_raw_score,
                       sum(item_grade) / sum(CI.item_points_possible) * 100 as problem_pct_score,
                       max(PI.grade) as grade,
                       max(n_attempts) as n_attempts,
                       max(date) as date,
                FROM [{dataset}.person_item] PI
                JOIN [{dataset}.course_item] CI
                on PI.item_nid = CI.item_nid
                group by user_id, course_id, problem_nid
                order by user_id, course_id, problem_nid
              """
    query = sql_template.format(dataset=dataset, course_id=course_id)

    try:
        result = bqutil.get_bq_table(dataset, table_name, query,
                                     depends_on=source_tables,
                                     force_query=force_recompute,
                                     startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[make_person_problem_table] ERR! failed in creating %s.%s using this sql:" % (dataset, table_name))
        print(query)
        raise

    row_count = bqutil.get_bq_table_size_rows(dataset, table_name) if result else 0
    print("--> Done with %s for %s, %d entries found" % (table_name, course_id, row_count))
    sys.stdout.flush()
def count_tracking_log_events(self):
    '''
    Loop over all tracking logs up to the cutoff date and sum up the number
    of entries, by doing table info lookups, with no SQL queries.

    For each course in self.course_id_set, the row counts of every
    tracklog_20* table whose date suffix is on or before self.end_date are
    added up.  Results are stored on the instance:

      self.log_event_counts -- dict mapping course_id to its event count
      self.total_events     -- sum over all courses

    Returns None.  Does nothing when self.skip_or_do_step("count_events")
    is negative.  A course with no qualifying tracking log table is
    reported and counted as zero events (previously this raised IndexError
    while formatting the progress message).
    '''
    if self.skip_or_do_step("count_events") < 0:
        return  # skip step

    tlend = self.end_date.replace('-', '')  # end_date normally specified as YYYY-MM-DD

    log_event_counts = {}

    # iterate over each course, one at a time
    for course_id in self.course_id_set:
        log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

        # table ids look like tracklog_YYYYMMDD: characters from index 9 on
        # are the date, which compares correctly as a plain string
        log_tables_todo = sorted(x for x in bqutil.get_list_of_table_ids(log_dataset)
                                 if x.startswith('tracklog_20') and x[9:] <= tlend)

        if log_tables_todo:
            print("[count_tracking_log_events] for course %s using %d tracking log tables, from %s to %s" % (
                course_id, len(log_tables_todo), log_tables_todo[0], log_tables_todo[-1]))
        else:
            # nothing to index into; report it and fall through to a zero count
            print("[count_tracking_log_events] for course %s no tracking log tables found up to %s" % (
                course_id, tlend))
        sys.stdout.flush()

        # go through all log files and get size on each
        log_event_counts[course_id] = sum(bqutil.get_bq_table_size_rows(log_dataset, x)
                                          for x in log_tables_todo)

        print(" For %s found %d total tracking log events" % (course_id, log_event_counts[course_id]))
        sys.stdout.flush()

    self.log_event_counts = log_event_counts
    self.total_events = sum(log_event_counts.values())
    print("--> Total number of events for %s = %d" % (self.org, self.total_events))
def obsolete_process_course(course_id, force_recompute=False, check_dates=True):
    '''
    make person_course_day tables for specified course_id.  This version
    produces one table for each day.  It is inefficient when there are
    many days with very small daily tracking log tables.

    For every tracklog_<date> table in the course's logs dataset, a
    pcday_<date> table is created in the course's pcday dataset.

    course_id       -- course to process
    force_recompute -- rebuild pcday tables even when they already exist
    check_dates     -- when a pcday table already exists, rebuild it anyway
                       if its tracking log table was modified more recently

    Empty tracking log tables are skipped.  A logs dataset whose listing
    has no 'tables' entry is treated as having no tables (previously this
    raised KeyError).
    '''
    # per-user activity counts for one day; time gaps between consecutive
    # events longer than 5 minutes are left out of the *_dt statistics
    PCDAY_SQL = """
    select username,
           "{course_id}" as course_id,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo,
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as avg_dt,
           STDDEV(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as sdv_dt,
           MAX(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as max_dt,
           COUNT(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as n_dt,
           SUM(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as sum_dt
    from
    (SELECT username,
      case when event_type = "play_video" then 1 else 0 end as bvideo,
      case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
      case when username != "" then 1 else 0 end as bevent,
      case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
      case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
      case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
      case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
      case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
      case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
      case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
      time,
      lag(time, 1) over (partition by username order by time) last_time
      FROM [{dataset}.{table_id}]
      WHERE
        NOT event_type contains "/xblock/"
        AND username != ""
    )
    group by course_id, username
    order by sdv_dt desc
    """

    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print("Processing course %s (start %s)" % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    # best effort: a failure here is reported but does not stop processing
    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print("Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    # same output as the two-item Python 2 print statement (items joined by one space)
    print("%s %s" % ("pcday_tables = ", pcday_tables))

    # Tolerate a listing without a 'tables' entry, as done for the pcday
    # listing above (presumably an empty dataset), and order explicitly by
    # table id instead of relying on how Python 2 happens to compare dicts.
    log_table_list = sorted(log_tables.get('tables', []),
                            key=lambda t: t['tableReference']['tableId'])

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue

        date = table_id[9:]  # text after the 'tracklog_' prefix

        table_out = 'pcday_%s' % date

        if (table_out in pcday_tables) and not force_recompute:
            skip = True
            if check_dates:
                table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out)
                log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id)
                if log_table_date > table_out_date:
                    # the log changed after the output was built, so rebuild
                    skip = False
                    print("%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (
                        table_out, table_out_date, log_table_date))
            if skip:
                print("%s...already done, skipping" % table_out)
                sys.stdout.flush()
                continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print("...zero size table %s, skipping" % table_id)
            sys.stdout.flush()
            continue

        # trailing comma: Python 2 print without a newline
        print ("Creating %s " % table_out),

        the_sql = PCDAY_SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id)

        sys.stdout.flush()

        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print("Done with course %s (end %s)" % (course_id, datetime.datetime.now()))
    print("=" * 77)
    sys.stdout.flush()
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRECATED - instead of creating one table per day, because there is so
    little total data, create one enrollday_all table (see other function below).

    make enrollday2_* tables for specified course_id

    For every tracklog_<date> table in the course's logs dataset, an
    enrollday2_<date> table is created in the course's pcday dataset.  Each
    row is one enrollment activation (+1) or deactivation (-1) event, split
    into honor / verified / audit columns.

    course_id       -- course to process
    force_recompute -- rebuild output tables even when they already exist

    Empty tracking log tables are skipped.  A logs dataset whose listing
    has no 'tables' entry is treated as having no tables (previously this
    raised KeyError).
    '''
    SQL = """
            SELECT
                "{course_id}" as course_id,
                time,
                event_struct.user_id as user_id,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "honor")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "honor")
                      then -1
                      else 0 end) as diff_enrollment_honor,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "verified")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "verified")
                      then -1
                      else 0 end) as diff_enrollment_verified,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "audit")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "audit")
                      then -1
                      else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}]
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print("Processing course %s (start %s)" % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    # best effort: a failure here is reported but does not stop processing
    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print("Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    # Tolerate a listing without a 'tables' entry, as done for the pcday
    # listing above (presumably an empty dataset), and order explicitly by
    # table id instead of relying on how Python 2 happens to compare dicts.
    log_table_list = sorted(log_tables.get('tables', []),
                            key=lambda t: t['tableReference']['tableId'])

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue

        date = table_id[9:]  # text after the 'tracklog_' prefix

        table_out = 'enrollday2_%s' % date

        if (table_out in pcday_tables) and not force_recompute:
            print("%s...already done, skipping" % table_id)
            sys.stdout.flush()
            continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print("...zero size table %s, skipping" % table_id)
            sys.stdout.flush()
            continue

        # trailing comma: Python 2 print without a newline
        print ("Creating %s " % table_out),

        the_sql = SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id)

        sys.stdout.flush()

        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print("Done with course %s (end %s)" % (course_id, datetime.datetime.now()))
    print("=" * 77)
    sys.stdout.flush()
def make_irt_report(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the item_response_theory_report table for a course.

    The report joins the IRT parameters (difficulty, discrimination and
    their standard errors) with course_item and course_problem.  The IRT
    parameters come from whichever of the item_irt_grm / item_irt_grm_R
    tables has the larger lastModifiedTime.  When the item_reliabilities
    table exists, its item-test, item-rest and alpha values are included;
    otherwise those three columns are null.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    Raises Exception when none of the IRT tables can be used.  If building
    the report fails, the SQL is printed and the original exception is
    re-raised.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    tablename = "item_response_theory_report"
    RELIABILITIES_TABLE = "item_reliabilities"
    # candidate IRT tables, mapped to the label written to the irt_method column
    IRT_TABLES = OrderedDict([("item_irt_grm", "STATA GRM"),
                              ("item_irt_grm_R", "R mirt GRM"),
                              ])

    the_sql_alpha = """
        IR.itemtestcorr as item_test,
        IR.itemrestcorr as item_rest,
        IR.alpha as alpha,
    """

    the_sql_no_alpha = """
        null as item_test,
        null as item_rest,
        null as alpha,
    """

    the_sql_alpha_join = """
        JOIN [{dataset}.{reliabilities}] IR
        on IR.item = CP.problem_yid
    """.format(dataset=dataset, reliabilities=RELIABILITIES_TABLE)

    the_sql = """
# item_response_theory_report for {course_id}
#
# problem_nid,problem_short_id,chapter,assignment_type,problem_label,problem_id,IRT item number,avg_problem_raw_score,avg_problem_pct_score,
# n_unique_users_attempted,item_test,item_rest,alpha ,Discrimination,Difficulty

SELECT
    "{course_id}" as course_id,
    IG.problem_nid as problem_nid,
    CP.problem_short_id as problem_short_id,
    CI.chapter_name as chapter,
    assignment_type,
    CONCAT("[", STRING(IG.problem_nid), "] ", CI.chapter_name, " / ", CI.section_name, " / ", CP.problem_name) as problem_label,
    CP.problem_id as problem_id,
    CONCAT(STRING(CP.problem_nid), "/", STRING(cutnum)) as IRT_item_number,
    CP.avg_problem_raw_score avg_problem_raw_score,
    CP.avg_problem_pct_score avg_problem_pct_score,
    CP.n_unique_users_attempted n_unique_users_attempted,
    {sql_alpha}
    irt_diff as Difficulty,
    irt_disc as Discrimination,
    diff_se as Difficulty_SE,
    disc_se as Discrimination_SE,
    "{irt_method}" as irt_method,

FROM [{dataset}.{item_irt_grm}] IG
JOIN [{dataset}.course_item] CI
on IG.problem_nid = CI.problem_nid
JOIN
(
    SELECT *, CONCAT("y", STRING(problem_nid)) as problem_yid,
    FROM [{dataset}.course_problem]
) CP
on IG.problem_nid = CP.problem_nid
{sql_alpha_join}
where CI.item_number = 1
    """

    irt_table_to_use = None
    irt_table_date = None

    # use newest of the existing IRT tables
    for irt_tablename in IRT_TABLES:
        try:
            tinfo = bqutil.get_bq_table_info(dataset, irt_tablename)
            if tinfo is None:
                # table does not exist; explicit check rather than an
                # assert, which disappears under "python -O"
                continue
            lmt = tinfo.get('lastModifiedTime')
            if lmt and ((not irt_table_date) or lmt > irt_table_date):
                irt_table_date = lmt
                irt_table_to_use = irt_tablename
            else:
                print("[make_irt_report] Not using IRT table %s (date %s) - older than %s (date %s)" % (
                    irt_tablename, lmt, irt_table_to_use, irt_table_date))
        except Exception:
            # deliberate best effort: a failed lookup only means this
            # candidate table is unavailable
            continue

    if not irt_table_to_use:
        raise Exception("[make_irt_report] Cannot generate IRT report; requires one of %s" % (','.join(IRT_TABLES.keys())))

    # SQL changes depending on whether item_reliabilities exists or not
    try:
        have_reliabilities = bqutil.get_bq_table_info(dataset, RELIABILITIES_TABLE) is not None
    except Exception:
        # a failed lookup is treated the same as a missing table
        have_reliabilities = False

    if have_reliabilities:
        sql_alpha = {'sql_alpha': the_sql_alpha, "sql_alpha_join": the_sql_alpha_join}
    else:
        sql_alpha = {'sql_alpha': the_sql_no_alpha, "sql_alpha_join": ""}

    the_sql = the_sql.format(dataset=dataset, course_id=course_id, item_irt_grm=irt_table_to_use,
                             irt_method=IRT_TABLES[irt_table_to_use],
                             **sql_alpha)

    depends_on = ["%s.course_item" % dataset,
                  "%s.course_problem" % dataset,
                  "%s.%s" % (dataset, irt_table_to_use),
                  ]

    if have_reliabilities:
        depends_on.append("%s.%s" % (dataset, RELIABILITIES_TABLE))

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    newer_than=datetime.datetime(2016, 9, 27, 14, 48),
                                    startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[make_irt_report] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename))
        print(the_sql)
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print("--> Done with %s for %s, %d problem items found" % (tablename, course_id, nfound))
    sys.stdout.flush()
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRECATED - instead of creating one table per day, because there is so
    little total data, create one enrollday_all table (see other function below).

    make enrollday2_* tables for specified course_id

    For every tracklog_<date> table in the course's logs dataset, an
    enrollday2_<date> table is created in the course's pcday dataset.  Each
    row is one enrollment activation (+1) or deactivation (-1) event, split
    into honor / verified / audit columns.

    course_id       -- course to process
    force_recompute -- rebuild output tables even when they already exist

    Empty tracking log tables are skipped.  A logs dataset whose listing
    has no 'tables' entry is treated as having no tables (previously this
    raised KeyError).
    '''
    SQL = """
            SELECT
                "{course_id}" as course_id,
                time,
                event_struct.user_id as user_id,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "honor")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "honor")
                      then -1
                      else 0 end) as diff_enrollment_honor,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "verified")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "verified")
                      then -1
                      else 0 end) as diff_enrollment_verified,
                (case when (event_type = "edx.course.enrollment.activated"
                            and event_struct.mode = "audit")
                      then 1
                      when (event_type = "edx.course.enrollment.deactivated"
                            and event_struct.mode = "audit")
                      then -1
                      else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}]
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print("Processing course %s (start %s)" % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    # best effort: a failure here is reported but does not stop processing
    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print("Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    # Tolerate a listing without a 'tables' entry, as done for the pcday
    # listing above (presumably an empty dataset), and order explicitly by
    # table id instead of relying on how Python 2 happens to compare dicts.
    log_table_list = sorted(log_tables.get('tables', []),
                            key=lambda t: t['tableReference']['tableId'])

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue

        date = table_id[9:]  # text after the 'tracklog_' prefix

        table_out = 'enrollday2_%s' % date

        if (table_out in pcday_tables) and not force_recompute:
            print("%s...already done, skipping" % table_id)
            sys.stdout.flush()
            continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print("...zero size table %s, skipping" % table_id)
            sys.stdout.flush()
            continue

        # trailing comma: Python 2 print without a newline
        print ("Creating %s " % table_out),

        the_sql = SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id)

        sys.stdout.flush()

        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print("Done with course %s (end %s)" % (course_id, datetime.datetime.now()))
    print("=" * 77)
    sys.stdout.flush()
def make_irt_report(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the item_response_theory_report table for a course.

    The report joins the IRT parameters (difficulty, discrimination and
    their standard errors) with course_item and course_problem.  The IRT
    parameters come from whichever of the item_irt_grm / item_irt_grm_R
    tables has the larger lastModifiedTime.  When the item_reliabilities
    table exists, its item-test, item-rest and alpha values are included;
    otherwise those three columns are null.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    Raises Exception when none of the IRT tables can be used.  If building
    the report fails, the SQL is printed and the original exception is
    re-raised.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    tablename = "item_response_theory_report"
    RELIABILITIES_TABLE = "item_reliabilities"
    # candidate IRT tables, mapped to the label written to the irt_method column
    IRT_TABLES = OrderedDict([("item_irt_grm", "STATA GRM"),
                              ("item_irt_grm_R", "R mirt GRM"),
                              ])

    the_sql_alpha = """
        IR.itemtestcorr as item_test,
        IR.itemrestcorr as item_rest,
        IR.alpha as alpha,
    """

    the_sql_no_alpha = """
        null as item_test,
        null as item_rest,
        null as alpha,
    """

    the_sql_alpha_join = """
        JOIN [{dataset}.{reliabilities}] IR
        on IR.item = CP.problem_yid
    """.format(dataset=dataset, reliabilities=RELIABILITIES_TABLE)

    the_sql = """
# item_response_theory_report for {course_id}
#
# problem_nid,problem_short_id,chapter,assignment_type,problem_label,problem_id,IRT item number,avg_problem_raw_score,avg_problem_pct_score,
# n_unique_users_attempted,item_test,item_rest,alpha ,Discrimination,Difficulty

SELECT
    "{course_id}" as course_id,
    IG.problem_nid as problem_nid,
    CP.problem_short_id as problem_short_id,
    CI.chapter_name as chapter,
    assignment_type,
    CONCAT("[", STRING(IG.problem_nid), "] ", CI.chapter_name, " / ", CI.section_name, " / ", CP.problem_name) as problem_label,
    CP.problem_id as problem_id,
    CONCAT(STRING(CP.problem_nid), "/", STRING(cutnum)) as IRT_item_number,
    CP.avg_problem_raw_score avg_problem_raw_score,
    CP.avg_problem_pct_score avg_problem_pct_score,
    CP.n_unique_users_attempted n_unique_users_attempted,
    {sql_alpha}
    irt_diff as Difficulty,
    irt_disc as Discrimination,
    diff_se as Difficulty_SE,
    disc_se as Discrimination_SE,
    "{irt_method}" as irt_method,

FROM [{dataset}.{item_irt_grm}] IG
JOIN [{dataset}.course_item] CI
on IG.problem_nid = CI.problem_nid
JOIN
(
    SELECT *, CONCAT("y", STRING(problem_nid)) as problem_yid,
    FROM [{dataset}.course_problem]
) CP
on IG.problem_nid = CP.problem_nid
{sql_alpha_join}
where CI.item_number = 1
    """

    irt_table_to_use = None
    irt_table_date = None

    # use newest of the existing IRT tables
    for irt_tablename in IRT_TABLES:
        try:
            tinfo = bqutil.get_bq_table_info(dataset, irt_tablename)
            if tinfo is None:
                # table does not exist; explicit check rather than an
                # assert, which disappears under "python -O"
                continue
            lmt = tinfo.get('lastModifiedTime')
            if lmt and ((not irt_table_date) or lmt > irt_table_date):
                irt_table_date = lmt
                irt_table_to_use = irt_tablename
            else:
                print("[make_irt_report] Not using IRT table %s (date %s) - older than %s (date %s)" % (
                    irt_tablename, lmt, irt_table_to_use, irt_table_date))
        except Exception:
            # deliberate best effort: a failed lookup only means this
            # candidate table is unavailable
            continue

    if not irt_table_to_use:
        raise Exception("[make_irt_report] Cannot generate IRT report; requires one of %s" % (','.join(IRT_TABLES.keys())))

    # SQL changes depending on whether item_reliabilities exists or not
    try:
        have_reliabilities = bqutil.get_bq_table_info(dataset, RELIABILITIES_TABLE) is not None
    except Exception:
        # a failed lookup is treated the same as a missing table
        have_reliabilities = False

    if have_reliabilities:
        sql_alpha = {'sql_alpha': the_sql_alpha, "sql_alpha_join": the_sql_alpha_join}
    else:
        sql_alpha = {'sql_alpha': the_sql_no_alpha, "sql_alpha_join": ""}

    the_sql = the_sql.format(dataset=dataset, course_id=course_id, item_irt_grm=irt_table_to_use,
                             irt_method=IRT_TABLES[irt_table_to_use],
                             **sql_alpha)

    depends_on = ["%s.course_item" % dataset,
                  "%s.course_problem" % dataset,
                  "%s.%s" % (dataset, irt_table_to_use),
                  ]

    if have_reliabilities:
        depends_on.append("%s.%s" % (dataset, RELIABILITIES_TABLE))

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    newer_than=datetime.datetime(2016, 9, 27, 14, 48),
                                    startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[make_irt_report] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename))
        print(the_sql)
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print("--> Done with %s for %s, %d problem items found" % (tablename, course_id, nfound))
    sys.stdout.flush()
def createPersonCourseVideo(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the per-user video-watching summary table for a course.

    Each output row gives, for one user, the number of distinct videos
    watched and the fraction of all videos in video_axis that this
    represents, together with the user's viewed / certified / verified
    flags.  Only users whose role is 'Student' or missing are included.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    Returns whatever bqutil.get_bq_table returns for the table.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    output_table = TABLE_PERSON_COURSE_VIDEO_WATCHED

    # inner query yields one row per (user, video); the outer query counts them per user
    query = """
                  SELECT user_id,
                      "{course_id}" as course_id,
                      count(*) n_unique_videos_watched,
                      count(*) / n_total_videos as fract_total_videos_watched,
                      viewed, certified, verified
                  FROM
                  (
                      SELECT PC.user_id as user_id, UV.username as username,
                          video_id,
                          n_views,
                          NV.n_total_videos as n_total_videos,
                          certified,
                          viewed,
                          (mode=="verified") as verified,
                      FROM
                      (
                          SELECT username, video_id, count(*) as n_views
                          FROM [{dataset}.video_stats_day]
                          GROUP BY username, video_id
                      ) UV
                      JOIN [{dataset}.person_course] PC
                      on UV.username = PC.username
                      CROSS JOIN
                      (
                          SELECT count(*) as n_total_videos
                          FROM [{dataset}.video_axis]
                      ) NV
                      WHERE ((PC.roles = 'Student') OR (PC.roles is NULL)) # accommodate case when roles.csv is missing
                      # WHERE PC.roles = 'Student'
                  )
                  GROUP BY user_id, certified, viewed, verified, n_total_videos
                  order by user_id
              """.format(course_id=course_id, dataset=dataset)

    # NOTE(review): newer_than presumably forces a rebuild of stored tables
    # older than this timestamp -- confirm against bqutil.get_bq_table
    rebuild_cutoff = datetime.datetime(2017, 2, 6, 18, 30)
    upstream = ["%s.%s" % (dataset, TABLE_VIDEO_STATS)]

    result = bqutil.get_bq_table(dataset, output_table, query,
                                 force_query=force_recompute,
                                 depends_on=upstream,
                                 newer_than=rebuild_cutoff,
                                 startIndex=-2)

    row_count = bqutil.get_bq_table_size_rows(dataset, output_table) if result else 0
    print("--> Done with %s for %s, %d entries found" % (output_table, course_id, row_count))
    sys.stdout.flush()

    return result
def create_problem_first_attempt_correct_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the problem_first_attempt_correct table, one row per problem_nid.

    For each problem the table reports, separately for users who explored,
    completed, and certified: how many of them attempted the problem, how
    many have a single attempt with a 100 percent score, and the resulting
    percentage.  Input comes from person_problem joined with person_course
    on user_id; users with none of the three flags set are left out.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    If building the table fails, the SQL is printed and the original
    exception is re-raised.
    '''
    table_name = "problem_first_attempt_correct"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    source_tables = ["%s.person_problem" % dataset,
                     "%s.person_course" % dataset,
                     ]

    # inner query counts per problem; outer query turns the counts into percentages
    sql_template = """
                # compute problem_first_attempt_correct table for {course_id}
                SELECT problem_nid,
                    n_first_attempt_correct_by_certified,
                    n_certified_users_attempted,
                    n_first_attempt_correct_by_certified / n_certified_users_attempted * 100 as pct_correct_first_attempt_by_certified,
                    n_first_attempt_correct_by_completed,
                    n_completed_users_attempted,
                    n_first_attempt_correct_by_completed / n_completed_users_attempted * 100 as pct_correct_first_attempt_by_completed,
                    n_first_attempt_correct_by_explored,
                    n_explored_users_attempted,
                    n_first_attempt_correct_by_explored / n_explored_users_attempted * 100 as pct_correct_first_attempt_by_explored,
                FROM
                (
                    SELECT PP.problem_nid as problem_nid,
                        sum(case when PC.certified and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_certified,
                        sum(case when PC.completed and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_completed,
                        sum(case when PC.explored and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_explored,
                        count(case when PC.certified then PP.user_id else null end) as n_certified_users_attempted,
                        count(case when PC.completed then PP.user_id else null end) as n_completed_users_attempted,
                        count(case when PC.explored then PP.user_id else null end) as n_explored_users_attempted,
                    FROM [{dataset}.person_problem] PP
                    JOIN [{dataset}.person_course] PC
                    on PP.user_id = PC.user_id
                    WHERE PC.certified or PC.completed or PC.explored
                    group by problem_nid
                    order by problem_nid
                )
              """
    query = sql_template.format(dataset=dataset, course_id=course_id)

    try:
        result = bqutil.get_bq_table(dataset, table_name, query,
                                     depends_on=source_tables,
                                     force_query=force_recompute,
                                     startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[create_problem_first_attempt_correct_table] ERR! failed in creating %s.%s using this sql:" % (dataset, table_name))
        print(query)
        raise

    row_count = bqutil.get_bq_table_size_rows(dataset, table_name) if result else 0
    print("--> Done with %s for %s, %d entries found" % (table_name, course_id, row_count))
    sys.stdout.flush()
def create_problem_first_attempt_correct_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the problem_first_attempt_correct table, one row per problem_nid.

    For each problem the table reports, separately for users who explored,
    completed, and certified: how many of them attempted the problem, how
    many have a single attempt with a 100 percent score, and the resulting
    percentage.  Input comes from person_problem joined with person_course
    on user_id; users with none of the three flags set are left out.

    course_id          -- course whose dataset is read and written
    force_recompute    -- forwarded to bqutil.get_bq_table as force_query
    use_dataset_latest -- forwarded to bqutil.course_id2dataset

    If building the table fails, the SQL is printed and the original
    exception is re-raised.
    '''
    table_name = "problem_first_attempt_correct"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    source_tables = ["%s.person_problem" % dataset,
                     "%s.person_course" % dataset,
                     ]

    # inner query counts per problem; outer query turns the counts into percentages
    sql_template = """
                # compute problem_first_attempt_correct table for {course_id}
                SELECT problem_nid,
                    n_first_attempt_correct_by_certified,
                    n_certified_users_attempted,
                    n_first_attempt_correct_by_certified / n_certified_users_attempted * 100 as pct_correct_first_attempt_by_certified,
                    n_first_attempt_correct_by_completed,
                    n_completed_users_attempted,
                    n_first_attempt_correct_by_completed / n_completed_users_attempted * 100 as pct_correct_first_attempt_by_completed,
                    n_first_attempt_correct_by_explored,
                    n_explored_users_attempted,
                    n_first_attempt_correct_by_explored / n_explored_users_attempted * 100 as pct_correct_first_attempt_by_explored,
                FROM
                (
                    SELECT PP.problem_nid as problem_nid,
                        sum(case when PC.certified and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_certified,
                        sum(case when PC.completed and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_completed,
                        sum(case when PC.explored and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_explored,
                        count(case when PC.certified then PP.user_id else null end) as n_certified_users_attempted,
                        count(case when PC.completed then PP.user_id else null end) as n_completed_users_attempted,
                        count(case when PC.explored then PP.user_id else null end) as n_explored_users_attempted,
                    FROM [{dataset}.person_problem] PP
                    JOIN [{dataset}.person_course] PC
                    on PP.user_id = PC.user_id
                    WHERE PC.certified or PC.completed or PC.explored
                    group by problem_nid
                    order by problem_nid
                )
              """
    query = sql_template.format(dataset=dataset, course_id=course_id)

    try:
        result = bqutil.get_bq_table(dataset, table_name, query,
                                     depends_on=source_tables,
                                     force_query=force_recompute,
                                     startIndex=-2)
    except Exception:
        # show the query that failed, then let the caller see the real error
        print("[create_problem_first_attempt_correct_table] ERR! failed in creating %s.%s using this sql:" % (dataset, table_name))
        print(query)
        raise

    row_count = bqutil.get_bq_table_size_rows(dataset, table_name) if result else 0
    print("--> Done with %s for %s, %d entries found" % (table_name, course_id, row_count))
    sys.stdout.flush()