def createVideoStats_day( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None): ''' Create video statistics per day: "viewed" counts users who had a video position > 0, and "watched" counts users who had a video position > 95% of the total video length. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) logs = bqutil.course_id2dataset(course_id, dtype='logs') table = TABLE_VIDEO_STATS_PER_DAY the_sql = """ SELECT date(time) as date, username, #module_id as video_id, #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'([-])' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position, FROM {DATASETS} WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and event is not null group by username, video_id, date order by date """ try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_STATS_PER_DAY ) assert tinfo is not None, "[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id) print "[analyze_videos] Appending latest data to %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id) sys.stdout.flush() except (AssertionError, Exception) as err: print str(err) sys.stdout.flush() print " --> Missing %s.%s? Attempting to create..." % ( dataset, TABLE_VIDEO_STATS_PER_DAY ) sys.stdout.flush() pass print "=== Processing Video Stats Per Day for %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() def gdf(row): return datetime.datetime.strptime(row['date'], '%Y-%m-%d') process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, get_date_function=gdf, skip_last_day=skip_last_day) print "Done with Video Stats Per Day for %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
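# A minimal, hypothetical sketch (not part of the pipeline) of the viewed/watched
# classification these per-day positions feed into: "viewed" means the user's maximum
# position was > 0, and "watched" means it passed 95% of the video length, matching the
# thresholds described in the docstring above and applied downstream in createVideoStats.
def classify_video_position(position, video_length):
    viewed = position > 0
    watched = (video_length is not None) and (position > video_length * 0.95)
    return viewed, watched

print classify_video_position(12.0, 600.0)   # (True, False) -- started, but well short of 95%
print classify_video_position(595.0, 600.0)  # (True, True)  -- past the 95% mark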
def select_problem_answer_histories(self, course_id, url_name): ''' Compute table of answers ever submitted for a given problem, as specified by a module_id. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) org, num, semester = course_id.split('/') module_id = '%s/%s/problem/%s' % (org, num, url_name) sql = """ SELECT '{course_id}' as course_id, username, time, student_answers, attempts, success, grade, FROM [{dataset}.problem_check] WHERE module_id = "{module_id}" order by time """.format(dataset=dataset, course_id=course_id, module_id=module_id) table = 'problem_check_for_%s' % (url_name.replace(':','__').replace('-','_')) key = None return self.cached_get_bq_table(dataset, table, sql=sql, key=key, depends_on=['%s.problem_check' % dataset])
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False): """ Upload grades_persistent csv.gz to Google Storage, create the BigQuery table, then insert the data into the table. :param cid: the course id :param basedir: the base directory path :param datedir: the date directory name (represented as YYYY-MM-DD) :param use_dataset_latest: should the most recent dataset be used? :param subsection: should grades_persistentsubsection be uploaded? :type cid: str :type basedir: str :type datedir: str :type use_dataset_latest: bool :type subsection: bool """ gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest)) if subsection: csv_name = "grades_persistentsubsectiongrade.csv.gz" temp_name = "grades_persistentsubsectiongrade_temp.csv.gz" table = "grades_persistent_subsection" else: csv_name = "grades_persistentcoursegrade.csv.gz" temp_name = "grades_persistentcoursegrade_temp.csv.gz" table = "grades_persistent" sdir = load_course_sql.find_course_sql_dir(cid, basedir=basedir, datedir=datedir, use_dataset_latest=(use_dataset_latest), ) csvfn = sdir / csv_name tempfn = sdir / temp_name mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table] if not os.path.exists(csvfn): print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn return if not subsection: cleanup_rows_from_grade_persistent(csvfn, tempfn) else: cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted") gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True) dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) # create dataset if not already existent bqutil.load_data_to_table(dataset, table, gsdir / csv_name, the_schema, format="csv", skiprows=1)
def make_enrollment_verified_events_per_user(course_id, force_recompute=False, use_dataset_latest=False, end_date=None): dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_PERSON_ENROLLMENT_VERIFIED SQL = """ SELECT "{course_id}" as course_id, user_id, min(TIMESTAMP(time)) as verified_enroll_time, case when max(TIMESTAMP_TO_SEC(time)) == min(TIMESTAMP_TO_SEC(time)) then null else max(TIMESTAMP(time)) end as verified_unenroll_time, FROM [{dataset}.{enrollment_events}] WHERE ((mode = 'verified' and deactivated) or # Unenrolled (mode='verified' and not activated and mode_changed) # Enrolled ) GROUP BY course_id, user_id ORDER BY verified_enroll_time asc """.format(dataset=dataset, course_id=course_id, enrollment_events=TABLE_ENROLLMENT_EVENTS) bqdat = bqutil.get_bq_table( dataset, table, SQL, force_query=force_recompute, depends_on=["%s.%s" % (dataset, TABLE_ENROLLMENT_EVENTS)], )
def createVideoAxis(course_id, force_recompute=False, use_dataset_latest=False): ''' Video axis depends on the current course axis, and looks for entries whose category field is defined as video. In addition, the edX video id is extracted (with the full path stripped, to generalize tracking log searches for video ids, since some courses contained the full path beginning with i4x while others only had the edX video id), along with the youtube id and the chapter name / index for that respective video. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_VIDEO_AXIS # Get Video results the_sql = """ SELECT chapters.index as index_chapter, videos.index as index_video, videos.category as category, videos.course_id as course_id, videos.name as name, videos.vid_id as video_id, videos.yt_id as youtube_id, chapters.name as chapter_name FROM ( SELECT index, category, course_id, name, chapter_mid, #REGEXP_REPLACE(module_id, '[.]', '_') as vid_id, # vid id containing full path REGEXP_EXTRACT(REGEXP_REPLACE(module_id, '[.]', '_'), r'(?:.*\/)(.*)') as vid_id, # Only containing video id REGEXP_EXTRACT(data.ytid, r'\:(.*)') as yt_id, FROM [{dataset}.course_axis] WHERE category = "video") as videos LEFT JOIN ( SELECT name, module_id, index FROM [{dataset}.course_axis] ) as chapters ON videos.chapter_mid = chapters.module_id ORDER BY videos.index asc """.format(dataset=dataset) print "[analyze_videos] Creating %s.%s table for %s" % ( dataset, TABLE_VIDEO_AXIS, course_id) sys.stdout.flush() try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS) assert tinfo is not None, "[analyze videos] %s table depends on %s, which does not exist" % ( TABLE_VIDEO_AXIS, TABLE_COURSE_AXIS) except (AssertionError, Exception) as err: print " --> Err: missing %s.%s? Skipping creation of %s" % ( dataset, TABLE_COURSE_AXIS, TABLE_VIDEO_AXIS) sys.stdout.flush() return bqdat = bqutil.get_bq_table( dataset, table, the_sql, force_query=force_recompute, depends_on=["%s.course_axis" % (dataset)], ) return bqdat
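# Simplified Python sketch (the sample module_id is made up) of the vid_id extraction
# done in the SQL above: dots are replaced with underscores, then only the component
# after the last '/' is kept, so full i4x paths and bare video ids both reduce to the
# same short id used when searching tracking logs.
import re

def extract_vid_id(module_id):
    cleaned = re.sub(r'[.]', '_', module_id)
    m = re.search(r'(?:.*/)(.*)', cleaned)
    return m.group(1) if m else cleaned

print extract_vid_id('i4x://MITx/6.002x/video/Welcome_Video')  # -> 'Welcome_Video'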
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False): """Upload grades_persistent csv.gz to Google Storage, create the BigQuery table, then insert the data into the table :param cid: the course id :param basedir: the base directory path :param datedir: the date directory name (represented as YYYY-MM-DD) :param use_dataset_latest: should the most recent dataset be used? :param subsection: should grades_persistentsubsection be uploaded? :type cid: str :type basedir: str :type datedir: str :type use_dataset_latest: bool :type subsection: bool """ gsdir = path( gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest)) if subsection: csv_name = "grades_persistentsubsectiongrade.csv.gz" temp_name = "grades_persistentsubsectiongrade_temp.csv.gz" table = "grades_persistent_subsection" else: csv_name = "grades_persistentcoursegrade.csv.gz" temp_name = "grades_persistentcoursegrade_temp.csv.gz" table = "grades_persistent" csvfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, csv_name) tempfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, temp_name) mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads( open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table] if not subsection: remove_nulls_from_grade_persistent(csvfn, tempfn) gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True) dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent( dataset) # create dataset if not already existent bqutil.load_data_to_table(dataset, table, gsdir / csv_name, the_schema, format="csv", skiprows=1)
def createPersonCourseVideo( course_id, force_recompute=False, use_dataset_latest=False ): ''' Create the person_course_video_watched table, based on video_stats. Each row gives the number of unique videos watched by a given user, for the given course. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_PERSON_COURSE_VIDEO_WATCHED the_sql = """ SELECT user_id, "{course_id}" as course_id, count(*) n_unique_videos_watched, count(*) / n_total_videos as fract_total_videos_watched, viewed, certified, verified FROM ( SELECT PC.user_id as user_id, UV.username as username, video_id, n_views, NV.n_total_videos as n_total_videos, certified, viewed, (mode=="verified") as verified, FROM ( SELECT username, video_id, count(*) as n_views FROM [{dataset}.video_stats_day] GROUP BY username, video_id ) UV JOIN [{dataset}.person_course] PC on UV.username = PC.username CROSS JOIN ( SELECT count(*) as n_total_videos FROM [{dataset}.video_axis] ) NV WHERE ((PC.roles = 'Student') OR (PC.roles is NULL)) # accommodate case when roles.csv is missing # WHERE PC.roles = 'Student' ) GROUP BY user_id, certified, viewed, verified, n_total_videos order by user_id """ the_sql = the_sql.format(course_id=course_id, dataset=dataset) bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute, depends_on=["%s.%s" % (dataset, TABLE_VIDEO_STATS)], newer_than=datetime.datetime( 2017, 2, 6, 18, 30 ), startIndex=-2) if not bqdat: nfound = 0 else: nfound = bqutil.get_bq_table_size_rows(dataset, table) print "--> Done with %s for %s, %d entries found" % (table, course_id, nfound) sys.stdout.flush() return bqdat
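# Worked example with made-up numbers for the fraction computed above: a learner who
# watched 12 distinct videos in a course whose video_axis lists 40 videos gets
# fract_total_videos_watched = 12 / 40.0 = 0.3.
n_unique_videos_watched = 12
n_total_videos = 40
print n_unique_videos_watched / float(n_total_videos)  # 0.3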
def create_course_problem_table(course_id, force_recompute=False, use_dataset_latest=False): ''' Generate course_problem table, with one row per (problem_id), giving average points, standard deviation on points, number of unique users attempted, max points possible. Uses person_item and course_item. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) tablename = "course_problem" the_sql = """ # compute course_problem table for {course_id} SELECT problem_nid, problem_id, problem_short_id, avg(problem_grade) as avg_problem_raw_score, stddev(problem_grade) as sdv_problem_raw_score, # max(problem_grade) as max_problem_raw_score, max(possible_raw_score) as max_possible_raw_score, avg(problem_grade / possible_raw_score * 100) as avg_problem_pct_score, count(unique(user_id)) as n_unique_users_attempted, problem_name, is_split, split_name, FROM ( SELECT problem_nid, problem_id, problem_short_id, sum(item_grade) as problem_grade, user_id, sum(CI.item_points_possible) as possible_raw_score, problem_name, is_split, split_name, FROM [{dataset}.person_item] PI JOIN [{dataset}.course_item] CI on PI.item_nid = CI.item_nid group by problem_nid, problem_short_id, problem_id, user_id, problem_name, is_split, split_name ) group by problem_nid, problem_id, problem_short_id, problem_name, is_split, split_name # order by problem_short_id order by avg_problem_pct_score desc """.format(dataset=dataset, course_id=course_id) depends_on = [ "%s.course_item" % dataset, "%s.person_item" % dataset ] try: bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, depends_on=depends_on, force_query=force_recompute, startIndex=-2) except Exception as err: print "[make_course_problem_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename) print the_sql raise if not bqdat: nfound = 0 else: nfound = bqutil.get_bq_table_size_rows(dataset, tablename) print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound) sys.stdout.flush()
def reset_enrollment_by_day(self, course_id): dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) # where to store result table = self.add_collection_name_prefix('stats_enrollment_by_day') logging.info('[reset enrollment by day] removing table %s.%s...' % (dataset, table)) memset = '%s.%s' % (dataset,table) mem.delete(memset) try: bqutil.delete_bq_table(dataset, table) except Exception as err: logging.error(err)
def create_person_problem_table(course_id, force_recompute=False, use_dataset_latest=False): ''' Generate person_problem table, with one row per (user_id, problem_id), giving problem raw_score earned, attempts, and datestamp. Computed by aggregating over person_item, and joining with course_item ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) tablename = "person_problem" the_sql = """ # compute person-problem table for {course_id} SELECT user_id, course_id, CI.problem_nid as problem_nid, sum(item_grade) as problem_raw_score, sum(item_grade) / sum(CI.item_points_possible) * 100 as problem_pct_score, max(PI.grade) as grade, max(n_attempts) as n_attempts, max(date) as date, FROM [{dataset}.person_item] PI JOIN [{dataset}.course_item] CI on PI.item_nid = CI.item_nid group by user_id, course_id, problem_nid order by user_id, course_id, problem_nid """.format(dataset=dataset, course_id=course_id) depends_on = ["%s.course_item" % dataset, "%s.person_item" % dataset] try: bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, depends_on=depends_on, force_query=force_recompute, startIndex=-2) except Exception as err: print "[make_person_problem_table] ERR! failed in creating %s.%s using this sql:" % ( dataset, tablename) print the_sql raise if not bqdat: nfound = 0 else: nfound = bqutil.get_bq_table_size_rows(dataset, tablename) print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound) sys.stdout.flush()
def OLD_compute_activity_by_day(self, course_id, start="2012-08-20", end="2015-01-01"): ''' Compute course activity by day, based on *_pcday tables (DEPRECATED) ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) input_dataset = bqutil.course_id2dataset(course_id, 'pcday') sql = """ SELECT date(last_event) as date, sum(nevents) as nevents, sum(nvideo) as nvideo, sum(nshow_answer) as nshow_answer, sum(nproblem_check) as nproblem_check, sum(nforum) as nforum, sum(ntranscript) as ntranscript, sum(nseq_goto) as nseq_goto, sum(nseek_video) as nseek_video, sum(nprogcheck) as nprogcheck, sum(npause_video) as npause_video, sum(sum_dt) as sum_dt, avg(avg_dt) as avg_dt, sum(n_dt) as n_dt, FROM (TABLE_DATE_RANGE([{dataset}.pcday_], TIMESTAMP('{start}'), TIMESTAMP('{end}'))) group by date order by date """.format(dataset=input_dataset, course_id=course_id, start=start, end=end) pcday_tables = bqutil.get_list_of_table_ids(input_dataset) last_pcday = None for k in pcday_tables: if k.startswith('pcday_'): if last_pcday is None or k > last_pcday: last_pcday = k table = 'stats_activity_by_day' key = None return self.cached_get_bq_table(dataset, table, sql=sql, key=key, depends_on=['%s.%s' % (input_dataset, last_pcday), '%s.person_course' % dataset ], logger=logging.error)
def get_stats_module_usage( course_id, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, ): ''' Get data from the stats_module_usage table, if it doesn't already exist as a local file. Compute it if necessary. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) sql = """ SELECT module_type, module_id, count(*) as ncount FROM [{dataset}.studentmodule] group by module_id, module_type order by module_id """.format(dataset=dataset) table = 'stats_module_usage' course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / (table + ".csv") data = {} if csvfn.exists(): # read file into data structure for k in list(csv.DictReader(open(csvfn))): midfrag = tuple(k['module_id'].split('/')[-2:]) data[midfrag] = k else: # download if it is already computed, or recompute if needed bqdat = bqutil.get_bq_table(dataset, table, sql=sql) if bqdat is None: bqdat = {'data': []} fields = ["module_type", "module_id", "ncount"] fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=fields) cdw.writeheader() for k in bqdat['data']: midfrag = tuple(k['module_id'].split('/')[-2:]) data[midfrag] = k try: k['module_id'] = k['module_id'].encode('utf8') cdw.writerow(k) except Exception as err: print "Error writing row %s, err=%s" % (k, str(err)) fp.close() print "[analyze_content] got %d lines of studentmodule usage data" % len( data) return data
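# Small illustration (the module_id here is made up) of the (module_type, url_name)
# key used above to index studentmodule usage rows:
module_id = 'i4x://MITx/6.002x/problem/Week_1_Q1'
midfrag = tuple(module_id.split('/')[-2:])
print midfrag  # ('problem', 'Week_1_Q1')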
def compute_problem_stats(self, course_id): ''' Compute problem average grade, attempts ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) sql = """ SELECT '{course_id}' as course_id, PA.problem_url_name as url_name, avg(PA.attempts) as avg_attempts, avg(PA.grade) as avg_grade, max(PA.max_grade) as max_max_grade, max(PA.grade) as emperical_max_grade, count(*) as nsubmissions, min(PA.created) as first_date, max(PA.created) as last_date, max(PA.attempts) as max_attempts, sum(case when integer(10*PA.grade/PA.max_grade)=0 then 1 else 0 end) as grade_hist_bin0, sum(case when integer(10*PA.grade/PA.max_grade)=1 then 1 else 0 end) as grade_hist_bin1, sum(case when integer(10*PA.grade/PA.max_grade)=2 then 1 else 0 end) as grade_hist_bin2, sum(case when integer(10*PA.grade/PA.max_grade)=3 then 1 else 0 end) as grade_hist_bin3, sum(case when integer(10*PA.grade/PA.max_grade)=4 then 1 else 0 end) as grade_hist_bin4, sum(case when integer(10*PA.grade/PA.max_grade)=5 then 1 else 0 end) as grade_hist_bin5, sum(case when integer(10*PA.grade/PA.max_grade)=6 then 1 else 0 end) as grade_hist_bin6, sum(case when integer(10*PA.grade/PA.max_grade)=7 then 1 else 0 end) as grade_hist_bin7, sum(case when integer(10*PA.grade/PA.max_grade)=8 then 1 else 0 end) as grade_hist_bin8, sum(case when integer(10*PA.grade/PA.max_grade)=9 then 1 else 0 end) as grade_hist_bin9, sum(case when integer(10*PA.grade/PA.max_grade)=10 then 1 else 0 end) as grade_hist_bin10, sum(case when integer(10*PA.attempts/M.max_attempts)=0 then 1 else 0 end) as attempts_hist_bin0, sum(case when integer(10*PA.attempts/M.max_attempts)=1 then 1 else 0 end) as attempts_hist_bin1, sum(case when integer(10*PA.attempts/M.max_attempts)=2 then 1 else 0 end) as attempts_hist_bin2, sum(case when integer(10*PA.attempts/M.max_attempts)=3 then 1 else 0 end) as attempts_hist_bin3, sum(case when integer(10*PA.attempts/M.max_attempts)=4 then 1 else 0 end) as attempts_hist_bin4, sum(case when integer(10*PA.attempts/M.max_attempts)=5 then 1 else 0 end) as attempts_hist_bin5, sum(case when integer(10*PA.attempts/M.max_attempts)=6 then 1 else 0 end) as attempts_hist_bin6, sum(case when integer(10*PA.attempts/M.max_attempts)=7 then 1 else 0 end) as attempts_hist_bin7, sum(case when integer(10*PA.attempts/M.max_attempts)=8 then 1 else 0 end) as attempts_hist_bin8, sum(case when integer(10*PA.attempts/M.max_attempts)=9 then 1 else 0 end) as attempts_hist_bin9, sum(case when integer(10*PA.attempts/M.max_attempts)=10 then 1 else 0 end) as attempts_hist_bin10, FROM [{dataset}.problem_analysis] PA JOIN (SELECT problem_url_name as url_name, max(attempts) as max_attempts FROM [{dataset}.problem_analysis] group by url_name) as M ON PA.problem_url_name = M.url_name group by url_name, problem_url_name order by problem_url_name """.format(dataset=dataset, course_id=course_id) table = self.add_collection_name_prefix('stats_for_problems') key = {'name': 'url_name'} return self.cached_get_bq_table(dataset, table, sql=sql, key=key, depends_on=['%s.problem_analysis' % dataset])
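# Sketch of the 11-bin histogram indexing used in the SQL above: a grade is mapped to
# integer(10 * grade / max_grade), so bins 0-9 each cover a 10% band and bin 10 holds
# only perfect scores. Sample values are illustrative.
def grade_hist_bin(grade, max_grade):
    return int(10 * grade / max_grade)

print grade_hist_bin(3.0, 10.0)   # 3
print grade_hist_bin(9.5, 10.0)   # 9
print grade_hist_bin(10.0, 10.0)  # 10 -- perfect score lands in the last bin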
def create_person_item_table(course_id, force_recompute=False, use_dataset_latest=False): ''' Generate person_item table, with one row per (user_id, item_id), giving grade points earned, attempts, and datestamp. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) tablename = "person_item" the_sql = """ # compute person-item table SELECT user_id, # PA.item_id as item_id, CI.item_short_id as item_short_id, CI.item_nid as item_nid, item_grade, n_attempts, date FROM ( SELECT user_id, item.answer_id as item_id, if(item.correct_bool, 1, 0) as item_grade, attempts as n_attempts, max(created) as date, FROM [{dataset}.problem_analysis] group by user_id, item_id, item_grade, n_attempts # force (user_id, item_id) to be unique (it should always be, even w/o this) ) PA JOIN [{dataset}.course_item] CI on PA.item_id = CI.item_id order by user_id, CI.content_index, CI.item_number """.format(dataset=dataset, course_id=course_id) depends_on = [ "%s.course_item" % dataset, "%s.problem_analysis" % dataset ] try: bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, depends_on=depends_on, force_query=force_recompute, startIndex=-2) except Exception as err: print "[make_person_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename) print the_sql raise if not bqdat: nfound = 0 else: nfound = bqutil.get_bq_table_size_rows(dataset, tablename) print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound) sys.stdout.flush()
def load_person_course(self, course_id): ''' Get person_course table from BQ. Use memcache. The person_course table has these relevant fields (among many): username, viewed, explored, ip ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) table = "person_course" key = None return self.cached_get_bq_table(dataset, table)
def do_user_part_csv(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None): sdir = find_course_sql_dir(course_id, basedir=basedir, datedir=datedir, use_dataset_latest=(use_dataset_latest and not pin_date), ) # upload to google storage dfn = sdir / "user_api_usercoursetag.csv.gz" if not os.path.exists(dfn): print("[load_user_part] Missing %s, skipping" % dfn) return # reformat True / False to 1/0 for "value" field if verbose: print("[load_user_part] extracting user partition data from %s" % dfn) sys.stdout.flush() cdr = csv.DictReader(gzip.GzipFile(dfn)) fields = cdr.fieldnames if verbose: print("fieldnames = %s" % fields) fixed_data = [] bmap = {'true': 1, 'false': 0} for row in cdr: vstr = row['value'].lower() row['value'] = bmap.get(vstr, vstr) fixed_data.append(row) ofnb = 'user_partitions.csv.gz' odfn = sdir / ofnb with gzip.GzipFile(odfn, 'w') as ofp: cdw = csv.DictWriter(ofp, fieldnames=fields) cdw.writeheader() cdw.writerows(fixed_data) if verbose: print("[load_user_part] Wrote %d rows of user partition data to %s" % (len(fixed_data), odfn)) sys.stdout.flush() gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)) gsutil.upload_file_to_gs(odfn, gsdir / ofnb, verbose=False) mypath = os.path.dirname(os.path.realpath(__file__)) schema = json.loads(open('%s/schemas/schema_user_partitions.json' % mypath).read())['user_partitions'] # import into BigQuery table = "user_partitions" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
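# Illustration of the value normalization above: csv.DictReader yields strings, so
# 'True'/'False' (any case) are mapped to 1/0 and any other value passes through
# lower-cased unchanged. Sample values are made up.
bmap = {'true': 1, 'false': 0}
for raw in ['True', 'false', '7']:
    print raw, '->', bmap.get(raw.lower(), raw.lower())
# True -> 1, false -> 0, 7 -> 7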
def get_table(self, dataset=None, table=None, org=None, number=None, semester=None): ''' show arbitrary table from bigquery -- mainly for debugging ''' if dataset is None: course_id = '/'.join([org, number, semester]) dataset = bqutil.course_id2dataset( course_id, use_dataset_latest=self.use_dataset_latest()) if not self.is_user_authorized_for_course(course_id): return self.no_auth_sorry() if ('person' in table) or ('track' in table) or ('student' in table): if not self.does_user_have_role('instructor', course_id): return self.no_auth_sorry() # be more restrictive: researchers only if not (self.does_user_have_role('researcher', course_id)): return self.no_auth_sorry() else: course_id = None if not self.user in self.AUTHORIZED_USERS: return self.no_auth_sorry() tableinfo = bqutil.get_bq_table_info(dataset, table) fields = tableinfo['schema']['fields'] field_names = [x['name'] for x in fields] tablecolumns = json.dumps([{ 'data': x, 'title': x, 'class': 'dt-center' } for x in field_names]) logging.info(tablecolumns) data = self.common_data data.update({ 'dataset': dataset, 'table': table, 'course_id': course_id, 'tablecolumns': tablecolumns, }) template = JINJA_ENVIRONMENT.get_template('show_table.html') self.response.out.write(template.render(data))
def ajax_get_table_data(self, org=None, number=None, semester=None, table=None): ''' show arbitrary table from bigquery -- mainly for debugging - ajax data ''' course_id = '/'.join([org, number, semester]) dataset = bqutil.course_id2dataset( course_id, use_dataset_latest=self.use_dataset_latest()) if ('person' in table) or ('track' in table) or ('student' in table): if not self.does_user_have_role('instructor', course_id): return self.no_auth_sorry() # be more restrictive: researchers only if not (self.does_user_have_role('researcher', course_id)): return self.no_auth_sorry() # DataTables server-side processing: http://datatables.net/manual/server-side draw = int(self.request.POST['draw']) start = int(self.request.POST['start']) length = int(self.request.POST['length']) bqdata = self.cached_get_bq_table(dataset, table, startIndex=start, maxResults=length) self.fix_bq_dates(bqdata) logging.info('get draw=%s, start=%s, length=%s' % (draw, start, length)) if 0: for row in bqdata['data']: for key in row: row[key] = row[key].encode('utf-8') data = self.common_data data.update({ 'data': bqdata['data'], 'draw': draw, 'recordsTotal': bqdata['numRows'], 'recordsFiltered': bqdata['numRows'], }) self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data))
def compute_activity_by_day(self, course_id, start=None, end="2015-01-01"): ''' Compute course activity by day, based on person_course_day table ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) ongoing = self.get_collection_metadata('ONGOING', False) if ongoing: end = datetime.datetime.now().strftime('%Y-%m-%d') else: end = self.get_collection_metadata('END_DATE', end) start = start or self.get_collection_metadata('START_DATE', "2012-08-20") logging.info('[compute_activity_by_day] start=%s, end=%s' % (start, end)) tables = bqutil.get_list_of_table_ids(dataset) if 'person_course_day' not in tables: logging.info('--> Warning: using old *_pcday tables for activity_by_day for %s' % course_id) return self.OLD_compute_activity_by_day(course_id, start=start, end=end) sql = """ SELECT date(last_event) as date, sum(nevents) as nevents, sum(nvideo) as nvideo, sum(nshow_answer) as nshow_answer, sum(nproblem_check) as nproblem_check, sum(nforum) as nforum, sum(ntranscript) as ntranscript, sum(nseq_goto) as nseq_goto, sum(nseek_video) as nseek_video, sum(nprogcheck) as nprogcheck, sum(npause_video) as npause_video, sum(sum_dt) as sum_dt, avg(avg_dt) as avg_dt, sum(n_dt) as n_dt, FROM [{dataset}.person_course_day] WHERE date <= "{end}" AND date >= "{start}" group by date order by date """.format(dataset=dataset, course_id=course_id, start=start, end=end) table = self.add_collection_name_prefix('stats_activity_by_day') key = None ret = self.cached_get_bq_table(dataset, table, sql=sql, key=key, depends_on=['%s.person_course_day' % dataset ], logger=logging.error) return ret
def already_exists(course_id, use_dataset_latest=False): ''' Return True if course_axis table already exists, and is sufficiently new ''' table = "course_axis" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) # tables = bqutil.get_list_of_table_ids(dataset) # return table in tables try: mdt = bqutil.get_bq_table_last_modified_datetime(dataset, table) except Exception as err: return False if mdt < datetime.datetime(2015, 10, 31, 17, 00): return False return True
def load_course_axis(self, course_id, dtype='data_by_key'): ''' Get course axis table from BQ. Use memcache. The course axis has these fields: category, index, url_name, name, gformat, due, start, module_id, course_id, path, data.ytid, data.weight, chapter_mid ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) table = "course_axis" key={'name': 'url_name'} # return self.cached_get_bq_table(dataset, table, key=key, drop=['data'])['data_by_key'] bqdat = self.cached_get_bq_table(dataset, table, key=key, depends_on=['%s.%s' % (dataset, table)]) self.course_axis = bqdat return bqdat[dtype]
def make_gp_table(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None): if pin_date: datedir = pin_date sdir = load_course_sql.find_course_sql_dir(course_id, basedir=basedir, datedir=datedir, use_dataset_latest=(use_dataset_latest and not pin_date), ) fn_to_try = ['course.xml.tar.gz', 'course-prod-analytics.xml.tar.gz', 'course-prod-edge-analytics.xml.tar.gz', 'course-prod-edx-replica.xml.tar.gz', ] for fntt in fn_to_try: fn = sdir / fntt if os.path.exists(fn): break if not os.path.exists(fn): msg = "---> oops, cannot get course content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn) raise Exception(msg) gpstr, gpfn = read_grading_policy_from_tar_file(fn) fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn) ofnb = 'grading_policy.csv' ofn = sdir / ofnb ofp = open(ofn, 'w') cdw = csv.DictWriter(ofp, fieldnames=fields) cdw.writeheader() cdw.writerows(gptab) ofp.close() # upload to google storage gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)) gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False) # import into BigQuery table = "grading_policy" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
def compute_sm_usage(self, course_id): ''' Compute usage stats from studentmodule table for course ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) sql = """ SELECT module_type, module_id, count(*) as ncount FROM [{dataset}.studentmodule] group by module_id, module_type order by module_id """.format(dataset=dataset) table = self.add_collection_name_prefix('stats_module_usage') key = {'name': 'module_id'} return self.cached_get_bq_table(dataset, table, sql=sql, key=key, depends_on=['%s.studentmodule' % dataset, '%s.course_axis' % dataset ])
def process_time_on_task_totals(course_id, force_recompute=False, use_dataset_latest=False): SQL = """ SELECT "{course_id}" as course_id, username, sum(total_time_5) as total_time_5, sum(total_time_30) as total_time_30, sum(total_video_time_5) as total_video_time_5, sum(total_video_time_30) as total_video_time_30, sum(serial_video_time_30) as serial_video_time_30, sum(total_problem_time_5) as total_problem_time_5, sum(total_problem_time_30) as total_problem_time_30, sum(serial_problem_time_30) as serial_problem_time_30, sum(total_forum_time_5) as total_forum_time_5, sum(total_forum_time_30) as total_forum_time_30, sum(serial_forum_time_30) as serial_forum_time_30, sum(total_text_time_5) as total_text_time_5, sum(total_text_time_30) as total_text_time_30, sum(serial_text_time_30) as serial_text_time_30, FROM [{dataset}.time_on_task] GROUP BY course_id, username order by username """ dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) the_sql = SQL.format(dataset=dataset, course_id=course_id) tablename = 'time_on_task_totals' print "Computing %s for %s" % (tablename, dataset) sys.stdout.flush() bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, force_query=force_recompute, depends_on=[ '%s.time_on_task' % dataset ], ) return bqdat
def compute_geo_stats(self, course_id): ''' Compute geographic distributions ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) sql = """ SELECT '{course_id}' as course_id, cc_by_ip as cc, countryLabel as countryLabel, count(*) as nregistered, sum(case when viewed then 1 else 0 end) as nviewed, sum(case when explored then 1 else 0 end) as nexplored, sum(case when certified then 1 else 0 end) as ncertified, sum(case when gender='m' then 1 else 0 end) as n_male, sum(case when gender='f' then 1 else 0 end) as n_female, sum(case when mode="verified" then 1 else 0 end) as n_verified_id, sum(case when (certified and mode="verified") then 1 else 0 end) as n_verified_certified, sum(ndays_act) as ndays_act_sum, sum(nevents) as nevents_sum, sum(nforum_posts) as nforum_posts_sum, sum(nshow_answer) as nshow_answer_sum, avg(avg_dt) as avg_of_avg_dt, avg(sum_dt) as avg_of_sum_dt, avg(case when certified then avg_dt else null end) as certified_avg_dt, avg(case when certified then sum_dt else null end) as certified_sum_dt, FROM [{dataset}.person_course] # WHERE cc_by_ip is not null group by cc, countryLabel, course_id order by cc """.format(dataset=dataset, course_id=course_id) table = self.add_collection_name_prefix('stats_geo0') key = None return self.cached_get_bq_table(dataset, table, sql=sql, key=key, depends_on=['%s.person_course' % dataset])
def count_tracking_log_events(self): ''' Loop over all tracking logs up to cutoff date, and sum up number of entries, by doing table info lookups, with no SQL queries. ''' if self.skip_or_do_step("count_events") < 0: return # skip step tlend = self.end_date.replace('-', '') # end_date normally specified as YYYY-MM-DD log_event_counts = {} # iterate over each course, one at a time for course_id in self.course_id_set: log_dataset = bqutil.course_id2dataset(course_id, dtype="logs") # get list of all tracking log files for this course log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')] log_tables_todo = [x for x in log_tables if x[9:] <= tlend] log_tables_todo.sort() print "[count_tracking_log_events] for course %s using %d tracking log tables, from %s to %s" % (course_id, len(log_tables_todo), log_tables_todo[0], log_tables_todo[-1]) sys.stdout.flush() # go through all log files and get size on each row_sizes = [ bqutil.get_bq_table_size_rows(log_dataset, x) for x in log_tables_todo ] log_event_counts[course_id] = sum(row_sizes) print " For %s found %d total tracking log events" % (course_id, log_event_counts[course_id]) sys.stdout.flush() self.log_event_counts = log_event_counts self.total_events = sum(log_event_counts.values()) print "--> Total number of events for %s = %d" % (self.org, self.total_events)
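# Quick illustration of the table-id slicing used above: daily tracking log tables are
# named tracklog_YYYYMMDD, so x[9:] is the date portion, and the YYYYMMDD form compares
# correctly as a string against the cutoff date.
table_id = 'tracklog_20140917'
print table_id[9:]                # '20140917'
print table_id[9:] <= '20141001'  # True -> this day is included in the count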
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=False, start_date=None, end_date=None, get_date_function=None, existing=None, log_dates=None, days_delta=1, skip_last_day=False, has_hash_limit=False, newer_than=None, table_max_size_mb=800, limit_query_size=False): ''' make a certain table (with SQL given) for specified course_id. The master table holds all the data for a course. It isn't split into separate days. It is ordered in time, however. To update it, a new day's logs are processed, then the results appended to this table. If the table doesn't exist, then run it once on all the existing tracking logs. If it already exists, then run a query on it to see what dates have already been done. Then do all tracking logs except those which have already been done. Append the results to the existing table. If the query fails because of "Resources exceeded during query execution" then try setting the end_date, to do part at a time. NOTE: the SQL must produce a result which is ordered by date, in increasing order. days_delta = integer number of days to increase each time; specify 0 for one day overlap, but make sure the SQL query only selects for time > TIMESTAMP("{last_date}") If skip_last_day is True then do not include the last day of tracking log data in the processing. This is done to avoid processing partial data, e.g. when tracking log data are incrementally loaded with a delta of less than one day. start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format. newer_than = if specified, as datetime, then any existing destination table must be newer than this datetime, else force_recompute is made True ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) # destination log_dataset = bqutil.course_id2dataset(course_id, dtype="logs") if existing is None: existing = bqutil.get_list_of_table_ids(dataset) print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset) if log_dates is None: log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')] log_dates = [x[9:] for x in log_tables] log_dates.sort() if len(log_dates)==0: print "--> no tracking logs in %s aborting!" % (log_dataset) return if skip_last_day: old_max_date = max(log_dates) log_dates.remove(max(log_dates)) # remove the last day of data from consideration max_date = max(log_dates) print " --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date) sys.stdout.flush() min_date = min(log_dates) max_date = max(log_dates) if start_date: start_date = start_date.replace('-','') if min_date < start_date: print " --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date) min_date = start_date if end_date is not None: print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date) sys.stdout.flush() the_end_date = end_date.replace('-','') # end_date should be YYYY-MM-DD if the_end_date < max_date: max_date = the_end_date if (table in existing) and newer_than: # get date on existing table table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table) if (table_datetime < newer_than): print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than) force_recompute = True if force_recompute: overwrite = True else: overwrite = False last_date = "2010-10-01 01:02:03" # default last date if (not overwrite) and table in existing: # find out what the end date is of the current table pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100) if (pc_last is None): print "--> no data in table %s.%s, starting from scratch!" % (dataset, table) overwrite = True else: last_dates = [get_date_function(x) for x in pc_last['data']] last_date = max(last_dates) table_max_date = last_date.strftime('%Y%m%d') if max_date <= table_max_date: print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, table_max_date, min_date, max_date) sys.stdout.flush() return min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d') print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, table_max_date, min_date, max_date) sys.stdout.flush() overwrite = 'append' if overwrite=='append': print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (table, course_id, min_date, max_date, last_date, datetime.datetime.now()) else: print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (table, course_id, min_date, max_date, datetime.datetime.now()) sys.stdout.flush() if limit_query_size: # do only one day's tracking logs, and force use of hash if table is too large print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb the_max_date = max_date # save max_date information while min_date not in log_dates: print " tracklog_%s does not exist!" % min_date for ld in log_dates: if ld < min_date: continue if (ld > min_date) and (ld <= max_date): min_date = ld break if min_date not in log_dates: print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date raise Exception("[process_tracking_logs] missing tracking log") max_date = min_date print ' min_date = max_date = %s' % min_date tablename = 'tracklog_%s' % min_date.replace('-', '') tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0*1024) nhashes = int(math.ceil(tablesize_mb / table_max_size_mb)) from_datasets = "[%s.%s]" % (log_dataset, tablename) print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes) sys.stdout.flush() if nhashes: if not has_hash_limit: print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, tablesize_mb, table_max_size_mb) print "SQL: ", SQL raise Exception("[process_tracking_logs] table too large") for k in range(nhashes): hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k) the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit) print " Hash %d" % k sys.stdout.flush() try: bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True) except Exception as err: print the_sql raise overwrite = "append" else: the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="") try: bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True) except Exception as err: print the_sql raise txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename) bqutil.add_description_to_table(dataset, table, txt, append=True) print "----> Done with day %s" % max_date if the_max_date > max_date: # more days still to be done print "--> Moving on to another day (max_date=%s)" % the_max_date run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=get_date_function, existing=existing, log_dates=log_dates, has_hash_limit=has_hash_limit, table_max_size_mb=table_max_size_mb, limit_query_size=limit_query_size, ) return from_datasets = """( TABLE_QUERY({dataset}, "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}" ) ) """.format(dataset=log_dataset, start=min_date, end=max_date) the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="") try: bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True) except Exception as err: if ( ('Response too large to return.' in str(err)) and has_hash_limit ): # try using hash limit on username # e.g. WHERE ABS(HASH(username)) % 4 = 0 for k in range(4): hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit) bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True) overwrite = "append" elif ('Resources exceeded during query execution' in str(err)): if True: # figure out time interval in days, and split that in half start_date = datetime.datetime.strptime(min_date, '%Y%m%d') end_date = datetime.datetime.strptime(max_date, '%Y%m%d') ndays = (end_date - start_date).days if (ndays < 1) and has_hash_limit: print "----> ndays=%d; retrying with limit_query_size" % ndays sys.stdout.flush() return run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, end_date=max_date, get_date_function=get_date_function, has_hash_limit=has_hash_limit, # existing=existing, log_dates=log_dates, limit_query_size=True, ) elif (ndays < 1): print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays raise nd1 = int(ndays/2) nd2 = ndays - nd1 #if nd2 > nd1: # nd1 = nd2 # nd2 = ndays - nd1 print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2) sys.stdout.flush() end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d') print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date) sys.stdout.flush() run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=get_date_function, has_hash_limit=has_hash_limit, # existing=existing, log_dates=log_dates) end_date = max_date print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date) sys.stdout.flush() run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=get_date_function, has_hash_limit=has_hash_limit, # existing=existing, log_dates=log_dates) print "--> Done with %d + %d days!" % (nd1, nd2) return if False: def get_ym(x): return int(x[0:4]), int(x[4:6]), int(x[6:]) (min_year, min_month, min_day) = get_ym(min_date) (max_year, max_month, max_day) = get_ym(max_date) nmonths = max_month - min_month + 12 * (max_year - min_year) print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time" sys.stdout.flush() (end_year, end_month) = (min_year, min_month) for dm in range(nmonths): end_month += 1 if end_month > 12: end_month = 1 end_year += 1 end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day) print "--> with end_date=%s" % end_date sys.stdout.flush() run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=get_date_function, has_hash_limit=has_hash_limit, # existing=existing, log_dates=log_dates) force_recompute = False # after first, don't force recompute return else: print the_sql raise if overwrite=='append': txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date) bqutil.add_description_to_table(dataset, table, txt, append=True) print "Done with course %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
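# Minimal usage sketch for run_query_on_tracking_logs, modeled on createVideoStats_day
# above: the caller's SQL leaves {DATASETS} (and, if wanted, {last_date} / {hash_limit})
# as placeholders for this function to fill in, and supplies a get_date_function that
# recovers the date from a result row. The destination table name and the SQL below are
# assumptions for illustration, so the actual call is left commented out.
import datetime

example_sql = """
          SELECT date(time) as date, username, count(*) as nevents
          FROM {DATASETS}
          WHERE time > TIMESTAMP("{last_date}")
          group by username, date
          order by date
          """

def example_get_date(row):
    return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

# run_query_on_tracking_logs(example_sql, "stats_example_daily_events", course_id,
#                            get_date_function=example_get_date, skip_last_day=True)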
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest): ''' Create Video stats for Videos Viewed and Videos Watched. First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched ''' assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable" # Get Course Dir path basedir = path(basedir or '') course_dir = course_id.replace('/', '__') lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) # get schema mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS) the_schema = json.loads(open(SCHEMA_FILE).read())[SCHEMA_VIDEO_AXIS_NAME] the_dict_schema = schema2dict(the_schema) # Create initial video axis videoAxisExists = False dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) va_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS) assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS) videoAxisExists = True va_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: print "%s --> Attempting to process %s table" % (str(err), TABLE_VIDEO_AXIS) sys.stdout.flush() # get course axis time ca_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS) ca_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: pass if videoAxisExists and (not force_recompute) and ca_date and va_date and ( ca_date > va_date): force_recompute = True print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % ( va_date, ca_date) sys.stdout.flush() if not videoAxisExists or force_recompute: force_recompute = True createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest) # Get video lengths va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS) assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table" va_bqdata = va['data'] fileoutput = lfp / FILENAME_VIDEO_AXIS getYoutubeDurations(dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute) # upload and import video axis gsfn = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS gsutil.upload_file_to_gs(fileoutput, gsfn) table = TABLE_VIDEO_AXIS bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True) else: print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS) # Lastly, create video stats createVideoStats_day(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest) createVideoStats(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
def createVideoStats_obsolete(course_id, force_recompute=False, use_dataset_latest=False, startDate=DATE_DEFAULT_START, endDate=DATE_DEFAULT_END): ''' Create video statistics for viewed by looking for users who had a video position > 0, and watched by looking for users who had a video position > 95% of the total video length duration. This was the original method used, but is not the most efficient since it queries entire log set. Instead, generate video stats per day, then incrementally append to that data table as the daily log data comes in. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) logs = bqutil.course_id2dataset(course_id, dtype='logs') table = TABLE_VIDEO_STATS the_sql = """ SELECT index_chapter, index_video, name, video_id, chapter_name, sum(case when position > 0 then 1 else 0 end) as videos_viewed, sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched, FROM (SELECT username, #module_id as video_id, #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'[-]' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position, FROM (TABLE_QUERY({logs}, "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start_date} and {end_date}")) WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and event is not null group by username, video_id order by username, video_id) as video_log, LEFT JOIN EACH (SELECT video_length, video_id as vid_id, name, index_video, index_chapter, chapter_name FROM [{dataset}.{videoaxis}] ) as {videoaxis} ON video_log.video_id = {videoaxis}.vid_id WHERE video_id is not null group by video_id, name, index_chapter, index_video, chapter_name order by index_video asc; """.format(dataset=dataset, start_date=startDate, end_date=endDate, logs=logs, videoaxis=TABLE_VIDEO_AXIS) print "[analyze_videos] Creating %s.%s table for %s" % ( dataset, TABLE_VIDEO_STATS, course_id) sys.stdout.flush() try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS) assert tinfo is not None, "[analyze videos] %s table depends on %s, which does not exist" % ( TABLE_VIDEO_STATS, TABLE_VIDEO_AXIS) except (AssertionError, Exception) as err: print " --> Err: missing %s.%s? Skipping creation of %s" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS) sys.stdout.flush() return bqdat = bqutil.get_bq_table( dataset, table, the_sql, force_query=force_recompute, depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)], ) return bqdat
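# Sketch of how the obsolete query above selects its input: daily tracking-log tables are
# named tracklog_YYYYMMDD, and the legacy-SQL TABLE_QUERY construct unions every table
# whose date suffix lies in [start_date, end_date]. The dataset name and dates in the
# example call are hypothetical.
def tracklog_table_query(logs_dataset, start_date, end_date):
    '''Build the FROM clause fragment used above (start/end given as YYYYMMDD strings).'''
    return ('(TABLE_QUERY({logs}, "integer(regexp_extract(table_id, '
            'r\'tracklog_([0-9]+)\')) BETWEEN {start} and {end}"))').format(
                logs=logs_dataset, start=start_date, end=end_date)

# e.g. tracklog_table_query("MITx__8_MReV__2013_Summer_logs", "20130601", "20130901")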
def createVideoStats(course_id, force_recompute=False, use_dataset_latest=False): ''' Final step for video stats is to run through daily video stats table and aggregate for entire course for videos watched and videos viewed Join results with video axis to get detailed metadata per video for dashboard data ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) logs = bqutil.course_id2dataset(course_id, dtype='logs') table = TABLE_VIDEO_STATS the_sql = """ SELECT index_chapter, index_video, name, video_id, chapter_name, sum(case when position > 0 then 1 else 0 end) as videos_viewed, sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched, FROM ( SELECT username, index_chapter, index_video, name, video_id, chapter_name, max(position) as position, video_length, FROM (SELECT * FROM [{dataset}.{videostatsperday}]) as video_log, LEFT JOIN EACH (SELECT video_length, video_id as vid_id, name, index_video, index_chapter, chapter_name FROM [{dataset}.{videoaxis}] ) as video_axis ON video_log.video_id = video_axis.vid_id WHERE video_id is not null and username is not null group by username, video_id, name, index_chapter, index_video, chapter_name, video_length order by video_id asc) GROUP BY video_id, index_chapter, index_video, name, chapter_name ORDER BY index_video asc; """.format(dataset=dataset, videoaxis=TABLE_VIDEO_AXIS, videostatsperday=TABLE_VIDEO_STATS_PER_DAY) print "[analyze_videos] Creating %s.%s table for %s" % ( dataset, TABLE_VIDEO_STATS, course_id) sys.stdout.flush() try: tinfo_va = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS) trows_va = int(tinfo_va['numRows']) tinfo_va_day = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_STATS_PER_DAY) trows_va_day = int(tinfo_va_day['numRows']) assert tinfo_va is not None and trows_va != 0, "[analyze videos] %s table depends on %s, which does not exist" % ( TABLE_VIDEO_STATS, TABLE_VIDEO_AXIS) assert tinfo_va_day is not None and trows_va_day != 0, "[analyze videos] %s table depends on %s, which does not exist" % ( TABLE_VIDEO_STATS, TABLE_VIDEO_STATS_PER_DAY) except (AssertionError, Exception) as err: print " --> Err: missing %s.%s and/or %s (including 0 rows in table)? Skipping creation of %s" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS_PER_DAY, TABLE_VIDEO_STATS) sys.stdout.flush() return bqdat = bqutil.get_bq_table( dataset, table, the_sql, force_query=force_recompute, depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)], ) return bqdat
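# For clarity, a pure-Python version of the aggregation the SQL above performs (the
# pipeline itself does this in BigQuery): take each user's maximum playback position per
# video across all days, then count the video as "viewed" when that position is > 0 and
# as "watched" when it exceeds 95% of the video length. Row dicts follow the shape of
# bqutil.get_table_data(...)['data']; video_lengths is a hypothetical {video_id: seconds} map.
from collections import defaultdict

def count_viewed_watched(per_day_rows, video_lengths):
    max_pos = defaultdict(float)
    for row in per_day_rows:
        key = (row['username'], row['video_id'])
        max_pos[key] = max(max_pos[key], float(row['position'] or 0))
    viewed = defaultdict(int)
    watched = defaultdict(int)
    for (username, video_id), pos in max_pos.items():
        if pos > 0:
            viewed[video_id] += 1
        if video_id in video_lengths and pos > 0.95 * float(video_lengths[video_id]):
            watched[video_id] += 1
    return viewed, watched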
def CreateForumPosts(course_id, force_recompute=True, use_dataset_latest=False, skip_last_day=False, end_date=None): ''' Create Forum posts table, based on forum data. Categorizes forum posts as initial_post, response_post or comment. Also extracts first 100 characters of the post content as a preview. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_FORUM_POSTS the_sql = """ SELECT * FROM ( SELECT ADDED_TITLE.FA.username as username, "{course_id}" as course_id, ADDED_TITLE.FA.slug_id as slug_id, ADDED_TITLE.FA.slug_type as slug_type, ADDED_TITLE.FA.thread_id as thread_id, ADDED_TITLE.FA.parent_id as parent_id, ADDED_TITLE.IP.username as original_poster, ADD_RESPOND_TO.username as responded_to, ADDED_TITLE.IP.title as title, ADDED_TITLE.FA.first_time as first_time, ADDED_TITLE.FA.body_preview as body_preview, FROM ( SELECT * FROM ( SELECT author_username as username, mongoid as slug_id, (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type, comment_thread_id as thread_id, parent_id, title, created_at as first_time, SUBSTR(body, 0, {post_preview_char_count}) as body_preview, FROM [{dataset}.{forum}] WHERE (_type = "Comment" and parent_id is not null) ) as FA # 3rd level comment LEFT JOIN EACH ( SELECT * FROM ( SELECT author_username as username, mongoid as slug_id, (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type, comment_thread_id as thread_id, parent_id, title, created_at as first_time, SUBSTR(body, 0, {post_preview_char_count}) as body_preview, FROM [{dataset}.{forum}] WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null) ) ) as IP ON FA.thread_id = IP.slug_id ) as ADDED_TITLE LEFT JOIN EACH ( SELECT author_username as username, mongoid as slug_id, comment_thread_id as thread_id, parent_id, FROM [{dataset}.{forum}] ) as ADD_RESPOND_TO ON ADDED_TITLE.FA.parent_id = ADD_RESPOND_TO.slug_id WHERE ADDED_TITLE.FA.slug_type = "comment" ) as RC, ( SELECT FA.username as username, "{course_id}" as course_id, FA.slug_id as slug_id, FA.slug_type as slug_type, FA.thread_id as thread_id, FA.parent_id as parent_id, IP.username as original_poster, IP.title as title, FA.first_time as first_time, FA.body_preview as body_preview FROM ( SELECT author_username as username, mongoid as slug_id, (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type, comment_thread_id as thread_id, parent_id, title, created_at as first_time, SUBSTR(body, 0, {post_preview_char_count}) as body_preview, FROM [{dataset}.{forum}] WHERE (_type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null) ) as FA # 2nd level comment LEFT JOIN EACH ( SELECT * FROM ( SELECT author_username as username, mongoid as slug_id, (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as 
slug_type, comment_thread_id as thread_id, parent_id, title, created_at as first_time, SUBSTR(body, 0, {post_preview_char_count}) as body_preview, FROM [{dataset}.{forum}] WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null) ) ) as IP ON FA.thread_id = IP.slug_id ) as RC2, ( SELECT * FROM ( SELECT author_username as username, "{course_id}" as course_id, mongoid as slug_id, (case when _type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null then "initial_post" end) as slug_type, comment_thread_id as thread_id, parent_id, title, created_at as first_time, SUBSTR(body, 0, {post_preview_char_count}) as body_preview, FROM [{dataset}.{forum}] WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null) ) ) as NA """.format(dataset=dataset, course_id=course_id, forum=TABLE_FORUM, post_preview_char_count=POST_PREVIEW_CHAR_COUNT) print "[make_forum_analysis] Creating %s.%s table for %s" % ( dataset, TABLE_FORUM_POSTS, course_id) sys.stdout.flush() try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_FORUM) assert tinfo is not None, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_POSTS, TABLE_FORUM) except (AssertionError, Exception) as err: print " --> Err: missing %s.%s? Skipping creation of %s" % ( dataset, TABLE_FORUM, TABLE_FORUM_POSTS) sys.stdout.flush() return bqdat = bqutil.get_bq_table( dataset, table, the_sql, force_query=force_recompute, depends_on=["%s.%s" % (dataset, TABLE_FORUM)], ) return bqdat
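# Illustrative Python restatement of the CASE logic in the SQL above, classifying a single
# (rephrased) forum document into one of the three post types; the pipeline itself does
# this classification entirely in SQL.
def classify_forum_post(doc):
    '''doc: dict with _type, comment_thread_id, parent_id and mongoid fields.'''
    if (doc.get('_type') == 'CommentThread' and not doc.get('comment_thread_id')
            and not doc.get('parent_id') and doc.get('mongoid')):
        return 'initial_post'                      # top-level thread
    if (doc.get('_type') == 'Comment' and doc.get('comment_thread_id')
            and not doc.get('parent_id') and doc.get('mongoid')):
        return 'response_post'                     # direct response to a thread
    if doc.get('_type') == 'Comment' and doc.get('parent_id'):
        return 'comment'                           # comment on a response
    return None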
def rephrase_forum_json_for_course( course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading forum data for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forum table loaded into dataset %s. Redoing." % dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) sys.stdout.flush() return print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now()) sys.stdout.flush() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') for line in fp: cnt += 1 newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt == 0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to GCS" sys.stdout.flush() # do upload twice, because GCS file metadata doesn't always make it to BigQuery right away? gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
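# Minimal sketch of the streaming pattern used above, assuming Python 2 like the rest of
# this module: read forum.mongo line by line (falling back to a .gz copy), transform each
# line, and write a gzipped JSON output. rephrase_line is a hypothetical stand-in for
# do_rephrase_line().
import gzip, os

def open_maybe_gz(fn, mode='r'):
    if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
        fn += '.gz'
    if fn.endswith('.gz'):
        return gzip.GzipFile(fn, mode)
    return open(fn, mode)

def rephrase_stream(infn, outfn, rephrase_line):
    cnt = 0
    with open_maybe_gz(infn) as fin:
        with gzip.GzipFile(outfn, 'w') as fout:
            for line in fin:
                cnt += 1
                fout.write(rephrase_line(line, linecnt=cnt))
    return cnt                                     # number of forum entries processed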
def old_process_course(course_id, force_recompute=False): ''' DEPRECATED - because there is so little total data, create one enrollday_all table instead of one table per day (see other function below). make enrollday2_* tables for specified course_id ''' SQL = """ SELECT "{course_id}" as course_id, time, event_struct.user_id as user_id, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "honor") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "honor") then -1 else 0 end) as diff_enrollment_honor, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "verified") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "verified") then -1 else 0 end) as diff_enrollment_verified, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "audit") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "audit") then -1 else 0 end) as diff_enrollment_audit, FROM [{dataset}.{table_id}] where (event_type = "edx.course.enrollment.activated") or (event_type = "edx.course.enrollment.deactivated") order by time; """ course_dir = course_id.replace('/','__') dataset = bqutil.course_id2dataset(course_id) log_dataset = bqutil.course_id2dataset(course_id, dtype="logs") pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday") print "Processing course %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() log_tables = bqutil.get_tables(log_dataset) try: bqutil.create_dataset_if_nonexistent(pcd_dataset) except Exception as err: print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)) pcday_tables_info = bqutil.get_tables(pcd_dataset) pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])] # print "pcday_tables = ", pcday_tables log_table_list = log_tables['tables'] log_table_list.sort() for table in log_table_list: tr = table['tableReference'] table_id = tr['tableId'] if not table_id.startswith('tracklog'): continue date = table_id[9:] table_out = 'enrollday2_%s' % date if (table_out in pcday_tables) and not force_recompute: print "%s...already done, skipping" % table_id sys.stdout.flush() continue if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0: print "...zero size table %s, skipping" % table_id sys.stdout.flush() continue print ("Creating %s " % table_out), the_sql = SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id) sys.stdout.flush() bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False) print "Done with course %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
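# Sketch of the enrollment bookkeeping the deprecated SQL above encodes: an activation
# event contributes +1 and a deactivation -1 to the running enrollment count for the mode
# ("honor", "verified", "audit") carried on the event; summing the diffs per day gives the
# daily net enrollment change.
def enrollment_diff(event_type, event_mode, target_mode):
    if event_mode != target_mode:
        return 0
    if event_type == 'edx.course.enrollment.activated':
        return 1
    if event_type == 'edx.course.enrollment.deactivated':
        return -1
    return 0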
def already_exists(course_id, use_dataset_latest): table = "grading_policy" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) tables = bqutil.get_list_of_table_ids(dataset) return table in tables
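# Hypothetical usage sketch: already_exists() lets callers make grading_policy loading
# idempotent, e.g. inside a per-course loop; load_grading_policy here is an assumed
# callable, not a function defined in this module.
def maybe_load_grading_policy(course_id, use_dataset_latest, load_grading_policy, force_recompute=False):
    if already_exists(course_id, use_dataset_latest) and not force_recompute:
        print("--> %s: grading_policy table already exists, skipping" % course_id)
        return False
    load_grading_policy(course_id)
    return True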
def analyze_course_content( course_id, listings_file=None, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, do_upload=False, courses=None, verbose=True, pin_date=None, ): ''' Compute course_content table, which quantifies: - number of chapter, sequential, vertical modules - number of video modules - number of problem, *openended, mentoring modules - number of discussion, annotatable, word_cloud modules Do this using the course "xbundle" file, produced when the course axis is computed. Include only modules which had nontrivial use, to rule out the staff and un-shown content. Do the exclusion based on count of module appearing in the studentmodule table, based on stats_module_usage for each course. Also, from the course listings file, compute the number of weeks the course was open. If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset as the "stats_course_content" table. Also generate a "course_summary_stats" table, stored in the course_report_ORG or course_report_latest dataset. The course_summary_stats table combines data from many reports, including stats_course_content, the medians report, the listings file, broad_stats_by_course, and time_on_task_stats_by_course. ''' if do_upload: if use_dataset_latest: org = "latest" else: org = courses[0].split( '/', 1)[0] # extract org from first course_id in courses crname = 'course_report_%s' % org gspath = gsutil.gs_path_from_course_id(crname) gsfnp = gspath / CCDATA gsutil.upload_file_to_gs(CCDATA, gsfnp) tableid = "stats_course_content" dataset = crname mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath try: the_schema = json.loads(open(SCHEMA_FILE).read())[tableid] except Exception as err: print "Oops! Failed to load schema file for %s. Error: %s" % ( tableid, str(err)) raise if 0: bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1) table = 'course_metainfo' course_tables = ',\n'.join([ ('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses ]) sql = "select * from {course_tables}".format( course_tables=course_tables) print "--> Creating %s.%s using %s" % (dataset, table, sql) if 1: metainfo_dataset = bqutil.get_bq_table( dataset, table, sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), ) # bqutil.create_bq_table(dataset, table, sql, overwrite=True) #----------------------------------------------------------------------------- # make course_summary_stats table # # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo. # Also use (and create if necessary) the nregistered_by_wrap table. 
# get the broad_stats_by_course data bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course') table_list = bqutil.get_list_of_table_ids(dataset) latest_person_course = max( [x for x in table_list if x.startswith('person_course_')]) print "Latest person_course table in %s is %s" % (dataset, latest_person_course) sql = """ SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, FROM [{dataset}.{person_course}] as pc left join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' )) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id """.format(dataset=dataset, person_course=latest_person_course) nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'}) # rates for registrants before and during course sql = """ SELECT *, ncertified / nregistered * 100 as pct_certified_of_reg, ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch, ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course, ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap, ncertified / nviewed * 100 as pct_certified_of_viewed, ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap, ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap, FROM ( # ------------------------ # get aggregate data SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.certified then 1 else 0 end) ncertified, sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap, sum(case when pc.viewed then 1 else 0 end) nviewed, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap, sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap, sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch, sum(case when pc.start_time < cminfo.launch_date and pc.certified then 1 else 0 end) ncertified_and_registered_before_launch, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) and pc.certified then 1 else 0 end) ncertified_and_registered_during_course, FROM [{dataset}.{person_course}] as pc left join ( # -------------------- # get course launch and wrap dates from course_metainfo SELECT AA.course_id as course_id, AA.wrap_date as wrap_date, AA.launch_date as launch_date, BB.ewrap_date as ewrap_date, FROM ( # inner get course launch and wrap dates from course_metainfo 
SELECT A.course_id as course_id, A.wrap_date as wrap_date, B.launch_date as launch_date, from ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' ) ) as A left outer join ( SELECT course_id, TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month, regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day, regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year, FROM [{dataset}.course_metainfo] where key='listings_Course Launch' ) ) as B on A.course_id = B.course_id # end inner course_metainfo subquery ) as AA left outer join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Empirical Course Wrap' ) ) as BB on AA.course_id = BB.course_id # end course_metainfo subquery # -------------------- ) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id # ---- end get aggregate data ) order by course_id """.format(dataset=dataset, person_course=latest_person_course) print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration' sys.stdout.flush() cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, newer_than=datetime.datetime( 2015, 1, 16, 3, 0), key={'name': 'course_id'}) # start assembling course_summary_stats c_sum_stats = defaultdict(OrderedDict) for entry in bsbc['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] cmci.update(entry) cnbw = nr_by_wrap['data_by_key'][course_id] nbw = int(cnbw['nregistered_by_wrap']) cmci['nbw_wrap_date'] = cnbw['wrap_date'] cmci['nregistered_by_wrap'] = nbw cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct'] cmci['frac_female'] = float(entry['n_female_viewed']) / (float( entry['n_male_viewed']) + float(entry['n_female_viewed'])) ncert = float(cmci['certified_sum']) if ncert: cmci[ 'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0 else: cmci['certified_of_nregistered_by_wrap_pct'] = None cbr = cert_by_reg['data_by_key'][course_id] for field, value in cbr.items(): cmci['cbr_%s' % field] = value # add medians for viewed, explored, and certified msbc_tables = { 'msbc_viewed': "viewed_median_stats_by_course", 'msbc_explored': 'explored_median_stats_by_course', 'msbc_certified': 'certified_median_stats_by_course', 'msbc_verified': 'verified_median_stats_by_course', } for prefix, mtab in msbc_tables.items(): print "--> Merging median stats data from %s" % mtab sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, mtab) for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): cmci['%s_%s' % (prefix, field)] = value # add time on task data tot_table = "time_on_task_stats_by_course" prefix = "ToT" print "--> Merging time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = 
{'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add serial time on task data tot_table = "time_on_task_serial_stats_by_course" prefix = "SToT" print "--> Merging serial time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add show_answer stats tot_table = "show_answer_stats_by_course" prefix = "SAS" print "--> Merging show_answer stats data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # setup list of keys, for CSV output css_keys = c_sum_stats.values()[0].keys() # retrieve course_metainfo table, pivot, add that to summary_stats print "--> Merging course_metainfo from %s" % table sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, table) listings_keys = map(make_key, [ "Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", "Course Number", "Short Title", "Andrew's Short Titles", "Title", "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id", "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade", '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)" ]) listings_keys.reverse() for lk in listings_keys: css_keys.insert(1, "listings_%s" % lk) COUNTS_TO_KEEP = [ 'discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript', 'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code', 'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation', 'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion' ] for entry in bqdat['data']: thekey = make_key(entry['key']) # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP: # continue if thekey.startswith( 'listings_') and thekey[9:] not in listings_keys: # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id']) continue c_sum_stats[entry['course_id']][thekey] = entry['value'] #if 'certifies' in thekey: # print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value']) if thekey not in css_keys: css_keys.append(thekey) # compute forum_posts_per_week for course_id, entry in c_sum_stats.items(): nfps = entry.get('nforum_posts_sum', 0) if nfps: fppw = int(nfps) / 
float(entry['nweeks']) entry['nforum_posts_per_week'] = fppw print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % ( course_id, entry['total_assessments_per_week'], fppw) else: entry['nforum_posts_per_week'] = None css_keys.append('nforum_posts_per_week') # read in listings file and merge that in also if listings_file: if listings_file.endswith('.csv'): listings = csv.DictReader(open(listings_file)) else: listings = [json.loads(x) for x in open(listings_file)] for entry in listings: course_id = entry['course_id'] if course_id not in c_sum_stats: continue cmci = c_sum_stats[course_id] for field, value in entry.items(): lkey = "listings_%s" % make_key(field) if not (lkey in cmci) or (not cmci[lkey]): cmci[lkey] = value print "Storing these fields: %s" % css_keys # get schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads( open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read()) schema_dict = {x['name']: x for x in the_schema} # write out CSV css_table = "course_summary_stats" ofn = "%s__%s.csv" % (dataset, css_table) ofn2 = "%s__%s.json" % (dataset, css_table) print "Writing data to %s and %s" % (ofn, ofn2) ofp = open(ofn, 'w') ofp2 = open(ofn2, 'w') dw = csv.DictWriter(ofp, fieldnames=css_keys) dw.writeheader() for cid, entry in c_sum_stats.items(): for ek in entry: if ek not in schema_dict: entry.pop(ek) # entry[ek] = str(entry[ek]) # coerce to be string ofp2.write(json.dumps(entry) + "\n") for key in css_keys: if key not in entry: entry[key] = None dw.writerow(entry) ofp.close() ofp2.close() # upload to bigquery # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ] if 1: gsfnp = gspath / dataset / (css_table + ".json") gsutil.upload_file_to_gs(ofn2, gsfnp) # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False, # format='csv', skiprows=1) bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False) return print "-" * 60 + " %s" % course_id # get nweeks from listings lfn = path(listings_file) if not lfn.exists(): print "[analyze_content] course listings file %s doesn't exist!" % lfn return data = None if listings_file.endswith('.json'): data_feed = map(json.loads, open(lfn)) else: data_feed = csv.DictReader(open(lfn)) for k in data_feed: if not 'course_id' in k: print "Strange course listings row, no course_id in %s" % k raise Exception("Missing course_id") if k['course_id'] == course_id: data = k break if not data: print "[analyze_content] no entry for %s found in course listings file %s!" % ( course_id, lfn) return def date_parse(field): (m, d, y) = map(int, data[field].split('/')) return datetime.datetime(y, m, d) launch = date_parse('Course Launch') wrap = date_parse('Course Wrap') ndays = (wrap - launch).days nweeks = ndays / 7.0 print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays) if pin_date: datedir = pin_date course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest and not pin_date) cfn = gsutil.path_from_course_id(course_id) xbfn = course_dir / ("xbundle_%s.xml" % cfn) if not xbfn.exists(): print "[analyze_content] cannot find xbundle file %s for %s!" % ( xbfn, course_id) if use_dataset_latest: # try looking in earlier directories for xbundle file import glob spath = course_dir / ("../*/xbundle_%s.xml" % cfn) files = list(glob.glob(spath)) if files: xbfn = path(files[-1]) if not xbfn.exists(): print " --> also cannot find any %s ; aborting!" 
% spath else: print " --> Found and using instead: %s " % xbfn if not xbfn.exists(): raise Exception("[analyze_content] missing xbundle file %s" % xbfn) # if there is an xbundle*.fixed file, use that instead of the normal one if os.path.exists(str(xbfn) + ".fixed"): xbfn = path(str(xbfn) + ".fixed") print "[analyze_content] For %s using %s" % (course_id, xbfn) # get module usage data mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest) xml = etree.parse(open(xbfn)).getroot() counts = defaultdict(int) nexcluded = defaultdict(int) IGNORE = [ 'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title' ] problem_stats = defaultdict(int) def does_problem_have_random_script(problem): ''' return 1 if problem has a script with "random." in it else return 0 ''' for elem in problem.findall('.//script'): if elem.text and ('random.' in elem.text): return 1 return 0 # walk through xbundle def walk_tree(elem, policy=None): ''' Walk XML tree recursively. elem = current element policy = dict of attributes for children to inherit, with fields like due, graded, showanswer ''' policy = policy or {} if type(elem.tag) == str and (elem.tag.lower() not in IGNORE): counts[elem.tag.lower()] += 1 if elem.tag in [ "sequential", "problem", "problemset", "course", "chapter" ]: # very old courses may use inheritance from course & chapter keys = ["due", "graded", "format", "showanswer", "start"] for k in keys: # copy inheritable attributes, if they are specified val = elem.get(k) if val: policy[k] = val if elem.tag == "problem": # accumulate statistics about problems: how many have show_answer = [past_due, closed] ? have random. in script? problem_stats['n_capa_problems'] += 1 if policy.get('showanswer'): problem_stats["n_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118 # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed. 
problem_stats[ 'n_random_script'] += does_problem_have_random_script(elem) if policy.get('graded') == 'true' or policy.get( 'graded') == 'True': problem_stats['n_capa_problems_graded'] += 1 problem_stats[ 'n_graded_random_script'] += does_problem_have_random_script( elem) if policy.get('showanswer'): problem_stats["n_graded_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_graded_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) for k in elem: midfrag = (k.tag, k.get('url_name_orig', None)) if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20: nexcluded[k.tag] += 1 if verbose: try: print " -> excluding %s (%s), ncount=%s" % ( k.get('display_name', '<no_display_name>').encode('utf8'), midfrag, mudata.get(midfrag, {}).get('ncount')) except Exception as err: print " -> excluding ", k continue walk_tree(k, policy.copy()) walk_tree(xml) print "--> Count of individual element tags throughout XML: ", counts print "--> problem_stats:", json.dumps(problem_stats, indent=4) # combine some into "qual_axis" and others into "quant_axis" qual_axis = [ 'openassessment', 'optionresponse', 'multiplechoiceresponse', # 'discussion', 'choiceresponse', 'word_cloud', 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti' ] quant_axis = [ 'formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse', 'imageresponse' ] nqual = 0 nquant = 0 for tag, count in counts.items(): if tag in qual_axis: nqual += count if tag in quant_axis: nquant += count print "nqual=%d, nquant=%d" % (nqual, nquant) nqual_per_week = nqual / nweeks nquant_per_week = nquant / nweeks total_per_week = nqual_per_week + nquant_per_week print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % ( nqual_per_week, nquant_per_week, total_per_week) # save this overall data in CCDATA lock_file(CCDATA) ccdfn = path(CCDATA) ccd = {} if ccdfn.exists(): for k in csv.DictReader(open(ccdfn)): ccd[k['course_id']] = k ccd[course_id] = { 'course_id': course_id, 'nweeks': nweeks, 'nqual_per_week': nqual_per_week, 'nquant_per_week': nquant_per_week, 'total_assessments_per_week': total_per_week, } # fields = ccd[ccd.keys()[0]].keys() fields = [ 'course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks' ] cfp = open(ccdfn, 'w') dw = csv.DictWriter(cfp, fieldnames=fields) dw.writeheader() for cid, entry in ccd.items(): dw.writerow(entry) cfp.close() lock_file(CCDATA, release=True) # store data in course_metainfo table, which has one (course_id, key, value) on each line # keys include nweeks, nqual, nquant, count_* for module types * cmfields = OrderedDict() cmfields['course_id'] = course_id cmfields['course_length_days'] = str(ndays) cmfields.update( {make_key('listings_%s' % key): value for key, value in data.items()}) # from course listings cmfields.update(ccd[course_id].copy()) # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() }) # from content counts cmfields['filename_xbundle'] = xbfn cmfields['filename_listings'] = lfn for key in sorted( counts ): # store counts in sorted order, so that the later generated CSV file can have a predictable structure value = counts[key] cmfields['count_%s' % key] = str(value) # from content counts for key in sorted(problem_stats): # store problem stats value = problem_stats[key] cmfields['problem_stat_%s' % key] = str(value) cmfields.update({('nexcluded_sub_20_%s' % key): str(value) for key, value in nexcluded.items() }) # 
from content counts course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / CMINFO # manual overriding of the automatically computed fields can be done by storing course_id,key,value data # in the CMINFO_OVERRIDES file csvfn_overrides = course_dir / CMINFO_OVERRIDES if csvfn_overrides.exists(): print "--> Loading manual override information from %s" % csvfn_overrides for ovent in csv.DictReader(open(csvfn_overrides)): if not ovent['course_id'] == course_id: print "===> ERROR! override file has entry with wrong course_id: %s" % ovent continue print " overriding key=%s with value=%s" % (ovent['key'], ovent['value']) cmfields[ovent['key']] = ovent['value'] print "--> Course metainfo writing to %s" % csvfn fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value']) cdw.writeheader() for k, v in cmfields.items(): cdw.writerow({'course_id': course_id, 'key': k, 'value': v}) fp.close() # build and output course_listings_and_metainfo dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) mypath = os.path.dirname(os.path.realpath(__file__)) clm_table = "course_listing_and_metainfo" clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table) clm_schema = json.loads(open(clm_schema_file).read()) clm = {} for finfo in clm_schema: field = finfo['name'] clm[field] = cmfields.get(field) clm_fnb = clm_table + ".json" clm_fn = course_dir / clm_fnb open(clm_fn, 'w').write(json.dumps(clm)) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / clm_fnb print "--> Course listing + metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, clm_table) sys.stdout.flush() gsutil.upload_file_to_gs(clm_fn, gsfnp) bqutil.load_data_to_table(dataset, clm_table, gsfnp, clm_schema, wait=True, verbose=False) # output course_metainfo table = 'course_metainfo' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / CMINFO print "--> Course metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, table) sys.stdout.flush() gsutil.upload_file_to_gs(csvfn, gsfnp) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())[table] bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
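# Sketch of the long-format layout written to course_metainfo above: one
# (course_id, key, value) row per fact. The summary-stats stage later pivots these rows
# back into one wide record per course; pivot_metainfo_rows() shows that inverse step
# with plain dicts, for illustration only.
import csv
from collections import OrderedDict

def write_metainfo_csv(csvfn, course_id, cmfields):
    fp = open(csvfn, 'w')
    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    for key, value in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': key, 'value': value})
    fp.close()

def pivot_metainfo_rows(rows):
    '''rows: iterable of {'course_id', 'key', 'value'} dicts -> {course_id: {key: value}}.'''
    out = OrderedDict()
    for row in rows:
        out.setdefault(row['course_id'], OrderedDict())[row['key']] = row['value']
    return out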
def AnalyzeIDV(course_id, force_recompute=False, use_dataset_latest=False): tablename = "idv_analysis" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) print "="*77 print "Creating %s.%s table for %s" % (dataset, tablename, course_id) print "-"*77 org = course_id.split('/',1)[0] dataset_cr = ('course_report_%s' % org) if use_dataset_latest: dataset_cr = 'course_report_latest' pcv = "person_course_viewed" try: tinfo = bqutil.get_bq_table_info(dataset_cr, "person_course_viewed") assert tinfo is not None except Exception as err: print " --> missing %s.%s ; using dummy instead" % (dataset_cr, pcv) sys.stdout.flush() dataset_cr = dataset pcv = "person_course" the_sql = """ # IDV and non-IDV enrollee engagement at point of last IDV enrollment, before course end, including # IDV and certificates in other courses SELECT "{course_id}" as course_id, OC.user_id as user_id, OC.username as username, (OC.verified_enroll_time is not NULL) as is_idv, sum(case when PCI.verified_enroll_time is not NULL then 1 else 0 end) as n_other_idv, sum(case when PCI.verified_enroll_time is not NULL and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_idv, sum(case when PCI.certified and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_certified, sum(case when PCI.viewed and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_participated, sum(case when (PCI.verified_enroll_time is not NULL and PCI.start_time > OC.start_time and PCI.certified) then 1 else 0 end) as n_previous_idv_certified, first(gender) as gender, first(YoB) as YoB, first(LoE) as LoE, OC.n_problem_records as n_problem_records, OC.n_correct as n_correct, OC.n_incorrect as n_incorrect, OC.total_problem_points as total_problem_points, OC.verified_enroll_time as verified_enroll_time, OC.verified_unenroll_time as verified_unenroll_time, OC.verified_enroll_date as verified_enroll_date, OC.verified_unenroll_date as verified_unenroll_date, OC.nforum_pinned as nforum_pinned, OC.is_forum_moderator as is_forum_moderator, OC.final_course_grade as final_course_grade, OC.earned_certificate as earned_certificate, OC.n_show_answer as n_show_answer, OC.nprogcheck as nprogcheck, OC.nvideo as nvideo, OC.nforum_reads as nforum_reads, OC.nforum_posts as nforum_posts, OC.hours_on_system as hours_on_system, OC.countryLabel as countryLabel, OC.start_time as start_time, FROM ( # engagement stats for NON verified ID versus verified ID, as of the date of the last IDV signup SELECT * FROM ( # stats for NON verified ID, as of the date of the last IDV signup SELECT PAC.user_id as user_id, PAC.username as username, PAC.n_problem_records as n_problem_records, PAC.n_correct as n_correct, PAC.n_incorrect as n_incorrect, PAC.total_problem_points as total_problem_points, PAC.verified_enroll_time as verified_enroll_time, PAC.verified_unenroll_time as verified_unenroll_time, DATE(PAC.verified_enroll_time) as verified_enroll_date, DATE(PAC.verified_unenroll_time) as verified_unenroll_date, PAC.nforum_pinned as nforum_pinned, PAC.is_forum_moderator as is_forum_moderator, PAC.final_course_grade as final_course_grade, PAC.earned_certificate as earned_certificate, PAC.countryLabel as countryLabel, PAC.start_time as start_time, sum(PCD.nshow_answer) as n_show_answer, sum(PCD.nprogcheck) as nprogcheck, sum(PCD.nvideo) as nvideo, sum(PCD.nforum_reads) as nforum_reads, sum(PCD.nforum_posts) as nforum_posts, sum(PCD.sum_dt / 60 / 60) as hours_on_system, FROM ( # get problem grade and activity counts up to date of verified 
ID enrollment SELECT PA.user_id as user_id, PC.username as username, count(*) as n_problem_records, sum(case when PA.item.correct_bool then 1 else 0 end) as n_correct, sum(case when PA.item.correct_bool==False then 1 else 0 end) as n_incorrect, sum(PA.grade) as total_problem_points, PC.verified_enroll_time as verified_enroll_time, PC.verified_unenroll_time as verified_unenroll_time, PC.nforum_pinned as nforum_pinned, PC.forumRoles_isModerator as is_forum_moderator, PC.grade as final_course_grade, PC.certified as earned_certificate, PC.countryLabel as countryLabel, PC.start_time as start_time, max_verified_enroll_time, FROM [{dataset}.problem_analysis] PA JOIN ( SELECT user_id, username, verified_enroll_time, verified_unenroll_time, nforum_pinned, forumRoles_isModerator, grade, certified, max_verified_enroll_time, countryLabel, start_time FROM [{dataset}.person_course] PC CROSS JOIN ( SELECT max(verified_enroll_time) as max_verified_enroll_time FROM [{dataset}.person_course] ) VET where viewed ) PC ON PA.user_id = PC.user_id where PA.created <= PC.max_verified_enroll_time and PC.verified_enroll_time is null group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate, verified_unenroll_time, max_verified_enroll_time, countryLabel, start_time order by user_id ) PAC JOIN [{dataset}.person_course_day] PCD ON PAC.username = PCD.username WHERE PCD.date < DATE(max_verified_enroll_time) group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate, verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator, verified_enroll_date, verified_unenroll_date, countryLabel, start_time order by user_id ), ( # stats for those who DID enroll verified ID, as of the date of their IDV enrollment # include nprogcheck, nshow_answer, nproblem_check, nvideo, hours_on_system SELECT PAC.user_id as user_id, PAC.username as username, PAC.n_problem_records as n_problem_records, PAC.n_correct as n_correct, PAC.n_incorrect as n_incorrect, PAC.total_problem_points as total_problem_points, PAC.verified_enroll_time as verified_enroll_time, PAC.verified_unenroll_time as verified_unenroll_time, DATE(PAC.verified_enroll_time) as verified_enroll_date, DATE(PAC.verified_unenroll_time) as verified_unenroll_date, PAC.nforum_pinned as nforum_pinned, PAC.is_forum_moderator as is_forum_moderator, PAC.final_course_grade as final_course_grade, PAC.earned_certificate as earned_certificate, PAC.countryLabel as countryLabel, PAC.start_time as start_time, sum(PCD.nshow_answer) as n_show_answer, sum(PCD.nprogcheck) as nprogcheck, sum(PCD.nvideo) as nvideo, sum(PCD.nforum_reads) as nforum_reads, sum(PCD.nforum_posts) as nforum_posts, sum(PCD.sum_dt / 60 / 60) as hours_on_system, FROM ( # get problem grade and activity counts up to date of verified ID enrollment SELECT PA.user_id as user_id, PC.username as username, count(*) as n_problem_records, sum(case when PA.item.correct_bool then 1 else 0 end) as n_correct, sum(case when PA.item.correct_bool==False then 1 else 0 end) as n_incorrect, sum(PA.grade) as total_problem_points, PC.verified_enroll_time as verified_enroll_time, PC.verified_unenroll_time as verified_unenroll_time, PC.nforum_pinned as nforum_pinned, PC.forumRoles_isModerator as is_forum_moderator, PC.grade as final_course_grade, PC.certified as earned_certificate, PC.countryLabel as countryLabel, PC.start_time as start_time, FROM 
[{dataset}.problem_analysis] PA JOIN [{dataset}.person_course] PC ON PA.user_id = PC.user_id where PA.created <= PC.verified_enroll_time group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate, verified_unenroll_time, countryLabel, start_time order by user_id ) PAC JOIN [{dataset}.person_course_day] PCD ON PAC.username = PCD.username WHERE PCD.date < DATE(PAC.verified_enroll_time) group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate, verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator, verified_enroll_date, verified_unenroll_date, countryLabel, start_time order by user_id ) order by verified_enroll_date, user_id ) OC LEFT JOIN [{dataset_cr}.{pcv}] PCI on OC.user_id = PCI.user_id #where (PCI.verified_enroll_time is null) or (PCI.verified_enroll_time <= OC.verified_enroll_time) group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate, verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator, verified_enroll_date, verified_unenroll_date, n_show_answer, nprogcheck, nvideo, nforum_reads, nforum_posts, hours_on_system, countryLabel, start_time, is_idv order by verified_enroll_date, user_id """ the_sql = the_sql.format(dataset=dataset, dataset_cr=dataset_cr, pcv=pcv, course_id=course_id) try: bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, force_query=force_recompute, depends_on=["%s.problem_course" % dataset, "%s.person_course_day" % dataset, "%s.problem_analysis" % dataset], allowLargeResults=True, startIndex=-2) except Exception as err: print "ERROR! Failed on SQL=" print the_sql raise print " --> created %s.%s" % (dataset, tablename) sys.stdout.flush()
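# Illustrative restatement of the activity cutoff applied in the SQL above: non-IDV users'
# problem activity is counted only up to the course's last verified-ID enrollment time,
# while IDV users' activity is counted up to their own verified enrollment time.
def activity_cutoff(verified_enroll_time, max_verified_enroll_time):
    if verified_enroll_time is not None:
        return verified_enroll_time
    return max_verified_enroll_time

def count_problem_records_before(problem_rows, cutoff):
    '''problem_rows: [{'created': datetime, ...}]; count rows created on or before cutoff.'''
    return sum(1 for row in problem_rows if row['created'] <= cutoff)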
def CreateForumPerson(course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None, has_hash_limit=False, hash_limit=HASH): ''' Create Forum Person table, based on forum events and forum posts tables. This table contains both read and writes for all forum posts, for all users. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_FORUM_PERSON original_the_sql = """ SELECT (case when PP.username_fp is not null then PP.username_fp else FE.username_fe end) as username, "{course_id}" as course_id, (case when PP.username_fp is not null then PP.slug_id else FE.slug_id end) as slug_id, (case when PP.username_fp is not null then PP.slug_type else FE.slug_type end) as slug_type, (case when PP.username_fp is not null then PP.thread_id else FE.thread_id end) as thread_id, (case when PP.username_fp is not null then PP.parent_id else FE.parent_id end) as parent_id, (case when PP.original_poster is not null then PP.original_poster else FE.original_poster end) as original_poster, (case when PP.responded_to is not null then PP.responded_to else FE.responded_to end) as responded_to, (case when PP.username_fp is not null then PP.title else FE.title end) as title, (case when PP.username_fp is not null then PP.wrote else 0 end) as wrote, FE.read as read, FE.pin as pinned, FE.upvote as upvoted, FE.unvote as unvoted, #FE.del as deleted, FE.follow as followed, (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) <= TIMESTAMP(FE.last_time)) then TIMESTAMP(PP.first_time) else (case when PP.first_time is not null and FE.last_time is null then TIMESTAMP(PP.first_time) else (case when FE.first_time is not null then TIMESTAMP(FE.first_time) else FE.last_time end) end) end) as first_time, (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) >= TIMESTAMP(FE.last_time)) then TIMESTAMP(PP.first_time) else (case when PP.first_time is not null and FE.last_time is null then TIMESTAMP(PP.first_time) else (case when FE.last_time is not null then TIMESTAMP(FE.last_time) else FE.first_time end) end) end) as last_time, FROM ( # Find 1st level posting => "original_post" SELECT username as username_fp, slug_id, slug_type, thread_id, parent_id, original_poster, responded_to, title, 1 as wrote, #created_at as first_time, first_time FROM [{dataset}.{forum_posts}] {hash_limit_where} ORDER by username_fp, first_time ) PP FULL OUTER JOIN EACH ( SELECT username as username_fe, MIN(TIMESTAMP(time)) as first_time, MAX(TIMESTAMP(time)) as last_time, slug_id, FE.thread_id as thread_id, FIRST(parent_id) as parent_id, F.slug_type as slug_type, F.original_poster as original_poster, F.responded_to as responded_to, F.title as title, #1 as read, sum(case when forum_action = "read" or forum_action = "read_inline" then 1 else 0 end) as read, sum(case when forum_action = "pin" then 1 else 0 end) as pin, sum(case when forum_action = "upvote" then 1 else 0 end) as upvote, sum(case when forum_action = "unvote" then 1 else 0 end) as unvote, #sum(case when forum_action = "delete" then 1 else 0 end) as del, sum(case when forum_action = "follow_thread" then 1 else 0 end) as follow, FROM [{dataset}.{forum_events}] FE JOIN EACH ( SELECT username as username_fe, slug_id, slug_type, thread_id, parent_id, original_poster, responded_to, title, first_time, FROM [{dataset}.{forum_posts}] {hash_limit_where} ) as F ON F.thread_id = FE.thread_id WHERE ((FE.forum_action = "read") or (FE.forum_action = 
"read_inline") or (FE.forum_action = "pin") or (FE.forum_action = "upvote") or (FE.forum_action = "unvote") or #(FE.forum_action = "delete") or (FE.forum_action = "follow_thread")) {hash_limit_and} GROUP BY username_fe, slug_id, thread_id, slug_type, original_poster, responded_to, title ) as FE ON (PP.username_fp = FE.username_fe) AND (PP.slug_id = FE.slug_id) WHERE (PP.username_fp is not null and PP.username_fp != '') or (FE.username_fe is not null and FE.username_fe != '') """ the_sql = original_the_sql.format(dataset=dataset, course_id=course_id, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and='', hash_limit_where='') print "[make_forum_analysis] Creating %s.%s table for %s" % ( dataset, TABLE_FORUM_PERSON, course_id) sys.stdout.flush() try: tinfo_fe = bqutil.get_bq_table_info(dataset, TABLE_FORUM_EVENTS) trows_fe = int(tinfo_fe['numRows']) print "[make_forum_analysis] %s Forum Events found " % trows_fe tinfo_fp = bqutil.get_bq_table_info(dataset, TABLE_FORUM_POSTS) trows_fp = int(tinfo_fp['numRows']) print "[make_forum_analysis] %s Forum Posts found " % trows_fp assert tinfo_fe is not None and trows_fe != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_PERSON, TABLE_FORUM_EVENTS) assert tinfo_fp is not None and trows_fp != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_PERSON, TABLE_FORUM_POSTS) except (AssertionError, Exception) as err: print " --> Err: missing %s.%s and/or %s (including 0 rows in table)? Skipping creation of %s" % ( dataset, TABLE_FORUM_POSTS, TABLE_FORUM_EVENTS, TABLE_FORUM_PERSON) sys.stdout.flush() return # Now try to create table try: if has_hash_limit: overwrite = True hash_limit = int(hash_limit) for k in range(hash_limit): hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % ( hash_limit, k) hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % ( hash_limit, k) retry_the_sql = original_the_sql.format( dataset=dataset, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where) print "[make_forum_analysis] Retrying with this query...", retry_the_sql sys.stdout.flush() bqutil.create_bq_table(dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True) overwrite = "append" else: overwrite = True bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True) except Exception as err: has_hash_limit = True if ((('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit): # 'Resources exceeded during query execution' # try using hash limit on username # e.g. WHERE ABS(HASH(username)) % 4 = 0 print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... 
using hash of %s' % hash_limit try: for k in range(hash_limit): hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % ( hash_limit, k) hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % ( hash_limit, k) retry_the_sql = original_the_sql.format( dataset=dataset, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where) print "[make_forum_analysis] Retrying with this query...", retry_the_sql sys.stdout.flush() bqutil.create_bq_table(dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True) overwrite = "append" except Exception as err: if ((('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit): hash_limit = int(hash_limit * 2.0) print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit CreateForumPerson(course_id, force_recompute, use_dataset_latest, skip_last_day, end_date, has_hash_limit=True, hash_limit=hash_limit) else: print '[make_forum_analysis] An error occurred with this query: %s' % the_sql raise else: print '[make_forum_analysis] An error occurred with this query: %s' % the_sql raise print "Done with Forum Person for %s (end %s)" % (course_id, datetime.datetime.now()) print "=" * 77 sys.stdout.flush() return
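# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): CreateForumPerson above works
# around "response too large" / "resources exceeded" errors by re-running the
# same query in hash_limit shards, filtering rows on ABS(HASH(username)) % K = k
# and appending each shard to the output table.  The helper below restates that
# pattern with a hypothetical run_query(sql, overwrite) callable and a template
# assumed to expose only the {hash_limit_where} / {hash_limit_and} placeholders.
def sketch_run_query_in_hash_shards(sql_template, run_query, hash_limit=4):
    """Run sql_template in hash_limit shards: overwrite on the first shard,
    append on the rest (mirrors the retry loop in CreateForumPerson)."""
    overwrite = True
    for k in range(hash_limit):
        shard_sql = sql_template.format(
            hash_limit_where="WHERE ABS(HASH(username)) %% %d = %d" % (hash_limit, k),
            hash_limit_and="and ABS(HASH(username)) %% %d = %d" % (hash_limit, k),
        )
        run_query(shard_sql, overwrite=overwrite)
        overwrite = "append"   # subsequent shards append to the same table
# ----------------------------------------------------------------------------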
def create_course_item_table(course_id, force_recompute=False, use_dataset_latest=False): ''' the course_item dataset has these columns: Field Name Type Example Description item_id string i4x-MITx-8_MReV-problem-CheckPoint_1_Newton_s_First_Law_2_1 Unique ID for an assessment item (constructed using the problem module_id, and linked to problem_analysis table keys) problem_id string CheckPoint_1_Newton_s_First_Law Unique ID for an assessment problem (constructed using problem url_name) problem_nid integer 27 unique problem numerical id (equal to the sequential count of problems up to this one) assignment_short_id string HW_4 Unique short ID for assignment, using assignment short name + "_" + assignment_seq_num (should be same as what shows up in user's edX platform progress page) item_weight float 6.59E-05 Fraction of overall grade (between 0 and 1) contributed by this item n_user_responses integer 4868 Number of users who provided a response to this assessment item problem_name string CheckPoint 1: Newton's First Law Name of problem within which this item exists chapter_name string Chapter 1 Name of chapter within which the problem exists section_name string Section 1 Name of section (aka sequential) within which the problem exists assignment_id string Checkpoint_ch3 Unique ID for the assignment within which the problem exists n_problems_in_assignment integer 23 Number of problems within the assignment assignment_type string Checkpoint The assignment type within which the assignment exists assignment_type_weight float 0.1 Fraction of the overall grade contributed by the assignment type n_assignments_of_type integer 11 Number of assignments of this type assignment_seq_num integer 3 Sequential number of the assignment_type within the course chapter_number integer 3 Number of the chapter within which the problem exists section_number integer 3 Number of the section (aka sequential) within which the problem exists content_index integer 141 Index number of the problem within the content course axis problem_weight integer 1 Weight of the problem within the assignment item_points_possible float 1 Always 1 (used for debugging - number of points assigned to an item) problem_points_possible integer 6 Always equal to the number of items in the assignment (used for debugging) emperical_item_points_possible integer 1 Emperical value of point value of item, based on user data in problem_analysis table (for debugging) emperical_problem_points_possible integer 6 Emperical value of maximum number of points possible for problem based on problem_analysis (for debugging) item_number integer 1 Number of the item, within the problem (in order of presentation, starting from 1) n_items integer 6 Number of items within the problem start_date date 2013-06-01 00:01:00 UTC Date when problem was issued due_date date 2013-06-23 23:59:00 UTC Date when problem was due problem_path string /Unit_1/Newtons_First_Law/2/1 Path of problem within course content, specifying chapter and sequential problem_short_id string HW_7__3 short (and unique) problem ID, made using assignment short ID + "__" + problem number item_short_id string HW_7__3_1 short (and unique) item ID, made using problem short ID + "_" + item number item_nid integer 41 unique item numerical id (equal to the row number of this entry in the course_itm table) cumulative_item_weight float 6.59E-05 Cumulative fraction of item weights (for debugging: should increase to 1.0 by the end of table) is_split boolean False Boolean flag indicating if this item was within an 
A/B split_test or not split_name string CircMotionAB Name of the split_test within which this item is placed, if is_split is True ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) tablename = "course_item" the_sql = """ SELECT # '{course_id}' as course_id, *, CONCAT(assignment_short_id, "__", STRING(problem_number)) as problem_short_id, CONCAT(assignment_short_id, "__", STRING(problem_number), "_", STRING(item_number)) as item_short_id, row_number() over (order by content_index, item_number) as item_nid, sum(item_weight) over (order by content_index, item_number) cumulative_item_weight FROM ( # items with additional data about fraction_of_overall_grade from grading_policy SELECT item_id, problem_id, max(if(item_number=1, x_item_nid, null)) over (partition by problem_id) as problem_nid, CONCAT(GP.short_label, "_", STRING(assignment_seq_num)) as assignment_short_id, (problem_weight * GP.fraction_of_overall_grade / n_items / sum_problem_weight_in_assignment / n_assignments_of_type) as item_weight, n_user_responses, chapter_name, section_name, vertical_name, problem_name, CI.assignment_id as assignment_id, n_problems_in_assignment, CI.assignment_type as assignment_type, GP.fraction_of_overall_grade as assignment_type_weight, n_assignments_of_type, assignment_seq_num, chapter_number, content_index, section_number, problem_number, problem_weight, item_points_possible, problem_points_possible, emperical_item_points_possible, emperical_problem_points_possible, item_number, n_items, start_date, due_date, is_split, split_name, problem_path, FROM ( # items with number of problems per assignment SELECT item_id, item_number, n_items, problem_id, row_number() over (partition by item_number order by content_index) as x_item_nid, n_user_responses, chapter_name, section_name, vertical_name, problem_name, assignment_id, sum(if(assignment_id is not null and item_number=1, 1, 0)) over (partition by assignment_id) n_problems_in_assignment, sum(if(assignment_id is not null and item_number=1, problem_weight, 0)) over (partition by assignment_id) sum_problem_weight_in_assignment, assignment_type, n_assignments_of_type, assignment_seq_num, chapter_number, section_number, problem_number, problem_path, content_index, start_date, due_date, is_split, split_name, problem_weight, item_points_possible, problem_points_possible, emperical_item_points_possible, emperical_problem_points_possible, FROM ( # items from problem_analysis with metadata from course_axis SELECT item_id, item_number, n_items, problem_id, n_user_responses, CA.name as problem_name, chapter_name, section_name, vertical_name, assignment_id, assignment_type, n_assignments_of_type, CA.assignment_seq_num as assignment_seq_num, CA.chapter_number as chapter_number, CA.section_number as section_number, CA.problem_number as problem_number, CA.path as problem_path, CA.index as content_index, CA.start as start_date, CA.due as due_date, CA.is_split as is_split, CA.split_name as split_name, if(CA.weight is null, 1.0, CA.weight) as problem_weight, item_points_possible, problem_points_possible, emperical_item_points_possible, emperical_problem_points_possible, FROM ( # get items with item metadata from problem_analysis table SELECT item_id, item_number, n_items, problem_id, n_user_responses, 1.0 as item_points_possible, 1.0 * n_items as problem_points_possible, problem_points_possible / n_items as emperical_item_points_possible, problem_points_possible as emperical_problem_points_possible, FROM ( SELECT item_id, item_number, 
max(item_number) over (partition by problem_id) n_items, problem_id, problem_points_possible, n_user_responses, FROM ( SELECT item_id, row_number() over (partition by problem_id order by item_id) item_number, problem_id, problem_points_possible, n_user_responses, FROM ( SELECT item.answer_id as item_id, problem_url_name as problem_id, max_grade as problem_points_possible, count(*) as n_user_responses, FROM [{dataset}.problem_analysis] group by item_id, problem_id, problem_points_possible having n_user_responses > 5 # minimum cutoff for an item to be included ) ) ) order by item_id, item_number ) as PA JOIN ( # -------------------------------------------------- graded problems from course axis # master table of graded problems from course_axis, with assignment metadata SELECT module_id, url_name, index, weight, assignment_type, MAX(IF(problem_number=1, x_assignment_seq_num, null)) over (partition by assignment_id) as assignment_seq_num, problem_number, assignment_id, n_assignments_of_type, chapter_name, section_name, vertical_name, name, path, start, due, is_split, split_name, chapter_number, section_number, FROM ( # course_axis with chapter number and number of assignments of type SELECT *, # add column with number of assignments of type SUM(IF(problem_number=1, 1, 0)) over (partition by assignment_type) n_assignments_of_type, row_number() over (partition by assignment_type, problem_number order by index) as x_assignment_seq_num, FROM ( # ---------------------------------------- course axis with vertical name SELECT module_id, url_name, index, weight, assignment_type, chapter_number, section_number, assignment_id, chapter_name, section_name, vertical_name, name, path, start, due, is_split, split_name, # add column with problem number within assignment_id row_number() over (partition by assignment_id order by index) problem_number, FROM ( # course axis of problems which have non-null grading_format, including chapter number # and section (aka sequential) number (within the chapter) SELECT CAI.module_id as module_id, CAI.url_name as url_name, index, weight, assignment_type, chapter_number, section_number, # assignment_id = assignment_type + ch_chapter_number + sec_section_number CONCAT(assignment_type, "_ch", STRING(chapter_number), "_sec", STRING(section_number)) as assignment_id, chapter_name, section_name, name, path, start, due, is_split, split_name, parent, FROM ( # course axis entries of things which have non-null grading format, with section_mid from path SELECT module_id, url_name, index, If(data.weight is null, 1.0, data.weight) as weight, gformat as assignment_type, chapter_mid as chapter_mid, REGEXP_EXTRACT(path, '^/[^/]+/([^/]+)') as section_mid, name, path, start, due, is_split, split_url_name as split_name, parent, FROM [{dataset}.course_axis] CAI where gformat is not null and category = "problem" order by index ) CAI LEFT JOIN # join course_axis with itself to get chapter_number and section_number ( # get chapters and sections (aka sequentials) with module_id, chapter_number, and section_number # each assignment is identified by assignment_type + chapter_number + section_number # note in some previous calculations, the section_number was left out by mistake # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/course_module.py#L1305 SELECT module_id, url_name, name as section_name, max(if(category="chapter", x_chapter_number, null)) over (partition by chapter_mid order by index) as chapter_number, section_number, chapter_name, FROM ( SELECT 
module_id, url_name, row_number() over (partition by category order by index) as x_chapter_number, row_number() over (partition by chapter_mid, category order by index) as section_number, FIRST_VALUE(name) over (partition by chapter_mid order by index) as chapter_name, index, category, name, if(category="chapter", module_id, chapter_mid) as chapter_mid, FROM [{dataset}.course_axis] where category = "chapter" or category = "sequential" or category = "videosequence" order by index ) order by index ) CHN # ON CAI.chapter_mid = CHN.chapter_mid # old, for assignments by chapter ON CAI.section_mid = CHN.url_name # correct way, for assignments by section (aka sequential) # where gformat is not null ) CAPN LEFT JOIN # join with course_axis to get names of verticals in which problems reside ( # get verticals SELECT url_name as vertical_url_name, name as vertical_name, FROM [{dataset}.course_axis] where category = "vertical" ) CAV ON CAPN.parent = CAV.vertical_url_name # ---------------------------------------- END course axis with vertical_name ) ) order by index # -------------------------------------------------- END graded problems from course axis ) CA ON PA.problem_id = CA.url_name ) ) CI LEFT JOIN [{dataset}.grading_policy] GP ON CI.assignment_type = GP.assignment_type order by content_index, item_number ) order by content_index, item_number """.format(dataset=dataset, course_id=course_id) depends_on = [ "%s.course_axis" % dataset, "%s.grading_policy" % dataset, "%s.problem_analysis" % dataset ] try: bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, newer_than=datetime.datetime(2015, 10, 31, 17, 00), depends_on=depends_on, force_query=force_recompute) except Exception as err: print "[make_course_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename) print the_sql raise if not bqdat: nfound = 0 else: nfound = len(bqdat['data']) print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound) sys.stdout.flush()
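# ----------------------------------------------------------------------------
# Illustrative sketch: how the short IDs documented in the course_item docstring
# are composed.  This is a plain-Python restatement of the CONCAT() expressions
# in the SQL above (assignment_short_id + "__" + problem_number, then + "_" +
# item_number); it is for clarity only and is not used by the query itself.
def sketch_make_short_ids(assignment_short_id, problem_number, item_number):
    """Return (problem_short_id, item_short_id), e.g. ("HW_7__3", "HW_7__3_1")."""
    problem_short_id = "%s__%d" % (assignment_short_id, problem_number)
    item_short_id = "%s_%d" % (problem_short_id, item_number)
    return problem_short_id, item_short_id
# ----------------------------------------------------------------------------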
def __init__(self, course_id_set, output_project_id=None, nskip=0, output_dataset_id=None, output_bucket=None, use_dataset_latest=False, only_step=None, end_date=None, ): ''' Compute course report tables, based on combination of all person_course and other individual course tables. only_step: specify a single course report step to be executed; runs all reports, if None ''' if only_step and ',' in only_step: only_step = only_step.split(',') self.only_step = only_step self.end_date = end_date; if not course_id_set: print "ERROR! Must specify list of course_id's for report. Aborting." return org = course_id_set[0].split('/',1)[0] # extract org from first course_id self.org = org self.output_project_id = output_project_id crname = ('course_report_%s' % org) if use_dataset_latest: crname = 'course_report_latest' self.dataset = output_dataset_id or crname self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket) self.course_id_set = course_id_set course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set] # check to see which datasets have person_course tables datasets_with_pc = [] self.all_pc_tables = OrderedDict() self.all_pcday_ip_counts_tables = OrderedDict() self.all_uic_tables = OrderedDict() self.all_tott_tables = OrderedDict() for cd in course_datasets: try: table = bqutil.get_bq_table_info(cd, 'person_course') except Exception as err: print "[make-course_report_tables] Err: %s" % str(err) table = None if table is not None: self.all_pc_tables[cd] = table datasets_with_pc.append(cd) try: table = bqutil.get_bq_table_info(cd, 'pcday_ip_counts') except Exception as err: table = None if table is not None: self.all_pcday_ip_counts_tables[cd] = table try: table = bqutil.get_bq_table_info(cd, 'user_info_combo') except Exception as err: table = None if table is not None: self.all_uic_tables[cd] = table try: table = bqutil.get_bq_table_info(cd, 'time_on_task_totals') except Exception as err: print "[make-course_report_tables] Err: %s" % str(err) table = None if table is not None: self.all_tott_tables[cd] = table pc_tables = ',\n'.join(['[%s.person_course]' % x for x in datasets_with_pc]) pcday_ip_counts_tables = ',\n'.join(['[%s.pcday_ip_counts]' % x for x in self.all_pcday_ip_counts_tables]) uic_tables = ',\n'.join(['[%s.user_info_combo]' % x for x in self.all_uic_tables]) tott_tables = ',\n'.join(['[%s.time_on_task_totals]' % x for x in self.all_tott_tables]) print "%d time_on_task tables: %s" % (len(self.all_tott_tables), tott_tables) sys.stdout.flush() # find latest combined person_course table cpc_tables = [ x for x in bqutil.get_list_of_table_ids(self.dataset) if x.startswith("person_course_") ] if cpc_tables: the_cpc_table = "[%s.%s]" % (self.dataset, max(cpc_tables)) else: the_cpc_table = None print "[make_course_report_tables] ==> Using %s as the latest combined person_course table" % the_cpc_table self.parameters = {'dataset': self.dataset, 'pc_tables': pc_tables, 'uic_tables': uic_tables, 'tott_tables': tott_tables, 'pcday_ip_counts_tables': pcday_ip_counts_tables, 'combined_person_course': the_cpc_table, } print "[make_course_report_tables] ==> Using these datasets (with person_course tables): %s" % datasets_with_pc self.course_datasets = course_datasets print "="*100 print "Generating course report tables -> dataset=%s, project=%s" % (self.dataset, self.output_project_id) sys.stdout.flush() bqutil.create_dataset_if_nonexistent(self.dataset, project_id=output_project_id) self.nskip = nskip if 1: 
self.combine_show_answer_stats_by_course() self.make_totals_by_course() self.make_medians_by_course() self.make_table_of_email_addresses() self.make_global_modal_ip_table() self.make_enrollment_by_day() self.make_time_on_task_stats_by_course() self.make_total_populations_by_course() self.make_table_of_n_courses_registered() self.make_geographic_distributions() # self.count_tracking_log_events() self.make_overall_totals() print "="*100 print "Done with course report tables" sys.stdout.flush()
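# ----------------------------------------------------------------------------
# Illustrative sketch: the report constructor above builds its query parameters
# by joining "[dataset.table]" references with commas (in BigQuery legacy SQL a
# comma-separated table list in FROM acts as a union of the tables).  A minimal
# restatement of that list construction, with hypothetical dataset names shown
# in the usage comment:
def sketch_union_table_list(datasets, table_name):
    """Return a legacy-SQL table list, e.g. '[ds1.person_course],\n[ds2.person_course]'."""
    return ',\n'.join('[%s.%s]' % (ds, table_name) for ds in datasets)

# Example (dataset names are made up):
#   sketch_union_table_list(['MITx__6_00x__2013_Spring', 'MITx__8_MReV__2014'],
#                           'person_course')
# ----------------------------------------------------------------------------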
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None, check_dates=True, skip_last_day=False): ''' Make {course_id}.person_course_day table for specified course_id. This is a single course-specific table, which contains all day's data. It is incrementally updated when new tracking logs data comes in, by appending rows to the end. The rows are kept in time order. check_dates is disregarded. If skip_last_day is True then do not include the last day of tracking log data in the processing. This is done to avoid processing partial data, e.g. when tracking log data are incrementally loaded with a delta of less than one day. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) videoTableExists = False try: tinfo_video = bqutil.get_bq_table_info(dataset, 'video_stats_day') assert tinfo_video is not None, "Video stats table missing... Not including video stats" videoTableExists = True except (AssertionError, Exception) as err: #print " --> Err: missing %s.%s? Skipping creation of chapter_grades" % (dataset, "course_axis") sys.stdout.flush() pass #return forumTableExists = False try: tinfo_forum = bqutil.get_bq_table_info(dataset, 'forum_events') assert tinfo_forum is not None, "Forum events table missing... Not including forum stats" forumTableExists = True except (AssertionError, Exception) as err: #print " --> Err: missing %s.%s? Skipping creation of chapter_grades" % (dataset, "course_axis") sys.stdout.flush() pass #return problemTableExists = False try: tinfo_personproblem = bqutil.get_bq_table_info(dataset, 'person_problem') tinfo_courseproblem = bqutil.get_bq_table_info(dataset, 'course_problem') tinfo_courseaxis = bqutil.get_bq_table_info(dataset, 'course_axis') tinfo_personcourse = bqutil.get_bq_table_info(dataset, 'person_course') # Check course axis and person course, course problem assert tinfo_personproblem is not None, "Person problem table missing... Not including problem stats" assert tinfo_courseproblem is not None, "Course problem table missing... Not including problem stats" assert tinfo_courseaxis is not None, "Course axis table missing... Not including problem stats" assert tinfo_personcourse is not None, "Person Course table missing... Not including problem stats" problemTableExists = True except (AssertionError, Exception) as err: #print " --> Err: missing %s.%s? 
Skipping creation of chapter_grades" % (dataset, "course_axis") sys.stdout.flush() pass PCDAY_SQL_BASE_SELECT = """ SELECT username, '{course_id}' AS course_id, DATE(time) AS date, SUM(bevent) AS nevents, SUM(bprogress) AS nprogcheck, SUM(bshow_answer) AS nshow_answer, SUM(bvideo) AS nvideo, SUM(bproblem_check) AS nproblem_check, SUM(bforum) AS nforum, SUM(bshow_transcript) AS ntranscript, SUM(bseq_goto) AS nseq_goto, SUM(bseek_video) AS nseek_video, SUM(bpause_video) AS npause_video, """ PCDAY_SQL_VIDEO_EXISTS = """ COUNT(DISTINCT video_id) AS nvideos_viewed, # New Video - Unique videos viewed SUM(case when position is not null then FLOAT(position) else FLOAT(0.0) end) AS nvideos_watched_sec, # New Video - # sec watched using max video position """ PCDAY_SQL_VIDEO_DNE = """ 0 AS nvideos_viewed, # New Video - Unique videos viewed FLOAT(0.0) AS nvideos_watched_sec, # New Video - # sec watched using max video position """ PCDAY_SQL_VIDEO_SELECT = PCDAY_SQL_VIDEO_EXISTS if videoTableExists else PCDAY_SQL_VIDEO_DNE PCDAY_SQL_FORUM_EXISTS = """ SUM(case when read is not null then read else 0 end) AS nforum_reads, # New discussion - Forum reads SUM(case when write is not null then write else 0 end) AS nforum_posts, # New discussion - Forum posts COUNT(DISTINCT thread_id ) AS nforum_threads, # New discussion - Unique forum threads interacted with """ PCDAY_SQL_FORUM_DNE = """ 0 AS nforum_reads, # New discussion - Forum reads 0 AS nforum_posts, # New discussion - Forum posts 0 AS nforum_threads, # New discussion - Unique forum threads interacted with """ PCDAY_SQL_FORUM_SELECT = PCDAY_SQL_FORUM_EXISTS if forumTableExists else PCDAY_SQL_FORUM_DNE PCDAY_SQL_PROBLEM_EXISTS = """ COUNT(DISTINCT problem_nid ) AS nproblems_answered, # New Problem - Unique problems attempted SUM(case when n_attempts is not null then n_attempts else 0 end) AS nproblems_attempted, # New Problem - Total attempts SUM(case when ncount_problem_multiplechoice is not null then ncount_problem_multiplechoice else 0 end) as nproblems_multiplechoice, SUM(case when ncount_problem_choice is not null then ncount_problem_choice else 0 end) as nproblems_choice, SUM(case when ncount_problem_numerical is not null then ncount_problem_numerical else 0 end) as nproblems_numerical, SUM(case when ncount_problem_option is not null then ncount_problem_option else 0 end) as nproblems_option, SUM(case when ncount_problem_custom is not null then ncount_problem_custom else 0 end) as nproblems_custom, SUM(case when ncount_problem_string is not null then ncount_problem_string else 0 end) as nproblems_string, SUM(case when ncount_problem_mixed is not null then ncount_problem_mixed else 0 end) as nproblems_mixed, SUM(case when ncount_problem_formula is not null then ncount_problem_formula else 0 end) as nproblems_forumula, SUM(case when ncount_problem_other is not null then ncount_problem_other else 0 end) as nproblems_other, """ PCDAY_SQL_PROBLEM_DNE = """ 0 AS nproblems_answered, # New Problem - Unique problems attempted 0 AS nproblems_attempted, # New Problem - Total attempts 0 AS nproblems_multiplechoice, 0 AS nproblems_choice, 0 AS nproblems_numerical, 0 AS nproblems_option, 0 AS nproblems_custom, 0 AS nproblems_string, 0 AS nproblems_mixed, 0 AS nproblems_forumula, 0 AS nproblems_other, """ PCDAY_SQL_PROBLEM_SELECT = PCDAY_SQL_PROBLEM_EXISTS if problemTableExists else PCDAY_SQL_PROBLEM_DNE PCDAY_SQL_MID = """ MAX(time) AS last_event, AVG( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - 
last_time)/1.0E6 END ) AS avg_dt, STDDEV( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sdv_dt, MAX( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS max_dt, COUNT( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS n_dt, SUM( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sum_dt FROM ( SELECT * FROM ( SELECT username, CASE WHEN event_type = "play_video" THEN 1 ELSE 0 END AS bvideo, CASE WHEN event_type = "problem_check" THEN 1 ELSE 0 END AS bproblem_check, CASE WHEN username != "" THEN 1 ELSE 0 END AS bevent, CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum, CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress, CASE WHEN event_type IN ("show_answer", "showanswer") THEN 1 ELSE 0 END AS bshow_answer, CASE WHEN event_type = 'show_transcript' THEN 1 ELSE 0 END AS bshow_transcript, CASE WHEN event_type = 'seq_goto' THEN 1 ELSE 0 END AS bseq_goto, CASE WHEN event_type = 'seek_video' THEN 1 ELSE 0 END AS bseek_video, CASE WHEN event_type = 'pause_video' THEN 1 ELSE 0 END AS bpause_video, # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll, # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll time, LAG(time, 1) OVER (PARTITION BY username ORDER BY time) last_time FROM {DATASETS} WHERE NOT event_type CONTAINS "/xblock/" AND username != "" ) """ PCDAY_SQL_VIDEO = """ , ( # Video events SELECT TIMESTAMP(date) as time, '{course_id}' as course_id, username, video_id, position, FROM [{dataset}.video_stats_day] WHERE TIMESTAMP(date)>= TIMESTAMP("{min_date_start}") and TIMESTAMP(date) <= TIMESTAMP("{max_date_end}") ) """ PCDAY_SQL_ADD = PCDAY_SQL_VIDEO if videoTableExists else '' PCDAY_SQL_FORUM = """ , ( # Forum Events SELECT time, username, '{course_id}' as course_id, thread_id, (CASE WHEN (forum_action == "reply" or forum_action == "comment_reply" or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment") THEN 1 ELSE 0 END) AS write, (CASE WHEN (forum_action == "read" or forum_action == "read_inline") THEN 1 ELSE 0 END) AS read, FROM [{dataset}.forum_events] WHERE (forum_action == "reply" or forum_action == "comment_reply" or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment" or forum_action == "read" or forum_action == "read_inline") and ( time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") ) ) """ PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_FORUM if forumTableExists else PCDAY_SQL_ADD PCDAY_SQL_PROBLEM = """, ( # Problems SELECT pc.username AS username, pp.problem_nid AS problem_nid, pp.n_attempts AS n_attempts, pp.time AS time, '{course_id}' as course_id, pp.ncount_problem_multiplechoice as ncount_problem_multiplechoice, pp.ncount_problem_choice as ncount_problem_choice, pp.ncount_problem_numerical as ncount_problem_numerical, pp.ncount_problem_option as ncount_problem_option, pp.ncount_problem_custom as ncount_problem_custom, pp.ncount_problem_string as ncount_problem_string, pp.ncount_problem_mixed as ncount_problem_mixed, pp.ncount_problem_formula as ncount_problem_formula, 
pp.ncount_problem_other as ncount_problem_other, FROM ( ( SELECT PP.user_id as user_id, PP.problem_nid AS problem_nid, PP.n_attempts as n_attempts, PP.date as time, (Case when CP_CA.data_itype == "multiplechoiceresponse" then 1 else 0 end) as ncount_problem_multiplechoice, # Choice (Case when CP_CA.data_itype == "choiceresponse" then 1 else 0 end) as ncount_problem_choice, # Choice (Case when CP_CA.data_itype == "numericalresponse" then 1 else 0 end) as ncount_problem_numerical, #input (Case when CP_CA.data_itype == "optionresponse" then 1 else 0 end) as ncount_problem_option, # Choice (Case when CP_CA.data_itype == "customresponse" then 1 else 0 end) as ncount_problem_custom, # Custom (Case when CP_CA.data_itype == "stringresponse" then 1 else 0 end) as ncount_problem_string, # Input (Case when CP_CA.data_itype == "mixed" then 1 else 0 end) as ncount_problem_mixed, # Mixed (Case when CP_CA.data_itype == "forumula" then 1 else 0 end) as ncount_problem_formula, # Input (Case when CP_CA.data_itype != "multiplechoiceresponse" and CP_CA.data_itype != "choiceresponse" and CP_CA.data_itype != "numericalresponse" and CP_CA.data_itype != "optionresponse" and CP_CA.data_itype != "customresponse" and CP_CA.data_itype != "stringresponse" and CP_CA.data_itype != "mixed" and CP_CA.data_itype != "forumula" then 1 else 0 end) as ncount_problem_other, # Input #MAX(n_attempts) AS n_attempts, #MAX(date) AS time, FROM [{dataset}.person_problem] PP LEFT JOIN ( SELECT CP.problem_nid as problem_nid, INTEGER(CP.problem_id) as problem_id, CA.data.itype as data_itype, FROM [{dataset}.course_problem] CP LEFT JOIN [{dataset}.course_axis] CA ON CP.problem_id == CA.url_name ) as CP_CA ON PP.problem_nid == CP_CA.problem_nid GROUP BY time, user_id, problem_nid, n_attempts, ncount_problem_multiplechoice, ncount_problem_choice, ncount_problem_choice, ncount_problem_numerical, ncount_problem_option, ncount_problem_custom, ncount_problem_string, ncount_problem_mixed, ncount_problem_formula, ncount_problem_other ) #FROM [{dataset}.person_item] PI #JOIN [{dataset}.course_item] CI #ON PI.item_nid = CI.item_nid #GROUP BY user_id, #problem_nid #ORDER BY #user_id, #problem_nid ) AS pp LEFT JOIN ( SELECT username, user_id FROM [{dataset}.person_course] ) AS pc ON pc.user_id = pp.user_id WHERE time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") ) """ PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_PROBLEM if problemTableExists else PCDAY_SQL_ADD PCDAY_SQL_END = """ ) WHERE time > TIMESTAMP("{last_date}") GROUP BY course_id, username, date ORDER BY date """ PCDAY_SQL_NEW = PCDAY_SQL_BASE_SELECT + PCDAY_SQL_VIDEO_SELECT + PCDAY_SQL_FORUM_SELECT + PCDAY_SQL_PROBLEM_SELECT + PCDAY_SQL_MID + PCDAY_SQL_ADD + PCDAY_SQL_END PCDAY_SQL = PCDAY_SQL_NEW.format( dataset=dataset, course_id="{course_id}", DATASETS="{DATASETS}", last_date="{last_date}", min_date_start="{min_date_start}", max_date_end="{max_date_end}") table = 'person_course_day' def gdf(row): return datetime.datetime.strptime(row['date'], '%Y-%m-%d') print "=== Processing person_course_day for %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() # Major person_course_day schema revision 19-Jan-2016 adds new fields; if table exists, ensure it # has new schema, else force recompute. 
try: tinfo = bqutil.get_bq_table_info(dataset, table) except Exception as err: tinfo = None if tinfo: fields = tinfo['schema']['fields'] field_names = [x['name'] for x in fields] if not 'nvideos_viewed' in field_names: cdt = tinfo['creationTime'] print " --> person_course_day created %s; missing nvideos_viewed field in schema; forcing recompute - this may take a long time!" % cdt sys.stdout.flush() force_recompute = True process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=gdf, newer_than=datetime.datetime( 2016, 1, 19, 22, 30 ), skip_last_day=skip_last_day) print "Done with person_course_day for %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
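# ----------------------------------------------------------------------------
# Illustrative sketch: process_course assembles the person_course_day query
# from a base SELECT plus per-source fragments, choosing the "..._EXISTS" or
# "..._DNE" variant depending on whether the optional video, forum, and problem
# tables were found.  The helper below restates that assembly pattern; the
# function name and argument shape are invented for illustration.
def sketch_assemble_sql(base_sql, optional_fragments):
    """optional_fragments: iterable of (table_exists, sql_if_exists, sql_if_missing)."""
    parts = [base_sql]
    for exists, sql_if_exists, sql_if_missing in optional_fragments:
        parts.append(sql_if_exists if exists else sql_if_missing)
    return "".join(parts)
# ----------------------------------------------------------------------------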
def obsolete_process_course(course_id, force_recompute=False, check_dates=True): ''' make person_course_day tables for specified course_id. This version produces one table for each day. It is inefficient when there are many days with very small daily tracking log tables. ''' PCDAY_SQL = """ select username, "{course_id}" as course_id, sum(bevent) as nevents, sum(bprogress) as nprogcheck, sum(bshow_answer) as nshow_answer, sum(bvideo) as nvideo, sum(bproblem_check) as nproblem_check, sum(bforum) as nforum, sum(bshow_transcript) as ntranscript, sum(bseq_goto) as nseq_goto, sum(bseek_video) as nseek_video, sum(bpause_video) as npause_video, MAX(time) as last_event, AVG( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as avg_dt, STDDEV( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as sdv_dt, MAX( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as max_dt, COUNT( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as n_dt, SUM( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as sum_dt from (SELECT username, case when event_type = "play_video" then 1 else 0 end as bvideo, case when event_type = "problem_check" then 1 else 0 end as bproblem_check, case when username != "" then 1 else 0 end as bevent, case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum, case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress, case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer, case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript, case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto, case when event_type = 'seek_video' then 1 else 0 end as bseek_video, case when event_type = 'pause_video' then 1 else 0 end as bpause_video, # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll, # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll time, lag(time, 1) over (partition by username order by time) last_time FROM [{dataset}.{table_id}] WHERE NOT event_type contains "/xblock/" AND username != "" ) group by course_id, username order by sdv_dt desc """ course_dir = course_id.replace('/','__') dataset = bqutil.course_id2dataset(course_id) log_dataset = bqutil.course_id2dataset(course_id, dtype="logs") pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday") print "Processing course %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() log_tables = bqutil.get_tables(log_dataset) try: bqutil.create_dataset_if_nonexistent(pcd_dataset) except Exception as err: print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)) pcday_tables_info = bqutil.get_tables(pcd_dataset) pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])] print "pcday_tables = ", pcday_tables log_table_list = log_tables['tables'] log_table_list.sort() for table in log_table_list: tr = table['tableReference'] table_id = tr['tableId'] if not table_id.startswith('tracklog'): continue date = table_id[9:] table_out = 'pcday_%s' % date if (table_out in pcday_tables) and not 
force_recompute: skip = True if check_dates: table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out) log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id) if log_table_date > table_out_date: skip = False print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out, table_out_date, log_table_date) if skip: print "%s...already done, skipping" % table_out sys.stdout.flush() continue if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0: print "...zero size table %s, skipping" % table_id sys.stdout.flush() continue print ("Creating %s " % table_out), the_sql = PCDAY_SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id) sys.stdout.flush() bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False) print "Done with course %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
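# ----------------------------------------------------------------------------
# Illustrative sketch: the obsolete per-day path above maps each daily tracking
# log table ("tracklog_YYYYMMDD") to an output table name ("pcday_YYYYMMDD") by
# slicing off the "tracklog_" prefix (table_id[9:]).  A plain restatement:
def sketch_pcday_table_name(tracklog_table_id):
    """Return the pcday output table name for a tracklog table id, else None."""
    if not tracklog_table_id.startswith('tracklog'):
        return None
    date = tracklog_table_id[9:]        # e.g. "20140101" from "tracklog_20140101"
    return 'pcday_%s' % date
# ----------------------------------------------------------------------------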
def CreateForumEvents(course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None): ''' Create forum events table, based on tracking logs. Extracts all forum-related events, including forum post reads, into the date-time ordered table. Repeated calls to this procedure will append new events to the table. If no new events are found, the existing table is left unchanged. ''' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) logs = bqutil.course_id2dataset(course_id, dtype='logs') table = TABLE_FORUM_EVENTS # event_type for forums may be like: # /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa # /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline # /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed # /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply # /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply # /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote # /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote # /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update # /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin # /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete # /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline # /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members # /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership # \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"} # # module_id will be like: # "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a" the_sql = """ SELECT time, username, '{course_id}' as course_id, (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote" # does this happen? 
when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum" when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum" when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)') when event_type = "edx.forum.thread.created" then "created_thread" when event_type = "edx.forum.response.created" then "created_response" when event_type = "edx.forum.comment.created" then "created_comment" when event_type = "edx.forum.searched" then "searched" else event_type end) as forum_action, (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)') # For old-school courses with transparent course ids else (case when module_id is null # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)') # read else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)') # upvote, pinned, upvoted, unvoted, deleted, followed else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end) # comment end) end) end) as thread_id, REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject, REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id, event_struct.query as search_query, # unavailable before June 1, 2015 event_struct.GET as event_GET, # unavailable before June 1, 2015 FROM {DATASETS} WHERE (REGEXP_MATCH(event_type ,r'^edx\.forum\..*') or event_type contains "/discussion/forum" or event_type contains "/discussion/threads" or event_type contains "/discussion/comments" or event_type contains "list-forum-" or event_type contains "list_forum_" or event_type contains "add-forum-" or event_type contains "add_forum_" or event_type contains "remove-forum-" or event_type contains "remove_forum_" or event_type contains "update_forum_" ) AND username is not null AND event is not null and time > TIMESTAMP("{last_date}") {hash_limit} order by time """ try: tinfo = bqutil.get_bq_table_info(dataset, table) assert tinfo is not None, "[make_forum_analysis] Creating %s.%s table for %s" % ( dataset, table, course_id) print "[make_forum_analysis] Appending latest data to %s.%s table for %s" % ( dataset, table, course_id) 
sys.stdout.flush() except (AssertionError, Exception) as err: print str(err) sys.stdout.flush() print " --> Missing %s.%s? Attempting to create..." % (dataset, table) sys.stdout.flush() pass print "=== Processing Forum Events for %s (start %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() def gdf(row): return datetime.datetime.utcfromtimestamp(float(row['time'])) process_tracking_logs.run_query_on_tracking_logs( the_sql, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, get_date_function=gdf, has_hash_limit=True, end_date=end_date, skip_last_day=skip_last_day) print "Done with Forum Events for %s (end %s)" % (course_id, datetime.datetime.now()) print "=" * 77 sys.stdout.flush()
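# ----------------------------------------------------------------------------
# Illustrative sketch: a plain-Python restatement of part of the forum_action
# CASE expression in CreateForumEvents, for a few representative event_type
# patterns.  The regexes mirror the SQL REGEXP_MATCH calls and keep the same
# relative precedence (thread actions before comment actions before reads);
# this is for clarity only and is not used by the query.
import re

_SKETCH_FORUM_ACTION_PATTERNS = [
    (r'/courses/.*/discussion/threads/[^/]+/reply', 'reply'),
    (r'/courses/.*/discussion/threads/[^/]+/upvote', 'upvote'),
    (r'/courses/.*/discussion/comments/[^/]+/reply', 'comment_reply'),
    (r'/courses/.*/discussion/forum/[^/]+/threads/[^/]+', 'read'),
    (r'/courses/.*/discussion/forum/[^/]+/inline', 'read_inline'),
]

def sketch_forum_action(event_type):
    """Return the forum_action label for event_type, or event_type unchanged."""
    for pattern, action in _SKETCH_FORUM_ACTION_PATTERNS:
        if re.search(pattern, event_type):
            return action
    return event_type
# ----------------------------------------------------------------------------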
def __init__( self, course_id_set, basedir='', datedir='', output_project_id=None, nskip=0, output_dataset_id=None, output_bucket=None, use_dataset_latest=False, only_step=None, end_date=None, ): ''' Extract Research Datasets, based on defined list of tables ''' if only_step and ',' in only_step: only_step = only_step.split(',') self.only_step = only_step self.end_date = end_date if not course_id_set: print "ERROR! Must specify list of course_id's for report. Aborting." return org = course_id_set[0].split('/', 1)[0] # extract org from first course_id self.org = org self.output_project_id = output_project_id crname = ('course_report_%s' % org) if use_dataset_latest: crname = 'course_report_latest' self.dataset = output_dataset_id or crname self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket) self.course_id_set = course_id_set course_id = course_id_set #course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set] #course_datasets_dict = { x:bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set} course_dataset = bqutil.course_id2dataset( course_id, use_dataset_latest=use_dataset_latest) self.rdp_matrix = collections.OrderedDict() #for course_id in course_datasets_dict.keys(): print "[researchData] Processing data for course %s" % (course_id) sys.stdout.flush() for rdp in RESEARCH_DATA_PRODUCTS.keys(): try: table = bqutil.get_bq_table_info(course_dataset, rdp) #table = bqutil.get_bq_table_info( course_id, rdp ) if table is not None: #[print "[researchData] %s found for %s dataset" % ( rdp, course_datasets_dict[ course_id ] ) print "[researchData] %s found" % (rdp) sys.stdout.flush() if rdp not in self.rdp_matrix: #self.rdp_matrix[ str(rdp) ] = cd self.rdp_matrix[str(rdp)] = (course_id, course_dataset) #self.rdp_matrix[ str(rdp) ] = ( course_id, course_id ) else: self.rdp_matrix[str(rdp)].append( (course_id, course_dataset)) #self.rdp_matrix[ str(rdp) ].append( (course_id, course_id ) ) except Exception as err: #print str(err) print "[researchData] Err: %s not found for %s dataset" % ( rdp, course_id) # Extract to archival storage for researchDataProduct in self.rdp_matrix: the_dataset = self.rdp_matrix[researchDataProduct][1] course_id = self.rdp_matrix[researchDataProduct][ 0] #the_dataset.replace( '__', '/' ) self.extractResearchData(course_id=course_id, tablename=researchDataProduct, the_dataset=the_dataset, rdp=researchDataProduct, rdp_format='csv', output_bucket=output_bucket, basedir=basedir, datedir=datedir) print "=" * 100 print "Done extracting Research Data tables -> %s" % RESEARCH_DATA_PRODUCTS.keys( ) print "=" * 100 sys.stdout.flush()
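# ----------------------------------------------------------------------------
# Illustrative sketch: the research-data constructor above probes the course
# dataset for each research data product table and records the ones that
# exist before extraction.  A minimal restatement; table_exists(dataset, table)
# is a hypothetical callable standing in for the bqutil table-info probe.
import collections

def sketch_find_available_products(course_id, course_dataset, product_names,
                                   table_exists):
    """Return an OrderedDict mapping product name -> (course_id, dataset)."""
    available = collections.OrderedDict()
    for rdp in product_names:
        if table_exists(course_dataset, rdp):
            available[rdp] = (course_id, course_dataset)
    return available
# ----------------------------------------------------------------------------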
def run_external_script(extcmd, param, ecinfo, course_id): """ Run external script on specified course. extcmd = string specifying external command to run param = command line parameters, including extparam ecinfo = external command info from edx2bigquery_config course_id = course_id to run external command on """ # use default for base set of parameters ed_name = ecinfo.get('default_parameters', 'DEFAULT') settings = ecinfo.get(ed_name, {}) settings.update(ecinfo.get(extcmd)) # print "settings: ", json.dumps(settings, indent=4) print settings['name'] if param.verbose: print settings.get('description', '') cidns = course_id.replace('/', '__') cidns_nodots = course_id.replace('/', '__').replace('.', '_').replace('-', '_') mypath = path(os.path.realpath(__file__)).dirname() edx2bigquery_context = {'lib': mypath / "lib", 'bin': mypath / "bin", } the_template = settings['template'].format(**edx2bigquery_context) fnpre = settings['filename_prefix'] lfn = "%s-%s.log" % (fnpre, cidns) if settings.get('logs_dir'): lfn = path(settings['logs_dir']) / lfn try: ofn = settings['script_fn'].format(filename_prefix=fnpre, cidns=cidns) except Exception as err: print "oops, errr %s" % str(err) print "settings=", json.dumps(settings, indent=4) raise cwd = os.getcwd() the_date = str(datetime.datetime.now()) dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=param.use_dataset_latest) table_prefix = dataset if param.force_recompute: param.force_recompute = 1 else: param.force_recompute = 0 context = {'course_id': course_id, 'script_name': ofn, 'the_date': the_date, 'cidns': cidns, 'cidns_nodots': cidns, 'template_file': the_template, 'log_file': lfn, 'filename_prefix': fnpre, 'filename_prefix_cidns': "%s__%s" % (fnpre, cidns), 'working_dir': cwd, 'table_prefix': table_prefix, 'lib_dir': edx2bigquery_context['lib'], 'bin_dir': edx2bigquery_context['bin'], } context.update(settings) context.update(param.__dict__) rundir = settings['run_dir'].format(**context) runcmd = settings['script_cmd'].format(**context) tem = codecs.open(the_template).read() tem = unicode(tem) try: # script_file = tem.format(**context) script_file = Template(tem).render(**context) except Exception as err: print "Oops, cannot properly format template %s" % the_template print "Error %s" % str(err) print "context: ", json.dumps(context, indent=4) raise ofndir = path(ofn).dirname() if not os.path.exists(ofndir): print "[Warning] Directory %s doesn't exist - creating it" % ofndir os.mkdir(ofndir) fp = codecs.open(ofn, 'w', encoding="utf8") fp.write(script_file) fp.close() print "Generated %s" % ofn # if depends_on is defined, and force_recompute is not true, then skip # run if output already exists and is newer than all depends_on tables. depends_on = settings.get('depends_on') output_table = settings.get('output_table') if depends_on and not type(depends_on)==list: depends_on = [ depends_on ] do_compute = param.force_recompute if (not param.force_recompute) and depends_on and output_table: # does output already exist? 
has_output = False try: tinfo = bqutil.get_bq_table_info(dataset, output_table) if tinfo: has_output = True except: pass if not has_output: print "Output table %s.%s doesn't exist: running" % (dataset, output_table) do_compute = True else: table_date = tinfo['lastModifiedTime'] for deptab in depends_on: try: dtab_date = bqutil.get_bq_table_last_modified_datetime(dataset, deptab) except Exception as err: raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab)) if not dtab_date: raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab)) if table_date and dtab_date > table_date: do_compute = True break if not do_compute: print "Output table %s.%s exists and is newer than %s, skipping" % (dataset, output_table, depends_on) if do_compute: os.chdir(rundir) print "Working directory: %s" % rundir print "Logging to %s" % lfn print "Run command: %s" % runcmd sys.stdout.flush() if not param.skiprun: start = datetime.datetime.now() if param.submit_condor: condor_template_fn = settings.get('condor_job_template', '').format(**edx2bigquery_context) if not condor_template_fn: raise Exception("[run_external] missing condor_job_template specification for %s" % (extcmd)) condor_submit_fn = "CONDOR/{filename_prefix}-{cidns}.submit".format(**context) context.update({ 'MEMORY': 32768, 'arguments': '{script_name}'.format(**context), 'executable': context['script_cmd'], 'input_file': '', 'filename': condor_submit_fn, }) condor_template = Template(open(condor_template_fn).read()).render(**context) dirs = ['CONDOR', 'JOBS'] for dir in dirs: if not os.path.exists(dir): os.mkdir(dir) fp = open(condor_submit_fn, 'w') fp.write(condor_template) fp.close() cmd = "condor_submit %s" % condor_submit_fn print cmd jobid = None for k in os.popen(cmd): m = re.search('submitted to cluster ([0-9]+)', k) if m: jobid = m.group(1) dt = str(datetime.datetime.now()) jobfile = 'condor_jobs.csv' open(jobfile, 'a').write("%s,%s,%s,%s\n" % (course_id, dt, jobid, lfn)) print "[%s] Submitted as condor job %s at %s" % (course_id, jobid, dt) # print "[run_external] submitted %s, job=%s" % (extcmd, jobnum) return else: os.system(runcmd) if settings.get('type')=="stata": # cleanup leftover log file after stata batch run batch_log = ofn.split('.')[0] + ".log" if os.path.exists(batch_log): os.unlink(batch_log) print "Removed old log file %s" % batch_log end = datetime.datetime.now() has_output = False try: tinfo = bqutil.get_bq_table_info(dataset, output_table) if tinfo: has_output = True except: pass success = has_output dt = end-start print "[run_external] DONE WITH %s, success=%s, dt=%s" % (extcmd, success, dt) sys.stdout.flush() if param.parallel and not success: raise Exception("[run_external] External command %s failed on %s" % (extcmd, course_id))
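# ----------------------------------------------------------------------------
# Illustrative sketch: the freshness test used by run_external_script above --
# the external script is re-run when its output table is missing, or when any
# depends_on table was modified more recently than the output.  The helper name
# and argument shape are invented for illustration; as in the real code, a
# missing dependency is treated as an error.
def sketch_needs_recompute(output_exists, output_mtime, dependency_mtimes):
    """Return True when the external script's output should be regenerated."""
    if not output_exists:
        return True
    for dep_mtime in dependency_mtimes:
        if dep_mtime is None:
            raise Exception("[run_external] missing dependent table")
        if output_mtime and dep_mtime > output_mtime:
            return True
    return False
# ----------------------------------------------------------------------------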