import os
import json
import string
import datetime
import traceback

from addmoduleid import add_module_id
from check_schema_tracking_log import check_schema, schema2dict
from load_course_sql import find_course_sql_dir
from path import path
from edx2course_axis import date_parse

import bqutil
import gsutil

sfn = 'schema_forum.json'

mypath = os.path.dirname(os.path.realpath(__file__))
SCHEMA = json.loads(open('%s/schemas/%s' % (mypath, sfn)).read())['forum']
SCHEMA_DICT = schema2dict(SCHEMA)


def do_rephrase(data, do_schema_check=True, linecnt=0):
    # flatten mongo-style {"$oid": ...} wrappers into plain string fields
    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if 'parent_id' in data:
        data['parent_id'] = data['parent_id']['$oid']

    def fix_date(dstr):
        # The source is truncated here; the epoch-milliseconds branch below is
        # a hedged completion based on the surrounding code, not the verified
        # original.
        if dstr:
            try:
                dtime = int(dstr)
                if dtime:
                    return str(datetime.datetime.utcfromtimestamp(dtime / 1000.0))
            except ValueError:
                # not an integer timestamp; fall back to parsing as a date string
                return str(date_parse(dstr))
        return None
    # (the remainder of do_rephrase is truncated in this excerpt)
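# Illustrative (hypothetical) input/output for do_rephrase's "$oid"
# flattening; the record values below are made up for demonstration:
#
#   rec = {'_id': {'$oid': '52e...'}, 'parent_id': {'$oid': '52f...'}}
#   do_rephrase(rec)
#   # rec['mongoid'] == '52e...' ; rec['parent_id'] == '52f...'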
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute,
                     use_dataset_latest, use_latest_sql_dir):
    '''
    Create video stats for "videos viewed" and "videos watched".

    First create a video axis, based on the course axis.  Then use
    tracking logs to count up videos viewed and videos watched.
    '''
    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # get course directory path
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir,
                              use_dataset_latest or use_latest_sql_dir)

    # get schema (SCHEMA_VIDEO_AXIS, TABLE_VIDEO_AXIS, and related names are
    # module-level constants defined elsewhere in this file)
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    the_schema = json.loads(open(SCHEMA_FILE).read())[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % (dataset, TABLE_VIDEO_AXIS)
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        print "%s --> Attempting to process %s table" % (str(err), TABLE_VIDEO_AXIS)
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']		# datetime
    except (AssertionError, Exception) as err:
        pass

    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date)
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        force_recompute = True
        createVideoAxis(course_id=course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest)

        # get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)
    else:
        print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS)

    # lastly, create video stats
    createVideoStats_day(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    # also create person_course_video_watched
    createPersonCourseVideo(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
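# Hypothetical invocation sketch (API_KEY normally comes from
# edx2bigquery_config; the course_id and paths below are illustrative only):
#
#   make_video_stats("MITx/6.002x/2013_Spring", API_KEY, "DATA", "2015-06-01",
#                    force_recompute=False, use_dataset_latest=True,
#                    use_latest_sql_dir=False)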
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):
    # (relies on module-level imports: os, sys, csv, gzip, json,
    # collections.defaultdict, path, schema2dict, find_course_sql_dir, check_schema)
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath

    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['user_info_combo'])

    uic = defaultdict(dict)	# dict with key = user_id, and val = dict to be written out as JSON line

    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key] == 'NULL':
                continue
            if key == 'course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id,
                # e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':', 1)[1].replace('+', '/')
            dest[prefix + key] = src[key]

    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode == 'r' and not os.path.exists(fn):
            newfn = convert_sql(fn)	# try converting from *.sql file, if that exists
            if not newfn:
                return None		# failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)

    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()

    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion
        of tab separated values to comma separated values.
        Returns False otherwise.
        '''
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
        uic[uid]['edxinstructordash_Grade'] = None
        uic[uid]['edxinstructordash_Grade_timestamp'] = None
    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = ['name', 'language', 'location', 'meta', 'courseware',
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education',
                  'goals', 'allow_certificate', 'country', 'city']
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles

    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from enrollment.csv" % nenrollments

    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded, mongodir)

        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = ['name', 'language', 'location', 'meta', 'courseware',
                  'gender', 'mailing_address', 'year_of_birth', 'level_of_education',
                  'goals', 'allow_certificate', 'country', 'city']
        nadd_profiles = 0

        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata, uic[uid], fields, prefix="profile_", skip_empty=True)
                fix_unicode(uic[uid], ['profile_name', 'profile_mailing_address', 'profile_goals',
                                       'profile_location', 'profile_language'])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (nadd_profiles, mongodir)

        # if datedir is specified, then do not add entries from mongodb where
        # the enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (uic[uid].get('y1_anomalous') == 1):
                    # remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata, uic[uid], fields, prefix="enrollment_", skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (nadd_enrollment, mongodir)

        print "  from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (
            nadded - n_removed_after_cutoff, nadded, nadd_profiles, nadd_enrollment, n_removed_after_cutoff, cutoff)
        sys.stdout.flush()

    # see if instructor grade reports are present for this course; if so, merge in that data
    edxinstructordash = cdir.dirname() / 'from_edxinstructordash'
    if edxinstructordash.exists():
        print "--> %s exists, merging in grade data from_edxinstructordash" % edxinstructordash
        sys.stdout.flush()

        grade_report_fn = (edxinstructordash / 'grade_report.csv')
        fp = openfile(grade_report_fn, add_dir=False)
        if fp is None:
            print "--> Skipping grade_report.csv, file does not exist in dir from_edxinstructordash"
        else:
            nadded = 0
            for line in csv.DictReader(fp):
                uid = int(line['Student ID'])
                fields = ['Grade', 'Grade_timestamp']
                # other columns available: ['course_id', 'Student ID', 'Email', 'Username',
                # 'Grade', 'Enrollment Track', 'Verification Status', 'Certificate Eligible',
                # 'Certificate Delivered', 'Certificate Type']
                copy_elements(line, uic[uid], fields, prefix="edxinstructordash_")
                nadded += 1
            fp.close()
            print "  %d grades loaded from %s/grade_report.csv" % (nadded, edxinstructordash)
        sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = ['download_url', 'grade', 'course_id', 'key', 'distinction', 'status',
                      'verify_uuid', 'download_uuid', 'name', 'created_date', 'modified_date',
                      'error_reason', 'mode', ]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid

    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()

    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id' in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")

    # sort by userid
    uidset = uic.keys()
    uidset.sort()

    # write out result, checking schema along the way
    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'), fieldnames=fieldnames)
    ocsv.writeheader()

    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id' not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id', data.get('certificate_course_id', ''))
        if not row_course_id == course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise

    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
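# Illustrative example of the "opaque keys" normalization performed by
# copy_elements above (the course_id value is hypothetical):
#
#   'course-v1:MITx+6.00.2x_3+1T2015'  -->  'MITx/6.00.2x_3/1T2015'
#
# i.e. src[key].split(':', 1)[1].replace('+', '/'), restoring the
# transparent ORG/COURSE/RUN form used throughout these datasets.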
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery

    cid = course_id
    caset = list of course axis data in dict format
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''
    # BigQuery requires data to fit within a schema; let's make sure our lines
    # all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json'
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/', '__')))
    fp = open(cafn, 'w')

    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not isinstance(data, dict):
            try:
                ca['data'] = json.loads(data)	# make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])	# datetime to string
        if ca['due'] is not None:
            ca['due'] = str(ca['due'])		# datetime to string
        if (ca['data'] is None) or (ca['data'] == ''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca) + '\n')
        except Exception as err:
            print "Failed to save! Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
    gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)	# create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "=" * 100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "=" * 100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]	# truncate to stay under the 16384-character description limit
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)
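# Hypothetical call sketch (caset, xbundle, and log_msg come from the
# course-axis extraction step elsewhere in this module; values shown are
# illustrative only):
#
#   do_save('MITx/6.002x/2013_Spring', caset, xbundle, 'DATA/course',
#           log_msg, use_dataset_latest=True)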
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):
    # (relies on module-level imports: os, sys, csv, gzip, json, pandas as pd,
    # collections.defaultdict, collections.OrderedDict, and the constants
    # ROLE_COURSE_ACCESS / ROLE_FORUM_ACCESS defined elsewhere in this file)
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s role data in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_roles.json' % mypath

    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['staff'])

    role_data = defaultdict(dict)	# dict with key = user_id, and val = dict to be written out as JSON line

    def convertLongToWide(longData, unique_id, column_header, values, columns):
        import numpy as np
        print "    Currently %d roles assigned... Dropping duplicates" % len(longData)
        sys.stdout.flush()

        ld2 = longData.drop_duplicates(subset=[unique_id, column_header])  # , keep='last')
        print "    %d unique roles assigned" % len(ld2)
        print "    Processing role data with these known roles: %s" % columns
        sys.stdout.flush()

        wideData = ld2.pivot(index=unique_id, columns=column_header, values=values)
        wideData = pd.DataFrame(wideData, index=wideData.index).reset_index()

        # ensure all specified columns exist
        cols = wideData.columns.tolist()
        for c in columns:
            if c not in cols:
                wideData[c] = pd.Series([np.nan])
        return wideData

    def createUniqueId(longData):
        # (fixed: the original tested pd.notnull(['course_id']), which is
        # always true; the intended test is on the course_id value itself)
        if pd.notnull(longData['user_id']) and pd.notnull(longData['course_id']):
            uid = str(longData['user_id']) + str('__') + str(longData['course_id'])
        else:
            uid = 'NULL'
        return uid

    def splitUniqueId(wideData):
        import numpy as np
        wideData['user_id'] = pd.Series([np.nan])
        wideData['course_id'] = pd.Series([np.nan])
        wideData[['user_id', 'course_id']] = wideData['uid'].apply(lambda x: pd.Series(x.split('__')))
        wideData['course_id'] = wideData['course_id'].apply(forceTransparentCourseId)
        return wideData

    def forceTransparentCourseId(course_id):
        if 'course-v1' in course_id:
            # special handling for mangled "opaque keys" version of course_id,
            # e.g. course-v1:MITx+6.00.2x_3+1T2015
            return course_id.split(':', 1)[1].replace('+', '/')
        else:
            return course_id

    def openfile(fn_in, mode='r', add_dir=True, pandas_df=False):
        if pandas_df:
            return pd.read_csv(gzip.GzipFile(cdir / fn_in), sep=',')
        else:
            if add_dir:
                fn = cdir / fn_in
            else:
                fn = fn_in
            if fn.endswith('.gz'):
                return gzip.GzipFile(fn, mode)
            return open(fn, mode)

    def createRoleVar(wideData, all_roles):
        if wideData[all_roles].notnull().values.any():
            return str("Staff")
        else:
            return str("Student")

    def cleanRoles(longData, unique_id, column_header, values, columns, allfields):
        longData[values] = int(1)
        longData['uid'] = longData.apply(createUniqueId, axis=1)
        wideData = convertLongToWide(longData, unique_id=unique_id, column_header=column_header,
                                     values=values, columns=columns)
        wideData = splitUniqueId(wideData)[allfields]
        return wideData

    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key] == 'NULL':
                continue
            if key == 'course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id,
                # e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':', 1)[1].replace('+', '/')
            # ensure integer for floats, or null
            if pd.isnull(src[key]):
                copyKey = None
            elif type(src[key]) == float:
                copyKey = int(float(src[key]))
            else:
                copyKey = src[key]
            dest[prefix + key] = copyKey

    # read the role files as dataframes (mode is irrelevant on the pandas_df
    # path; the original passed mode='w', which was misleading for a read)
    roledata = openfile(fn_in=ROLE_COURSE_ACCESS, pandas_df=True)
    rolediscdata = openfile(fn_in=ROLE_FORUM_ACCESS, pandas_df=True)

    # process course access roles
    known_roles_course = OrderedDict([('beta_testers', 'roles_isBetaTester'),
                                      ('instructor', 'roles_isInstructor'),
                                      ('staff', 'roles_isStaff'),
                                      ('ccx_coach', 'roles_isCCX'),
                                      ('finance_admin', 'roles_isFinance'),
                                      ('library_user', 'roles_isLibrary'),
                                      ('sales_admin', 'roles_isSales')])
    base_fields = ['user_id', 'course_id']
    fields = base_fields + known_roles_course.keys()
    print "    Cleaning %s" % ROLE_COURSE_ACCESS
    wide_roledata = cleanRoles(roledata, 'uid', 'role', 'value', known_roles_course.keys(), fields)

    # process forum discussion roles
    known_roles_disc = OrderedDict([('Administrator', 'forumRoles_isAdmin'),
                                    ('CommunityTA', 'forumRoles_isCommunityTA'),
                                    ('Moderator', 'forumRoles_isModerator'),
                                    ('Student', 'forumRoles_isStudent')])
    base_fields = ['user_id', 'course_id']
    extra_fields = ['roles']	# used to create custom mapping based on existing course and discussion forum roles
    fields = base_fields + known_roles_disc.keys()
    print "    Cleaning %s" % ROLE_FORUM_ACCESS
    wide_rolediscdata = cleanRoles(rolediscdata, 'uid', 'name', 'value', known_roles_disc.keys(), fields)

    # compile
    fields_all = base_fields + known_roles_course.keys() + known_roles_disc.keys() + extra_fields
    rename_dict = dict(known_roles_course, **known_roles_disc)
    wideData = pd.merge(wide_roledata, wide_rolediscdata, how='outer',
                        on=["user_id", "course_id"], suffixes=['', '_disc'])

    # create roles var: anyone holding any non-Student role is "Staff"
    all_roles = known_roles_disc.keys() + known_roles_course.keys()
    all_roles.remove('Student')
    wideData['roles'] = wideData.apply(createRoleVar, args=[all_roles], axis=1)

    # rename columns
    wideData = wideData[fields_all]
    wideData.rename(columns=rename_dict, inplace=True)
    finalFields = wideData.columns.tolist()

    # write out
    fieldnames = the_dict_schema.keys()
    ofp = openfile('roles.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('roles.csv', 'w'), fieldnames=finalFields)
    ocsv.writeheader()

    wideData_dict = wideData.to_dict(orient='records')
    for line in wideData_dict:
        uid = int(line['user_id'])
        copy_elements(line, role_data[uid], finalFields, skip_empty=False)
        role_data[uid]['user_id'] = uid

        data = role_data[uid]
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise

    print "Done with make_roles for %s" % course_id
    sys.stdout.flush()
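# Illustrative (hypothetical) long -> wide pivot, as performed by
# convertLongToWide on the course-access roles file; rows and values below
# are made up for demonstration:
#
#   long:                              wide:
#     uid        role        value       uid        instructor  staff
#     1__MITx/A  staff       1           1__MITx/A  NaN         1
#     2__MITx/A  instructor  1           2__MITx/A  1           NaN
#
# Each known role becomes a column keyed by the user__course uid; missing
# assignments stay NaN and are later emitted as nulls by copy_elements.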